Merge branch 'develop' of github.com:PaddlePaddle/Paddle into dist_recordio

wangkuiyi-patch-1
Yancey1989 7 years ago
commit 8855d4a755

@ -4,6 +4,7 @@
| backyes | Yan-Fei Wang |
| baiyfbupt | Yi-Fan Bai |
| beckett1124 | Bin Qi |
| ChengduoZH | Cheng-Duo Zhao |
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |

@ -29,7 +29,7 @@ RUN apt-get update && \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format swig doxygen cmake \
automake locales clang-format swig cmake \
liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool ccache && \

@ -0,0 +1,22 @@
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
RUN pip install -U pip
RUN pip install -U kubernetes opencv-python paddlepaddle
# IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py dataset.py models/ /workspace/

@ -44,11 +44,25 @@ Currently supported `--model` arguments include:
## Run Distributed Benchmark on Kubernetes Cluster
You may need to build a Docker image before submitting a cluster job to Kubernetes; otherwise you would
have to start all of those processes manually on each node, which is not recommended.
To build the Docker image, you need to choose a PaddlePaddle "whl" package to run with. You can either
download it from
http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
build it yourself. Once you have the "whl" package, put it in the current directory and run:
```bash
docker build -t [your docker image name]:[your docker image tag] .
```
Then push the image to a Docker registry that your Kubernetes cluster can reach.
We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
```
The yaml files are then generated under the `myjob` directory; you can run:

@ -49,7 +49,7 @@ def parse_args():
parser.add_argument(
'--fluid', default=1, type=int, help='whether is fluid job')
parser.add_argument(
'--rdma', action='store_ture', help='whether mount rdma libs')
'--rdma', action='store_true', help='whether mount rdma libs')
parser.add_argument(
'--disttype',
default="pserver",

@ -37,7 +37,8 @@ nohup stdbuf -oL nvidia-smi \
-l 1 &
# mnist
# mnist gpu mnist 128
FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=mnist \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
@ -46,7 +47,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
# vgg16
# gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=vgg16 \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
@ -54,7 +56,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
2>&1 | tee -a vgg16_gpu_128.log
# flowers gpu 128
FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=vgg16 \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
@ -64,40 +67,39 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
# resnet50
# resnet50 gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet50 \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
--model=resnet_cifar10 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_128.log
# resnet50 gpu flowers 64
FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet50 \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
--model=resnet_imagenet \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only supports batch=32
FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=stacked_dynamic_lstm \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
--hidden_dim=512 \
--emb_dim=512 \
--crop_size=1500 \
2>&1 | tee -a lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=machine_translation \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \

@ -1009,3 +1009,9 @@ ____
.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex:
gather
____
.. autofunction:: paddle.fluid.layers.gather
:noindex:

@ -86,7 +86,7 @@
<br>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_compiler.png" width=100%>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid-compiler.png" width=100%>
</p>
---

@ -17,3 +17,4 @@
:maxdepth: 1
concepts/use_concepts_cn.rst
developer's_guide_to_paddle_fluid.md

@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle,
:maxdepth: 1
concepts/index_en.rst
developer's_guide_to_paddle_fluid.md

@ -11,7 +11,7 @@ PaddlePaddle supports quick installation via pip. Currently supported: CentOS 6 and above, Ubuntu 14.
pip install paddlepaddle
If you need to install the GPU version (cuda7.5_cudnn5_avx_openblas), run:
If you need to install the GPU version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash

@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas:
pip install paddlepaddle
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash

@ -4,5 +4,5 @@
.. toctree::
:maxdepth: 1
inference/index_cn.rst
optimization/index_cn.rst
inference/inference_support_in_fluid.md

@ -5,4 +5,3 @@ HOW TO
:maxdepth: 1
optimization/index_en.rst
inference/inference_support_in_fluid.md

@ -0,0 +1,96 @@
Install and Compile the C++ Inference Library
=============================================
Direct Download and Installation
--------------------------------
====================== ========================================
Version                C++ inference library
====================== ========================================
cpu_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_
cpu_avx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
====================== ========================================
Build from Source
-----------------
You can also build the C++ inference library from the PaddlePaddle core code by setting the following compile options at build time:
================= =================
Option            Value
================= =================
CMAKE_BUILD_TYPE  Release
FLUID_INSTALL_DIR installation path
WITH_FLUID_ONLY   ON (recommended)
WITH_SWIG_PY      OFF (recommended)
WITH_PYTHON       OFF (recommended)
WITH_GPU          ON/OFF
WITH_MKL          ON/OFF
================= =================
Setting the recommended values above avoids linking unnecessary libraries; the other optional compile options can be set as needed.
The following snippet pulls the latest code from GitHub and configures the compile options (replace PADDLE_ROOT with the installation path of the PaddlePaddle inference library):
.. code-block:: bash
pip install paddlepaddle-gpu
PADDLE_ROOT=/path/of/capi
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir build
cd build
cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_FLUID_ONLY=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_PYTHON=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=OFF \
..
make
make inference_lib_dist
After a successful build, everything needed to use the C++ inference library, namely (1) the compiled PaddlePaddle inference library and headers, (2) third-party libraries and headers, and (3) version and compile-option information,
is placed under the PADDLE_ROOT directory. The directory layout is:
.. code-block:: text
PaddleRoot/
├── CMakeCache.txt
├── paddle
│   └── fluid
│       ├── framework
│       ├── inference
│       ├── memory
│       ├── platform
│       ├── pybind
│       └── string
├── third_party
│   ├── boost
│   │   └── boost
│   ├── eigen3
│   │   ├── Eigen
│   │   └── unsupported
│   └── install
│       ├── gflags
│       ├── glog
│       ├── mklml
│       ├── protobuf
│       ├── snappy
│       ├── snappystream
│       └── zlib
└── version.txt
version.txt records the version information of this inference library, including the Git commit ID, whether the OpenBLAS or MKL math library is used, and the CUDA/cuDNN version numbers, e.g.:
.. code-block:: text
GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
WITH_MKL: ON
WITH_GPU: ON
CUDA version: 8.0
CUDNN version: v5

@ -0,0 +1,8 @@
Inference Library
-----------------
.. toctree::
:maxdepth: 1
build_and_install_lib_cn.rst
inference_support_in_fluid_cn.md

@ -1,9 +1,8 @@
# Fluid Inference User Guide
# User Guide
## Contents:
- Python Inference API
- Building the Fluid Inference library
- Inference C++ API
- Inference examples
- Inference computation optimization
@ -55,62 +54,6 @@
return [program, feed_target_names, fetch_targets]
```
## Building the Fluid Inference Library
- **No extra CMake options are needed**
- 1. Configure CMake (for more configuration options, see [Build PaddlePaddle from source](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html))
```bash
$ git clone https://github.com/PaddlePaddle/Paddle.git
$ cd Paddle
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_PYTHON=ON \
-DWITH_MKL=OFF \
-DWITH_GPU=OFF \
..
```
- 2. Build PaddlePaddle
```bash
$ make
```
- 3. Deploy. Run the following command to deploy the PaddlePaddle Fluid Inference library to the `your/path/to/paddle_inference_lib` directory.
```bash
$ make inference_lib_dist
```
- Directory layout
```bash
$ cd your/path/to/paddle_inference_lib
$ tree
.
|-- paddle
|   `-- fluid
|       |-- framework
|       |-- inference
|       |   |-- io.h
|       |   `-- libpaddle_fluid.so
|       |-- memory
|       |-- platform
|       `-- string
|-- third_party
|   |-- eigen3
|   `-- install
|       |-- gflags
|       |-- glog
|       `-- protobuf
`-- ...
```
Assume `PADDLE_ROOT=your/path/to/paddle_inference_lib`.
## Linking the Fluid Inference Library
- Example project ([link](https://github.com/luotao1/fluid_inference_example.git))
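Although the build walkthrough is removed from this guide, the low-level loading flow it described is the same one `NativePaddlePredictor::Init` performs elsewhere in this change. Below is a rough sketch only, assuming the headers laid out in the tree above; the model directory is hypothetical and the run step is left as a comment.
```c++
#include <iostream>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"

int main() {
  paddle::platform::CPUPlace place;
  paddle::framework::Executor executor(place);
  paddle::framework::Scope scope;

  // Load a model saved with fluid.io.save_inference_model (path is hypothetical).
  auto program = paddle::inference::Load(&executor, &scope,
                                         "./word2vec.inference.model");
  // Prepare the execution context for block 0; it would be passed to the
  // executor's run step together with feed/fetch maps.
  auto ctx = executor.Prepare(*program, 0);

  // Feed/fetch variable names recorded in the program.
  for (const auto& name : program->GetFeedTargetNames()) {
    std::cout << "feed: " << name << "\n";
  }
  for (const auto& name : program->GetFetchTargetNames()) {
    std::cout << "fetch: " << name << "\n";
  }
  // Running the prepared context follows the same steps as
  // NativePaddlePredictor::Run in this change.
  return 0;
}
```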

@ -17,46 +17,33 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE)
function(inference_api_test TARGET_NAME TEST_SRC)
function(inference_api_test TARGET_NAME)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
cc_test(test_paddle_inference_${TARGET_NAME}
SRCS test_paddle_inference_${TARGET_NAME}.cc
DEPS paddle_fluid_api paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
set_tests_properties(test_paddle_inference_${TARGET_NAME}
PROPERTIES DEPENDS "${inference_test_ARGS}")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(${TARGET_NAME}
SRCS ${TEST_SRC}
DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
# TODO(panyx0178): Figure out how to add word2vec and image_classification
# as deps.
# set_tests_properties(${TARGET_NAME}
# PROPERTIES DEPENDS ${DEP_TEST})
endforeach()
endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_impl
SRCS paddle_inference_api_impl.cc
DEPS paddle_inference_api paddle_fluid_api)
if(WITH_TESTING)
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
inference_api_test(test_paddle_inference_api_impl
test_paddle_inference_api_impl.cc)
inference_api_test(api_impl
ARGS test_word2vec test_image_classification)
endif()

@ -40,15 +40,24 @@ struct PaddleBuf {
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
// TODO(Superjomn) support the following engines later.
// kAnakin, // Use Anakin for inference.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
/*
* A simple Inference API for Paddle. Currently this API might just be used by
* non-sequence scenarios.
* TODO(Superjomn) Prepare another API for NLP-related usages.
*/
* A simple Inference API for Paddle. Currently this API can be used by
* non-sequence scenarios.
*/
class PaddlePredictor {
public:
struct Config;
@ -66,34 +75,35 @@ class PaddlePredictor {
// be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
virtual bool InitShared() { return false; }
// Destroy the Predictor.
virtual ~PaddlePredictor() {}
friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const PaddlePredictor::Config& config);
// The common configs for all the predictors.
struct Config {
enum class EngineKind;
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
// third-party engines.
EngineKind engine_kind{Config::EngineKind::kNone};
enum class EngineKind {
kNone = -1, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kTensorRT, // Use TensorRT for inference.
kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
};
};
// A factory to help create difference predictor.
template <typename ConfigT>
struct NativeConfig : public PaddlePredictor::Config {
// GPU related fields.
bool use_gpu{false};
int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
std::string prog_file;
std::string param_file;
};
// A factory to help create different predictors.
//
// FOR EXTENSION DEVELOPER:
// Different predictors are designated by config type and engine kind. Similar
// configs can be merged, but there shouldn't be a huge config containing
// different fields for more than one kind of predictors.
//
// Similarly, each engine kind should map to a unique predictor implementation.
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle
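Taken together, the new `NativeConfig`, `PaddleEngineKind`, and templated factory make end-to-end use of the API look roughly like the sketch below. It is built only from the declarations in this hunk: the include path, model directory, feed name, and buffer layout are assumptions, and error handling is omitted.
```c++
#include <cstdint>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"  // assumed location

int main() {
  // Configure the native Fluid engine (fields from NativeConfig above).
  paddle::NativeConfig config;
  config.model_dir = "./word2vec.inference.model";  // hypothetical path
  config.use_gpu = false;
  // config.fraction_of_gpu_memory = 0.15;  // only meaningful when use_gpu is true

  // The factory is specialized on the config type; the engine kind defaults to kNative.
  auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);

  // Wrap the input blob in a PaddleTensor (name / shape / data / dtype).
  std::vector<int64_t> ids = {1, 2, 3, 4};           // hypothetical input
  paddle::PaddleTensor input;
  input.name = "ids";                                // hypothetical feed name
  input.shape = {4, 1};
  input.data.data = ids.data();                      // assumes PaddleBuf{void* data; size_t length;}
  input.data.length = ids.size() * sizeof(int64_t);
  input.dtype = paddle::PaddleDType::INT64;

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({input}, &outputs);                 // outputs are filled by the predictor
  return 0;
}
```
Because `kNative` is the default template argument, `CreatePaddlePredictor<NativeConfig>(config)` is equivalent to spelling out `PaddleEngineKind::kNative`, which is how the updated test below calls it.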

@ -54,11 +54,10 @@ std::string num2str(T a) {
}
} // namespace
bool PaddlePredictorImpl::Init() {
bool NativePaddlePredictor::Init() {
VLOG(3) << "Predictor::init()";
// TODO(panyx0718): Should CPU vs GPU device be decided by id?
if (config_.device >= 0) {
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
@ -85,19 +84,21 @@ bool PaddlePredictorImpl::Init() {
}
ctx_ = executor_->Prepare(*inference_program_, 0);
// Create variables
// TODO(panyx0718): Why need to test share_variables here?
if (config_.share_variables) {
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
}
// Create temporary variables first, so that the first batch does not need to
// create variables at runtime. This is the logic of the old inference
// API.
// TODO(Superjomn) this should be modified when `Clone` is valid for
// multi-thread application.
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
@ -124,7 +125,7 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
scope_.get(),
&feed_targets,
&fetch_targets,
!config_.share_variables);
false /* don't create variables each time */);
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
return false;
@ -133,59 +134,20 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
return true;
}
std::unique_ptr<PaddlePredictor> PaddlePredictorImpl::Clone() {
std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
VLOG(3) << "Predictor::clone";
std::unique_ptr<PaddlePredictor> cls(new PaddlePredictorImpl(config_));
if (!cls->InitShared()) {
LOG(ERROR) << "fail to call InitShared";
std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init()) {
LOG(ERROR) << "fail to call Init";
return nullptr;
}
// fix manylinux compile error.
return std::move(cls);
}
// TODO(panyx0718): Consider merge with Init()?
bool PaddlePredictorImpl::InitShared() {
VLOG(3) << "Predictor::init_shared";
// 1. Define place, executor, scope
if (this->config_.device >= 0) {
place_ = platform::CUDAPlace();
} else {
place_ = platform::CPUPlace();
}
this->executor_.reset(new framework::Executor(this->place_));
this->scope_.reset(new framework::Scope());
// Initialize the inference program
if (!this->config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
this->inference_program_ = inference::Load(
this->executor_.get(), this->scope_.get(), this->config_.model_dir);
} else if (!this->config_.prog_file.empty() &&
!this->config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
this->inference_program_ = inference::Load(this->executor_.get(),
this->scope_.get(),
this->config_.prog_file,
this->config_.param_file);
}
this->ctx_ = this->executor_->Prepare(*this->inference_program_, 0);
// 3. create variables
// TODO(panyx0718): why test share_variables.
if (config_.share_variables) {
this->executor_->CreateVariables(
*this->inference_program_, this->scope_.get(), 0);
}
// 4. Get the feed_target_names and fetch_target_names
this->feed_target_names_ = this->inference_program_->GetFeedTargetNames();
this->fetch_target_names_ = this->inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::SetFeed(const std::vector<PaddleTensor> &inputs,
std::vector<framework::LoDTensor> *feeds) {
bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
std::vector<framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feed_target_names_.size()) {
LOG(ERROR) << "wrong feed input size.";
@ -213,7 +175,7 @@ bool PaddlePredictorImpl::SetFeed(const std::vector<PaddleTensor> &inputs,
return true;
}
bool PaddlePredictorImpl::GetFetch(
bool NativePaddlePredictor::GetFetch(
const std::vector<framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *outputs) {
VLOG(3) << "Predictor::get_fetch";
@ -280,23 +242,29 @@ bool PaddlePredictorImpl::GetFetch(
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const ConfigImpl &config) {
VLOG(3) << "create PaddlePredictorImpl";
// 1. GPU memory
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
num2str<float>(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
const NativeConfig &config) {
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE(
config.fraction_of_gpu_memory > 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
num2str<float>(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
}
std::unique_ptr<PaddlePredictor> predictor(new PaddlePredictorImpl(config));
if (!dynamic_cast<PaddlePredictorImpl *>(predictor.get())->Init()) {
std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init()) {
return nullptr;
}
return std::move(predictor);

@ -29,17 +29,10 @@
namespace paddle {
struct ConfigImpl : public PaddlePredictor::Config {
int device;
float fraction_of_gpu_memory;
std::string prog_file;
std::string param_file;
bool share_variables;
};
class PaddlePredictorImpl : public PaddlePredictor {
class NativePaddlePredictor : public PaddlePredictor {
public:
explicit PaddlePredictorImpl(const ConfigImpl &config) : config_(config) {}
explicit NativePaddlePredictor(const NativeConfig &config)
: config_(config) {}
bool Init();
@ -48,16 +41,15 @@ class PaddlePredictorImpl : public PaddlePredictor {
std::unique_ptr<PaddlePredictor> Clone() override;
~PaddlePredictorImpl() override{};
~NativePaddlePredictor() override{};
private:
bool InitShared() override;
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *output_data);
ConfigImpl config_;
NativeConfig config_;
platform::Place place_;
std::unique_ptr<framework::Executor> executor_;
std::unique_ptr<framework::Scope> scope_;

@ -40,19 +40,19 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
return pt;
}
ConfigImpl GetConfig() {
ConfigImpl config;
NativeConfig GetConfig() {
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15;
config.use_gpu = true;
config.device = 0;
config.share_variables = true;
return config;
}
TEST(paddle_inference_api_impl, word2vec) {
ConfigImpl config = GetConfig();
std::unique_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config);
NativeConfig config = GetConfig();
auto predictor = CreatePaddlePredictor<NativeConfig>(config);
framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}};
@ -104,7 +104,7 @@ TEST(paddle_inference_api_impl, image_classification) {
int batch_size = 2;
bool use_mkldnn = false;
bool repeat = false;
ConfigImpl config = GetConfig();
NativeConfig config = GetConfig();
config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model";
@ -133,7 +133,7 @@ TEST(paddle_inference_api_impl, image_classification) {
is_combined,
use_mkldnn);
std::unique_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config);
auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input));
@ -144,8 +144,7 @@ TEST(paddle_inference_api_impl, image_classification) {
float* data = static_cast<float*>(outputs[0].data.data);
float* lod_data = output1.data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_LT(lod_data[j] - data[j], 1e-10);
EXPECT_GT(lod_data[j] - data[j], -1e-10);
EXPECT_NEAR(lod_data[j], data[j], 1e-3);
}
free(data);
}

@ -200,7 +200,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
vars_[var_desc.name()].reset(new VarDesc(var_desc));
}
for (const proto::OpDesc &op_desc : desc_->ops()) {
ops_.emplace_back(new OpDesc(op_desc, prog, this));
ops_.emplace_back(new OpDesc(op_desc, this));
}
}
@ -209,7 +209,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
: prog_(prog), desc_(desc) {
need_update_ = true;
for (auto &op : other.ops_) {
ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
ops_.emplace_back(new OpDesc(*op, this));
}
for (auto &it : other.vars_) {
auto *var = new VarDesc(*it.second);

@ -105,7 +105,7 @@ class BlockDesc {
size_t OpSize() const { return ops_.size(); }
OpDesc *Op(int idx) { return ops_.at(idx).get(); }
OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
void Flush();

@ -11,11 +11,15 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include <algorithm>
#include <fstream>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
@ -26,9 +30,6 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
#include <string>
#include <vector>
DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
"the ssa graph path only print with GLOG_v=10,"
"default /tmp/graph.dot");
@ -148,9 +149,9 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
std::unordered_map<std::string, proto::VarType::Type> var_types;
std::unordered_map<std::string, VarDesc *> all_vars;
for (auto *var : program.Block(0).AllVars()) {
var_types[var->Name()] = var->GetType();
all_vars[var->Name()] = var;
}
auto graph = new SSAGraph();
@ -167,12 +168,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
auto send_vars = FindDistTrainSendVars(program);
auto recv_vars = FindDistTrainRecvVars(program);
size_t cur_device_id = 0;
std::vector<std::unordered_set<std::string>> var_name_on_devices;
std::vector<std::unordered_set<std::string>> bcast_var_name_set;
var_name_on_devices.resize(places_.size());
bcast_var_name_set.resize(places_.size());
size_t cur_device_id = 0;
std::vector<int64_t> balance_grads(places_.size(), 0);
auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
auto var_desc = all_vars.at(g_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GE(numel, 0);
auto smallest =
std::min_element(std::begin(balance_grads), std::end(balance_grads));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
balance_grads[dev_id] += numel;
return dev_id;
};
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
if (boost::get<int>(
@ -220,13 +237,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce:
cur_device_id = get_appropriate_dev(g_name);
CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices[cur_device_id].emplace(g_name);
bcast_var_name_set[cur_device_id].emplace(p_name);
cur_device_id = (cur_device_id + 1) % places_.size();
break;
case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(var_types, g_name)) {
if (IsSparseGradient(all_vars, g_name)) {
CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0);
} else {
@ -269,10 +286,10 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
bool MultiDevSSAGraphBuilder::IsSparseGradient(
const std::unordered_map<std::string, proto::VarType::Type> &var_types,
const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const {
PADDLE_ENFORCE(var_types.count(og) != 0);
if (var_types.at(og) == proto::VarType::SELECTED_ROWS) {
PADDLE_ENFORCE(all_vars.count(og) != 0);
if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
return true;
}
return false;
