diff --git a/CMakeLists.txt b/CMakeLists.txt
index c649aafedd..23bbe829ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,6 +179,7 @@ set(EXTERNAL_LIBS
if(WITH_GPU)
include(cuda)
+ include(tensorrt)
endif(WITH_GPU)
if(WITH_AMD_GPU)
diff --git a/Dockerfile b/Dockerfile
index 0f13acabc3..870304a6ac 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# install glide
RUN curl -s -q https://glide.sh/get | sh
+# Install TensorRT
+# The unnecessary files have been removed to keep the library small. It now contains only the include and lib directories.
+RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+ tar -xz -C /usr/local && \
+ cp -rf /usr/local/TensorRT/include /usr && \
+ cp -rf /usr/local/TensorRT/lib /usr
+
# git credential to skip password typing
RUN git config --global credential.helper store
diff --git a/Dockerfile.android b/Dockerfile.android
index cc022d596b..848a7eba6f 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -27,7 +27,7 @@ RUN git config --global credential.helper store
# Fix locales to en_US.UTF-8
RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
pip install -U 'protobuf==3.1.0' && \
pip install -U wheel sphinx && \
pip install pre-commit
diff --git a/paddle/scripts/check_env.sh b/benchmark/paddle/image/check_env.sh
similarity index 100%
rename from paddle/scripts/check_env.sh
rename to benchmark/paddle/image/check_env.sh
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index f726405c47..e490397cc0 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -80,6 +80,16 @@ if(WITH_GPU)
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
+
+ if(TENSORRT_FOUND)
+ if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+ message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+ endif()
+ if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+ message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+ endif()
+ include_directories(${TENSORRT_INCLUDE_DIR})
+ endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index aa24915947..e90948782b 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
- GIT_TAG "v1.11.x"
+ GIT_TAG "v1.10.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
new file mode 100644
index 0000000000..0c07d36bed
--- /dev/null
+++ b/cmake/tensorrt.cmake
@@ -0,0 +1,33 @@
+if(NOT WITH_GPU)
+ return()
+endif()
+
+set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+ PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
+ $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
+ NO_DEFAULT_PATH
+)
+
+find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+ PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
+ $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
+ NO_DEFAULT_PATH
+ DOC "Path to TensorRT library.")
+
+if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+ set(TENSORRT_FOUND ON)
+else()
+ set(TENSORRT_FOUND OFF)
+endif()
+
+if(TENSORRT_FOUND)
+ file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
+ string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
+ "${TENSORRT_VERSION_FILE_CONTENTS}")
+ string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
+ TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+
+ message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+ "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+endif()
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 7066637a7c..0f95216169 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -3,7 +3,9 @@ add_custom_target(paddle_apis ALL
add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn
- paddle_fluid_docs paddle_fluid_docs_cn)
+ paddle_fluid_docs paddle_fluid_docs_cn
+ paddle_mobile_docs paddle_mobile_docs_cn)
add_subdirectory(v2)
add_subdirectory(fluid)
+add_subdirectory(mobile)
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index ee69925fda..f186c9c85a 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,3 +33,45 @@ Xavier
:members:
:noindex:
+MSRA
+------
+
+.. autoclass:: paddle.fluid.initializer.MSRA
+ :members:
+ :noindex:
+
+ConstantInitializer
+-------------------
+
+.. autoclass:: paddle.fluid.initializer.ConstantInitializer
+ :members:
+ :noindex:
+
+UniformInitializer
+------------------
+
+.. autoclass:: paddle.fluid.initializer.UniformInitializer
+ :members:
+ :noindex:
+
+NormalInitializer
+-----------------
+
+.. autoclass:: paddle.fluid.initializer.NormalInitializer
+ :members:
+ :noindex:
+
+XavierInitializer
+-----------------
+
+.. autoclass:: paddle.fluid.initializer.XavierInitializer
+ :members:
+ :noindex:
+
+MSRAInitializer
+---------------
+
+.. autoclass:: paddle.fluid.initializer.MSRAInitializer
+ :members:
+ :noindex:
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 22e6fb13d7..5c02886efd 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -473,6 +473,12 @@ multiplex
.. autofunction:: paddle.fluid.layers.multiplex
:noindex:
+label_smooth
+------------
+
+.. autofunction:: paddle.fluid.layers.label_smooth
+ :noindex:
+
ops
===
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
index 9aed3b059a..4f88e27bed 100644
--- a/doc/fluid/design/concepts/parallel_executor.md
+++ b/doc/fluid/design/concepts/parallel_executor.md
@@ -84,7 +84,7 @@ Running an operator can be asynchronized. There is a thread pool to execute an `
## Synchronize GPU Kernels
-The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm:
+The GPU is a non-blocking device. Different streams need to be synchronized when switching streams. In the current implementation, the synchronization is based on the following algorithm:
1. `OpHandle` will record `DeviceContext` that it is used.
2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.
diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md
new file mode 100644
index 0000000000..2dd652d8bd
--- /dev/null
+++ b/doc/fluid/design/dist_train/README.md
@@ -0,0 +1,57 @@
+## Distributed training overview doc
+
+Currently, Paddle Fluid uses a parameter server architecture to support distributed training.
+
+For synchronous and asynchronous training, the differences are mostly in the logic of the parameter server. Synchronous training is already supported.
+
+### Synchronous training
+
+The synchronous training process is as follows (a minimal sketch of the pserver-side barrier follows the list):
+
+
+
+1. Pserver
+   1. Set `barrier_condition_` to 0 and wait for trainers to send gradients.
+1. Trainer
+   1. The trainer reads a minibatch of data, runs forward-backward with its local parameter copy, and gets the gradients for the parameters.
+   1. The trainer uses the split op to split all the gradients into blocks. The split method is determined at compile time.
+   1. The trainer uses send_op to send all the split gradients to the corresponding parameter servers.
+   1. After the trainer has sent all the gradients, it sends a `BATCH_BARRIER_MESSAGE` to all pservers.
+   1. The trainer calls GetVariable on the pservers and waits for `barrier_condition_` on each pserver to become 1.
+1. Pserver
+   1. The pserver counts the received `BATCH_BARRIER_MESSAGE`s.
+   1. When the count of `BATCH_BARRIER_MESSAGE`s equals the number of trainers, the pserver knows it has received all gradients from all trainers.
+   1. The pserver runs the optimization block to optimize the parameters.
+   1. After optimization, the pserver sets `barrier_condition_` to 1.
+   1. The pserver waits for `FETCH_BARRIER_MESSAGE`.
+1. Trainer
+   1. The trainer uses GetVariable to get all the parameters from the pservers.
+   1. The trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
+1. Pserver
+   1. When the number of `FETCH_BARRIER_MESSAGE`s reaches the number of trainers, the pserver knows all the parameters have been fetched, and it goes back to step 1 to set `barrier_condition_` to 0.
+
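+The following is a minimal, hypothetical Python sketch of the pserver-side barrier bookkeeping described above; the class and method names are illustrative and do not appear in the actual Fluid code.
+
+```python
+import threading
+
+class PServerBarrier(object):
+    """Toy model of the pserver-side barrier described above."""
+
+    def __init__(self, num_trainers):
+        self.num_trainers = num_trainers
+        self.batch_barrier_count = 0
+        self.barrier_condition = 0  # plays the role of barrier_condition_
+        self.cond = threading.Condition()
+
+    def on_batch_barrier_message(self):
+        # Called once per BATCH_BARRIER_MESSAGE received from a trainer.
+        with self.cond:
+            self.batch_barrier_count += 1
+            if self.batch_barrier_count == self.num_trainers:
+                self.run_optimize_block()   # optimize the parameters
+                self.barrier_condition = 1  # unblock waiting GetVariable calls
+                self.batch_barrier_count = 0
+                self.cond.notify_all()
+
+    def wait_until_optimized(self):
+        # A trainer's GetVariable call blocks here until barrier_condition_ is 1.
+        with self.cond:
+            while self.barrier_condition != 1:
+                self.cond.wait()
+
+    def run_optimize_block(self):
+        pass  # placeholder for the real optimization block
+```
+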
+### Asynchronous training
+In the above process, there are two barriers at which all trainers synchronize with each other. In asynchronous training, these two barriers are not needed: a trainer can simply send its gradients to the pservers and then get the parameters back.
+
+The asynchronous training process can be as follows (a sketch of the per-parameter queue on the pserver follows the list):
+
+
+
+1. Pserver:
+   1. Each parameter has a queue that receives its gradients from trainers.
+   1. Each parameter has a thread that reads gradient blocks from the queue and runs the optimize block, using the gradient to optimize the parameter.
+   1. An independent thread handles the RPC call `GetVariable` so that trainers can get parameters back. (Here we may want to use a thread pool to speed up fetching the parameters.)
+
+1. Trainer:
+   1. The trainer reads a batch of data, runs forward and backward with its local parameter copy, and gets the gradients for the parameters.
+   1. The trainer splits all gradients into blocks and sends these gradient blocks to the pservers (each pserver puts them into its queues).
+   1. The trainer gets all parameters back from the pservers.
+
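+As a rough illustration, the per-parameter queue and optimizer thread on the pserver could look like the following Python sketch (all names here are hypothetical):
+
+```python
+import queue
+import threading
+
+def start_param_optimizer(param_name, apply_gradient):
+    """Hypothetical sketch: one gradient queue plus one optimizer thread per
+    parameter.  `apply_gradient` stands in for the real optimize block."""
+    grad_queue = queue.Queue()
+
+    def optimize_loop():
+        while True:
+            grad_block = grad_queue.get()           # gradient block from a trainer
+            apply_gradient(param_name, grad_block)  # run the optimize block
+
+    threading.Thread(target=optimize_loop, daemon=True).start()
+    return grad_queue  # the send handler puts incoming gradient blocks here
+```
+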
+### Note:
+There are also some conditions that need to be considered. For example:
+
+1. Whether a trainer needs to wait for the pserver to apply its gradients before getting the parameters back.
+1. Whether we need a lock between parameter update and parameter fetch.
+1. Whether one parameter must live on one server, or whether it can be split and sent to multiple parameter servers.
+
+The above architecture for asynchronous training can support different modes; we can test these issues in detail in the future.
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
new file mode 100644
index 0000000000..6a0835b761
--- /dev/null
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -0,0 +1,58 @@
+# Design Doc: Asynchronous Update With Distributed Training
+
+## Background
+
+For typical synchronous distributed training, the significant steps are as follows:
+
+1. A Trainer computes the gradients and SENDs them to the Parameter Server (PServer) nodes.
+1. After a PServer node has received the gradients from all the Trainers, it aggregates the
+gradient variables for the same parameter into one gradient variable and then applies the aggregated
+gradient to the respective parameter, finally using an optimization algorithm (SGD, Momentum, ...)
+to update the parameters.
+1. The Trainer waits for the PServers to finish the optimize stage and then GETs the parameters from the PServers,
+so all the Trainers end up with the same parameters.
+
+In synchronous distributed training, there has to be a `Barrier` to synchronize the
+parameters after the optimizing stage. The performance of a distributed training job
+depends on its slowest node; if there were hundreds or thousands of training nodes in a
+job, the performance of synchronous distributed training could be very poor because of
+the slow nodes. So this design doc introduces an approach to implement
+*asynchronous* distributed training in PaddlePaddle Fluid.
+
+## Design
+
+
+
+As in the figure above, we describe a global view of the asynchronous update process and use
+the parameter `w1` as an example to introduce the steps:
+1. Each gradient variable may be distributed across different GPU cards; aggregate
+them once they have all been calculated.
+1. Split the gradient variable into multiple blocks according to the number of PServer
+instances and then send them.
+1. The PServer runs an `Optimize Block` with a specified optimization algorithm to update
+the specified parameter.
+1. The trainer fetches the latest parameter from the PServer before running a forward Op that depends
+on the specified parameter.
+1. Broadcast the received variable to the multiple GPU cards and continue running the next
+mini-batch.
+
+### Trainer
+
+- For multi-device distributed training, we first need to aggregate the gradient
+variables placed on different devices and then schedule a `SendVars` operator to
+send the gradient variables to the multiple PServer instances.
+- Schedule a `FetchVars` operator to fetch the latest parameters from the PServer before running
+the forward ops.
+- There could be a large number of gradient variables to send, so we need to use another
+thread pool (IO Threadpool), whose number of schedulable threads is larger than that of the
+computing thread pool, to avoid competing for thread resources with computation.
+
+### Parameter Server
+
+
+
+- Multiple trainer instances may want to optimize the same parameter at
+the same time; to avoid racing, we need one `BlockingQueue` for each gradient
+variable so that the updates are processed one by one.
+- We need a `Map` structure that maps a gradient variable name to the `OptimizeBlock` which
+can optimize the respective parameter, as in the sketch below.
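+
+A minimal Python sketch of this dispatch structure (the container and function names are hypothetical) could be:
+
+```python
+import queue
+
+# Map a gradient variable name to the callable that runs its OptimizeBlock,
+# plus one blocking queue per gradient variable so updates are applied one by one.
+optimize_blocks = {}
+grad_queues = {}
+
+def register_parameter(grad_name, optimize_block):
+    optimize_blocks[grad_name] = optimize_block
+    grad_queues[grad_name] = queue.Queue()
+
+def enqueue_gradient(grad_name, grad_block):
+    # Called when a trainer sends a gradient block; the work is only queued here.
+    grad_queues[grad_name].put(grad_block)
+
+def optimize_loop(grad_name):
+    # One such loop runs per parameter and serializes its updates.
+    while True:
+        grad_block = grad_queues[grad_name].get()
+        optimize_blocks[grad_name](grad_block)
+```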
diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md
new file mode 100644
index 0000000000..4ad3afc7b7
--- /dev/null
+++ b/doc/fluid/design/dist_train/mpi_enabled_design.md
@@ -0,0 +1,46 @@
+# MPI-enabled PaddlePaddle Design doc
+
+# Background
+When we do distributed multi-GPU training, the communication overhead between servers becomes the major bottleneck, for the following reasons:
+1. Data must be copied at least once from GPU to CPU memory so that it is ready to transfer, and on the pserver side, copying data from CPU to GPU introduces more overhead.
+2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices.
+3. TCP connections cannot make full use of RDMA 100Gb devices.
+
+We will add the OpenMPI API to PaddlePaddle, which can bring two benefits:
+1. Enable RDMA with PaddlePaddle, which brings a high-performance, low-latency network.
+2. Enable GPUDirect with PaddlePaddle, which brings the highest-throughput and lowest-latency GPU reads and writes.
+
+# Change list
+* Compile args: add compile args to enable MPI support.
+* Execute args: add execute args to specify when and how to use MPI operations.
+* New ops: add new ops ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
+* Transpiler optimization: add ```mpi_send_op``` and ```mpi_listenandserve_op``` to the running graph.
+* MPI utils package: an MPI utils package that provides the supporting low-level API.
+
+## Compile args
+Because MPI and CUDA need hardware support, we will add compile args to enable MPI support and control compilation. Add a ```WITH_MPI``` compile arg to control whether MPI is used. If ```WITH_MPI``` is ```ON```, the build system will look for the openMPI installation during configuration. We should prepare the openMPI environment before compiling.
+
+## Execute args
+Launch the script using the ```mpirun``` launcher, for example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, we can number the actors (trainer/pserver/master) from 0 to (n-1). A node's number is the rank of the calling process within its communicator (an integer), and the MPI processes identify each other using this Rank ID. We have to create a mapping between PaddlePaddle's nodes and their Rank IDs so that we can communicate with the correct destinations when using MPI operations.
+
+## New ops
+We won't replace all the gRPC requests with MPI requests: the standard gRPC library is still used for all administrative operations, and the MPI API will be used to transfer tensors or SelectedRows to PServers. Based on this idea, we create two new operators to handle sends and receives, ```mpi_send_op``` and ```mpi_listenandserve_op```. They are similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc); we will also build a new module to package the MPI send and receive process.
+
+### mpi_send_op
+Very similar to ```send_op```: we will replace the gRPC code that used to send gradients with ```mpi_module```, and at the same time wrap it with ```framework::Async```.
+
+### mpi_listenandserve_op
+Very similar to ```listen_and_serv_op```: we will replace the gRPC code that used to receive gradients with ```mpi_module```, and at the same time wrap it with ```framework::Async```.
+
+## Transpiler optimized
+**We can check the environment variables ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to decide whether to use MPI; if we run under openMPI, these variables must exist in the environment.**
+If MPI use is confirmed, we will change ```send_op``` to ```mpi_send_op``` in distribute_transpiler, and likewise change ```listenandserve_op``` to ```mpi_listenandserve_op```, as in the sketch below.
+
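+A hypothetical check in the transpiler could look like the following Python sketch (the helper name is illustrative):
+
+```python
+import os
+
+def use_mpi():
+    # mpirun (openMPI) exports these variables for every launched process, so
+    # their presence tells the transpiler whether to emit mpi_send_op /
+    # mpi_listenandserve_op instead of the gRPC-based ops.
+    return ("OMPI_COMM_WORLD_SIZE" in os.environ and
+            "OMPI_COMM_WORLD_RANK" in os.environ)
+```
+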
+## MPI utils package
+In this package, we wrap the low-level openMPI API needed to use MPI.
+The APIs included in this package are:
+* MPI send and receive module: we will build a new module to package the MPI send and receive process. MPI send and receive differ from gRPC in that the MPI [receive](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know the receive buffer size and element type. For this reason, we have to communicate twice: the first communication sends metadata about the gradient through gRPC, and the second is the real communication through MPI, which sends the gradient data to mpi_listenandserve_op.
+The detailed flow is below:
+
+* MPI global configuration, which stores the Rank ID and the mapping in global variables, for example:
+gRPC client : MPI nodes :``` 127.0.0.1:32004 : 3 ```
diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png
new file mode 100644
index 0000000000..3b53ab59c0
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_distributed_training.png differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle
new file mode 100644
index 0000000000..d230161177
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png
new file mode 100644
index 0000000000..7d900b0c0e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.png differ
diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle
new file mode 100644
index 0000000000..3a63188868
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png
new file mode 100644
index 0000000000..3e8db973f4
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle
new file mode 100644
index 0000000000..1168801bc1
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_training.graffle differ
diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png
new file mode 100644
index 0000000000..e6b6a3e5d6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/mpi_module.png differ
diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png
new file mode 100644
index 0000000000..e4f9a221fe
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sync_distributed_training.png differ
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index b123b756e2..ad798003f5 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,6 +4,7 @@
.. toctree::
:maxdepth: 1
+ api_doc_std_cn.md
new_op_cn.md
new_op_kernel.md
use_eigen_cn.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index 98988fc22d..80c899a82f 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -4,6 +4,7 @@ Development
.. toctree::
:maxdepth: 1
+ api_doc_std_en.md
new_op_en.md
new_op_kernel.md
use_eigen_en.md
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
new file mode 100644
index 0000000000..b104a6318d
--- /dev/null
+++ b/doc/mobile/CMakeLists.txt
@@ -0,0 +1,53 @@
+if(NOT DEFINED SPHINX_THEME)
+ set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+ set(SPHINX_THEME_DIR)
+endif()
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+ "${BINARY_BUILD_DIR_EN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_mobile_docs
+ html
+ ${BINARY_BUILD_DIR_EN}
+ ${SPHINX_CACHE_DIR_EN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+ "${BINARY_BUILD_DIR_CN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_mobile_docs_cn
+ html
+ ${BINARY_BUILD_DIR_CN}
+ ${SPHINX_CACHE_DIR_CN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000..8297316e8f
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+移动端
+=====
+
+.. toctree::
+ :maxdepth: 1
+
+ cross_compiling_for_android_cn.md
+ cross_compiling_for_ios_cn.md
+ cross_compiling_for_raspberry_cn.md
\ No newline at end of file
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000..e0acdff028
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,9 @@
+Mobile
+======
+
+.. toctree::
+ :maxdepth: 1
+
+ cross_compiling_for_android_en.md
+ cross_compiling_for_ios_en.md
+ cross_compiling_for_raspberry_en.md
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index dfc52b012f..bcd61335be 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -77,14 +77,9 @@ struct TestBroadcastOpHandle {
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
-
- vars_.emplace_back(new VarHandle());
- VarHandle* in_var_handle = static_cast(vars_.back().get());
- in_var_handle->place_ = gpu_list_[input_scope_idx];
- in_var_handle->name_ = "input";
- in_var_handle->version_ = 1;
- in_var_handle->scope_idx_ = input_scope_idx;
- in_var_handle->generated_op_ = nullptr;
+ auto* in_var_handle =
+ new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+ vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add dummy var
@@ -96,12 +91,8 @@ struct TestBroadcastOpHandle {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
- vars_.emplace_back(new VarHandle());
- VarHandle* out_var_handle = static_cast(vars_.back().get());
- out_var_handle->place_ = gpu_list_[j];
- out_var_handle->name_ = "out";
- out_var_handle->version_ = 2;
- out_var_handle->scope_idx_ = j;
+ VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+ vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index 10839f239d..2da8c89d2d 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -79,13 +79,8 @@ struct TestGatherOpHandle {
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
- vars_.emplace_back(new VarHandle());
- VarHandle* in_var_handle = static_cast(vars_.back().get());
- in_var_handle->place_ = gpu_list_[j];
- in_var_handle->name_ = "input";
- in_var_handle->version_ = 1;
- in_var_handle->scope_idx_ = j;
- in_var_handle->generated_op_ = nullptr;
+ auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+ vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
}
@@ -97,12 +92,9 @@ struct TestGatherOpHandle {
op_handle_->AddInput(in_dummy_var_handle);
// add output
- vars_.emplace_back(new VarHandle());
- VarHandle* out_var_handle = static_cast(vars_.back().get());
- out_var_handle->place_ = gpu_list_[input_scope_idx];
- out_var_handle->name_ = "out";
- out_var_handle->version_ = 2;
- out_var_handle->scope_idx_ = input_scope_idx;
+ auto* out_var_handle =
+ new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+ vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
// add dummy var
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index e0dd9e6068..4d76dbf7f6 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -55,21 +55,21 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
}
}
-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
+ const OpDesc &op,
const platform::Place &p,
const size_t &i) const {
auto *op_handle = result->ops_.back().get();
- op_handle->dev_ctxes_[p] = const_cast(
- platform::DeviceContextPool::Instance().Get(p));
+ op_handle->dev_ctxes_[p] = platform::DeviceContextPool::Instance().Get(p);
- auto var_names = op->InputArgumentNames();
+ auto var_names = op.InputArgumentNames();
for (auto &each_var_name : var_names) {
VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
op_handle->AddInput(var);
}
- var_names = op->OutputArgumentNames();
+ var_names = op.OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(result, op_handle, each_var_name, p, i);
@@ -107,7 +107,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
- CreateOpHandleIOs(&result, op, p, 0);
+ CreateOpHandleIOs(&result, *op, p, 0);
continue;
}
@@ -117,7 +117,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get();
- CreateOpHandleIOs(&result, op, p, i);
+ CreateOpHandleIOs(&result, *op, p, i);
auto var_names = op->OutputArgumentNames();
@@ -177,13 +177,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
auto &prev_grad = vars[vars.size() - 1];
op_handle->AddInput(prev_grad.get());
- vars.emplace_back(new VarHandle);
- auto &var = vars.back();
- var->place_ = p;
- var->name_ = og;
- var->version_ = vars.size() - 1;
-
- op_handle->AddOutput(var.get());
+ auto var = new VarHandle(vars.size() - 1, i, og, p);
+ vars.emplace_back(var);
+ op_handle->AddOutput(var);
}
#else
PADDLE_ENFORCE("Not implemented");
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index de34caab1b..f1518d75b4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -45,8 +45,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
std::unique_ptr Build(const ProgramDesc &program) const override;
private:
- void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
- const size_t &i) const;
+ void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
+ const platform::Place &p, const size_t &i) const;
private:
std::string loss_var_name_;
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index be5fb75775..25e8c77bb4 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -54,13 +54,8 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
auto &var_holder = var_holders[each_var_name];
VarHandle *var = nullptr;
if (var_holder.empty()) {
- var_holder.emplace_back(new VarHandle);
- auto &init_var = var_holder[0];
- init_var->place_ = place;
- init_var->name_ = each_var_name;
- init_var->generated_op_ = nullptr;
- init_var->version_ = 0;
- var = init_var.get();
+ var = new VarHandle(0, place_offset, each_var_name, place);
+ var_holder.emplace_back(var);
} else {
var = var_holder.rbegin()->get();
}
@@ -73,12 +68,9 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
size_t place_offset) {
auto &vars = graph->vars_[place_offset][each_var_name];
size_t version = vars.size();
- vars.emplace_back(new VarHandle());
- auto &var = vars.back();
- var->version_ = version;
- var->name_ = each_var_name;
- var->place_ = place;
- op_handle->AddOutput(var.get());
+ auto var = new VarHandle(version, place_offset, each_var_name, place);
+ vars.emplace_back(var);
+ op_handle->AddOutput(var);
}
template
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index a371ee10fe..3d2bd633af 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -33,13 +33,6 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
running_ops_(0),
allow_op_delay_(allow_op_delay) {}
-void ThreadedSSAGraphExecutor::RunDelayedOps(
- const std::unordered_set &delayed_ops) {
- for (auto op : delayed_ops) {
- op->Run(use_event_);
- }
-}
-
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector &fetch_tensors) {
std::unordered_map pending_ops;
@@ -51,8 +44,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// together since we currently cannot overlap computation and memcpy streams.
// Should revisit it if overlapping is available.
std::unordered_set delayed_ops;
- std::unordered_set blocked_by_delayed_ops;
- std::unordered_set delayed_vars;
auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
pending_vars.insert(&var);
@@ -122,24 +113,26 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
InsertPendingOp(*op);
}
- auto run_all_ready_ops = [&] {
- for (auto *op : ready_ops) {
- if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
- delayed_ops.insert(op);
- delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
- ready_vars.Extend(op->outputs_);
- continue;
- }
+ auto run_all_ops = [&](std::unordered_set &set) {
+ for (auto *op : set) {
running_ops_++;
RunOp(&ready_vars, op);
}
- ready_ops.clear();
+ set.clear();
};
// Step 3. Execution
- while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
+ while (!pending_vars.empty()) {
// 1. Run All Ready ops
- run_all_ready_ops();
+ // Keep looping until all vars are ready.
+ //
+ // NOTE: Delayed ops have a lower priority; they will be scheduled after all
+ // ready_ops have been performed.
+ if (ready_ops.empty() && allow_op_delay_) {
+ run_all_ops(delayed_ops);
+ } else {
+ run_all_ops(ready_ops);
+ }
// 2. Find ready variable
bool timeout;
@@ -160,29 +153,16 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto &deps = pending_ops[op];
--deps;
if (deps == 0) {
- if (delayed_vars.find(ready_var) != delayed_vars.end()) {
- blocked_by_delayed_ops.insert(op);
+ if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
+ delayed_ops.insert(op);
} else {
ready_ops.insert(op);
}
}
}
}
- // When there are no other ops to schedule, schedule buffered delayed
- // ops and unblock other ops.
- if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
- RunDelayedOps(delayed_ops);
- delayed_ops.clear();
- for (auto *op : blocked_by_delayed_ops) {
- ready_ops.insert(op);
- }
- blocked_by_delayed_ops.clear();
- }
- // Keep loop until all vars are ready.
}
PADDLE_ENFORCE(ready_ops.empty());
- PADDLE_ENFORCE(delayed_ops.empty());
- PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
// Wait FetchOps.
if (!fetch_ops.empty()) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index bb5e837b13..d70bbd4ef0 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -88,8 +88,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
void RunOp(BlockingQueue *ready_var_q,
details::OpHandleBase *op);
- void RunDelayedOps(const std::unordered_set &delayed_ops);
-
private:
std::unique_ptr<::ThreadPool> pool_;
std::vector local_scopes_;
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 871e41343f..2b887c67e6 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -16,6 +16,7 @@
#include
#include
#include
+#include
#include "paddle/fluid/platform/place.h"
@@ -33,10 +34,10 @@ struct VarHandleBase {
// The operator who generate this variable. nullptr if the variable
// is a root node.
- OpHandleBase *generated_op_;
+ OpHandleBase* generated_op_{nullptr};
// Operators which depend on this variable ready.
- std::unordered_set pending_ops_;
+ std::unordered_set pending_ops_;
};
// VarHandle is actually a single version of Runtime Variable.
@@ -47,6 +48,13 @@ struct VarHandleBase {
struct VarHandle : public VarHandleBase {
std::string DebugString() const override;
+ VarHandle(size_t version, size_t scope_index, std::string name,
+ platform::Place place)
+ : version_(version),
+ scope_idx_(scope_index),
+ name_(std::move(name)),
+ place_(std::move(place)) {}
+
// version field currently is not used, however, just store the version to
// debug easily.
size_t version_;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index c1486b527d..0962f40c4a 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -155,13 +155,9 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
}
-void ParallelExecutor::Run(
- const std::vector &fetch_tensors,
- const std::string &fetched_var_name,
- const std::unordered_map &feed_tensors) {
+void ParallelExecutor::Run(const std::vector &fetch_tensors,
+ const std::string &fetched_var_name) {
platform::RecordBlock b(0);
- SplitTensorToPlaces(feed_tensors);
-
// Create local scopes.
for (auto &scope : member_->local_scopes_) {
Scope &local_scope = scope->NewScope();
@@ -195,14 +191,28 @@ void ParallelExecutor::Run(
auto &local_scope =
*scope->Var(details::kLocalExecScopeName)->GetMutable();
scope->DeleteScope(local_scope);
- local_scope = nullptr;
}
}
-void ParallelExecutor::SplitTensorToPlaces(
- const std::unordered_map &feed_tensors) {
- for (auto it : feed_tensors) {
- auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
+void ParallelExecutor::FeedTensorsIntoLocalScopes(
+ const std::vector> &tensors) {
+ PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());
+
+ for (size_t i = 0; i < tensors.size(); ++i) {
+ auto &map = tensors[i];
+ auto *scope = member_->local_scopes_[i];
+ for (auto &pair : map) {
+ auto *trg = scope->Var(pair.first)->GetMutable();
+ trg->ShareDataWith(pair.second);
+ trg->set_lod(pair.second.lod());
+ }
+ }
+}
+
+void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
+ const std::unordered_map &tensors) {
+ for (auto pair : tensors) {
+ auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
PADDLE_ENFORCE_EQ(
member_->places_.size(), lod_tensors.size(),
"The number of samples of current batch is less than the count of "
@@ -211,7 +221,7 @@ void ParallelExecutor::SplitTensorToPlaces(
for (size_t j = 0; j < member_->places_.size(); ++j) {
// TODO(panxy0718): Do I need to delete this var?
auto t =
- member_->local_scopes_[j]->Var(it.first)->GetMutable();
+ member_->local_scopes_[j]->Var(pair.first)->GetMutable();
t->ShareDataWith(lod_tensors[j]);
t->set_lod(lod_tensors[j].lod());
}
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index b4f16dba85..303ac3bc55 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -44,16 +44,22 @@ class ParallelExecutor {
std::vector& GetLocalScopes();
+ /**
+ * Feed tensors to local scopes. The size of tensors should be equal to the
+ * size of local scopes.
+ */
+ void FeedTensorsIntoLocalScopes(
+ const std::vector>& tensors);
+
+ void FeedAndSplitTensorIntoLocalScopes(
+ const std::unordered_map& tensors);
+
void Run(const std::vector& fetch_tensors,
- const std::string& fetched_var_name,
- const std::unordered_map& feed_tensors);
+ const std::string& fetched_var_name);
void BCastParamsToGPUs(const std::unordered_set& vars) const;
private:
- void SplitTensorToPlaces(
- const std::unordered_map& feed_tensors);
-
ParallelExecutorPrivate* member_;
};
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 66618a291b..6c46e9aad5 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -66,7 +66,7 @@ TEST(ProgramDesc, copy_ctor) {
for (size_t i = 0; i < global_block->OpSize(); ++i) {
auto op_origin = global_block->Op(i);
- auto op_copy = global_block->Op(i);
+ auto op_copy = global_block_copy->Op(i);
ASSERT_EQ(op_origin->Type(), op_copy->Type());
ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
@@ -131,7 +131,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
for (size_t i = 0; i < global_block->OpSize(); ++i) {
auto op_origin = global_block->Op(i);
- auto op_restored = global_block->Op(i);
+ auto op_restored = global_block_restored->Op(i);
ASSERT_EQ(op_origin->Type(), op_restored->Type());
ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs());
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index e53bcf2384..cc45bfe9b1 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -21,4 +21,7 @@ endif()
if(WITH_TESTING)
add_subdirectory(tests/book)
+ if (TENSORRT_FOUND)
+ add_subdirectory(tensorrt)
+ endif()
endif()
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
new file mode 100644
index 0000000000..e39c0daac7
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -0,0 +1 @@
+nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
new file mode 100644
index 0000000000..a81a708e7a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include
+#include
+#include "NvInfer.h"
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace dy = paddle::platform::dynload;
+
+class Logger : public nvinfer1::ILogger {
+ public:
+ void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+ switch (severity) {
+ case Severity::kINFO:
+ LOG(INFO) << msg;
+ break;
+ case Severity::kWARNING:
+ LOG(WARNING) << msg;
+ break;
+ case Severity::kINTERNAL_ERROR:
+ case Severity::kERROR:
+ LOG(ERROR) << msg;
+ break;
+ default:
+ break;
+ }
+ }
+};
+
+class ScopedWeights {
+ public:
+ ScopedWeights(float value) : value_(value) {
+ w.type = nvinfer1::DataType::kFLOAT;
+ w.values = &value_;
+ w.count = 1;
+ }
+ const nvinfer1::Weights& get() { return w; }
+
+ private:
+ float value_;
+ nvinfer1::Weights w;
+};
+
+// The following two APIs are implemented in TensorRT's header file and cannot
+// be loaded from the dynamic library. So we create our own implementations
+// and directly trigger the methods from the dynamic library.
+nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+ return static_cast(
+ dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+}
+nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+ return static_cast(
+ dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+}
+
+const char* kInputTensor = "input";
+const char* kOutputTensor = "output";
+
+// Creates a network to compute y = 2x + 3
+nvinfer1::IHostMemory* CreateNetwork() {
+ Logger logger;
+ // Create the engine.
+ nvinfer1::IBuilder* builder = createInferBuilder(logger);
+ ScopedWeights weights(2.);
+ ScopedWeights bias(3.);
+
+ nvinfer1::INetworkDefinition* network = builder->createNetwork();
+ // Add the input
+ auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
+ nvinfer1::DimsCHW{1, 1, 1});
+ EXPECT_NE(input, nullptr);
+ // Add the hidden layer.
+ auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
+ EXPECT_NE(layer, nullptr);
+ // Mark the output.
+ auto output = layer->getOutput(0);
+ output->setName(kOutputTensor);
+ network->markOutput(*output);
+ // Build the engine.
+ builder->setMaxBatchSize(1);
+ builder->setMaxWorkspaceSize(1 << 10);
+ auto engine = builder->buildCudaEngine(*network);
+ EXPECT_NE(engine, nullptr);
+ // Serialize the engine to create a model, then close.
+ nvinfer1::IHostMemory* model = engine->serialize();
+ network->destroy();
+ engine->destroy();
+ builder->destroy();
+ return model;
+}
+
+void Execute(nvinfer1::IExecutionContext& context, const float* input,
+ float* output) {
+ const nvinfer1::ICudaEngine& engine = context.getEngine();
+ // Two bindings, input and output
+ ASSERT_EQ(engine.getNbBindings(), 2);
+ const int input_index = engine.getBindingIndex(kInputTensor);
+ const int output_index = engine.getBindingIndex(kOutputTensor);
+ // Create GPU buffers and a stream
+ void* buffers[2];
+ ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
+ ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
+ cudaStream_t stream;
+ ASSERT_EQ(0, cudaStreamCreate(&stream));
+ // Copy the input to the GPU, execute the network, and copy the output back.
+ ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
+ cudaMemcpyHostToDevice, stream));
+ context.enqueue(1, buffers, stream, nullptr);
+ ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
+ cudaMemcpyDeviceToHost, stream));
+ cudaStreamSynchronize(stream);
+
+ // Release the stream and the buffers
+ cudaStreamDestroy(stream);
+ ASSERT_EQ(0, cudaFree(buffers[input_index]));
+ ASSERT_EQ(0, cudaFree(buffers[output_index]));
+}
+
+TEST(TensorrtTest, BasicFunction) {
+ // Create the network serialized model.
+ nvinfer1::IHostMemory* model = CreateNetwork();
+
+ // Use the model to create an engine and an execution context.
+ Logger logger;
+ nvinfer1::IRuntime* runtime = createInferRuntime(logger);
+ nvinfer1::ICudaEngine* engine =
+ runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
+ model->destroy();
+ nvinfer1::IExecutionContext* context = engine->createExecutionContext();
+
+ // Execute the network.
+ float input = 1234;
+ float output;
+ Execute(*context, &input, &output);
+ EXPECT_EQ(output, input * 2 + 3);
+
+ // Destroy the engine.
+ context->destroy();
+ engine->destroy();
+ runtime->destroy();
+}
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 718f469d38..4a8dfd4b54 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op.h"
+#include
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
index 3cc6ed3105..4cb0457d92 100644
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
+#include
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
@@ -87,7 +88,7 @@ struct BeamSearchDecoder {
*/
std::vector> PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
- std::vector>& prefixes_list,
+ std::vector>* prefixes_list,
std::vector>* sentence_vector_list) const;
/**
@@ -140,7 +141,7 @@ Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const {
template
std::vector> BeamSearchDecoder::PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
- std::vector>& prefixes_list,
+ std::vector>* prefixes_list,
std::vector>* sentence_vector_list) const {
std::vector> result;
@@ -153,7 +154,7 @@ std::vector> BeamSearchDecoder::PackTwoSteps(
// if prefixes size is 0, it means this is the first step. In this step,
// all candidate id is the start of candidate sentences.
- if (prefixes_list.empty()) {
+ if (prefixes_list->empty()) {
PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
cur_ids.lod().at(kSentenceLevel).back(),
"in the first step");
@@ -162,7 +163,7 @@ std::vector> BeamSearchDecoder::PackTwoSteps(
cur_ids.data()[id_idx], cur_scores.data()[id_idx])));
}
} else {
- BeamNodeVector& prefixes = prefixes_list[src_idx];
+ BeamNodeVector& prefixes = prefixes_list->at(src_idx);
SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx];
PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
@@ -262,7 +263,7 @@ void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids,
for (size_t step_id = 0; step_id < step_num; ++step_id) {
beamnode_vector_list =
PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
- beamnode_vector_list, &sentence_vector_list);
+ &beamnode_vector_list, &sentence_vector_list);
}
// append last beam_node to result
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc
index c3faf46e09..36f9594969 100644
--- a/paddle/fluid/operators/beam_search_decode_op_test.cc
+++ b/paddle/fluid/operators/beam_search_decode_op_test.cc
@@ -125,7 +125,7 @@ TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
BeamSearchDecoder helper;
beamnode_vector_list = helper.PackTwoSteps(
- ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+ ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(beamnode_vector_list.size(), 2UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
@@ -167,7 +167,7 @@ TEST(BeamSearchDecodeOp, PackTwoSteps) {
BeamSearchDecoder helper1;
beamnode_vector_list = helper1.PackTwoSteps(
- ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+ ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index e848b1f12c..fdab4e92f4 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -14,7 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
+#include
#include