From 16bc8f2a755438cac5a279f27b082dbaf0e3e3a0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 19 Nov 2018 21:52:59 +0800 Subject: [PATCH 01/36] Add debug info --- .dockerignore | 1 + Dockerfile | 86 +++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/.dockerignore b/.dockerignore index 2b2e74053d..49adfe4f0a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ *.DS_Store build/ +build* *.user .vscode .idea diff --git a/Dockerfile b/Dockerfile index c8b9eed6d6..b36102175c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,46 +71,46 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python - -#For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker - -COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt - -# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] - - -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . \ - make) - -# Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-EXPOSE 22
+# RUN pip3 install -U wheel && \
+    # pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    # easy_install -U pip && \
+    # pip install -U pip setuptools wheel && \
+    # pip install -U docopt PyYAML sphinx==1.5.6 && \
+    # pip install sphinx-rtd-theme==0.1.9 recommonmark
+
+# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    # pip3 install opencv-python && \
+    # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    # pip install opencv-python
+
+# #For docstring checker
+# RUN pip3 install pylint pytest astroid isort
+# RUN pip install pylint pytest astroid isort LinkChecker
+
+# COPY ./python/requirements.txt /root/
+# RUN pip3 install -r /root/requirements.txt
+# RUN pip install -r /root/requirements.txt
+
+# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
+# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
+# RUN apt-get install -y libssl-dev libffi-dev
+# RUN pip3 install certifi urllib3[secure]
+# RUN pip install certifi urllib3[secure]
+
+
+# # Install woboq_codebrowser to /woboq
+# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
+    # (cd /woboq \
+    # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+    # -DCMAKE_BUILD_TYPE=Release . \
+    # make)
+
+# # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+# RUN mkdir /var/run/sshd
+# RUN echo 'root:root' | chpasswd
+# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+# EXPOSE 22

From c19ff1f3d28b38867de8b98d63f19b8c759c4535 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 21 Nov 2018 15:37:36 +0800
Subject: [PATCH 02/36] Add python3.6 and python3.7 support in paddle build scripts

test=develop
---
 paddle/scripts/paddle_build.sh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 32f9bca645..569e56e5a9 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -116,6 +116,18 @@ function cmake_gen() {
 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
+ elif [ "$1" == "cp36-cp36m" ]; then
+ export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
+ export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
+ export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
+ -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
+ -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
+ elif [ "$1" == "cp37-cp37m" ]; then
+ export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
+ export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
+ export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
+
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" fi fi fi @@ -419,7 +431,7 @@ function assert_api_not_changed() { source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec - if [ "$1" == "cp35-cp35m" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec From 255cc1eb6540785c8cb786a6c9f291fa53010ca0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 21 Nov 2018 15:43:17 +0800 Subject: [PATCH 03/36] Add support for Mac build test=develop --- paddle/scripts/paddle_build.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 569e56e5a9..9632eaec00 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -94,6 +94,30 @@ function cmake_gen() { else exit 1 fi + elif [ "$1" == "cp36-cp36m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi + elif [ "$1" == "cp37-cp37m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ "$1" != "" ]; then From 3e3599f3d937e0444606056f3c9f2261b74dfd93 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 11:31:04 +0000 Subject: [PATCH 04/36] Refine split tensorrt plugin --- .../inference/tensorrt/convert/split_op.cc | 3 +- .../tensorrt/plugin/split_op_plugin.cu | 157 ++++++++++++++---- .../tensorrt/plugin/split_op_plugin.h | 9 +- 3 files changed, 134 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 6620c76318..871354267e 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -40,7 +40,7 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - PADDLE_ENFORCE(axis != 0); + 
// PADDLE_ENFORCE(axis != 0); if (axis < 0) { axis += input_dims.nbDims; } else { @@ -48,7 +48,6 @@ class SplitOpConverter : public OpConverter { } PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 4adea2db1e..1ec0753e9f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { @@ -19,6 +21,52 @@ namespace inference { namespace tensorrt { namespace plugin { +// copied from operators::math::SplitFunctor +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + nvinfer1::Dims SplitPlugin::getOutputDimensions( int index, const nvinfer1::Dims* input_dims, int num_inputs) { PADDLE_ENFORCE_EQ(num_inputs, 1); @@ -31,48 +79,95 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions( int SplitPlugin::initialize() { PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); - + // notice input dims is [C, H, W] + nvinfer1::Dims dims = this->getInputDims(0); + outer_rows_ = 1; + inner_cols_ = 1; + for (int i = 0; i < axis_; ++i) { + outer_rows_ *= dims.d[i]; + } + for (int i = axis_ + 1; i < dims.nbDims; ++i) { + inner_cols_ *= dims.d[i]; + } + same_shape_ = true; std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { - segment_offsets.push_back(segment_offsets.back() + output_length_[i]); + if (output_length_[i] != output_length_[0]) { + same_shape_ = false; + } + segment_offsets.push_back(segment_offsets.back() + + output_length_[i] * inner_cols_); } - segment_offsets_ = segment_offsets; - nvinfer1::Dims dims = this->getInputDims(0); - nx_ = 1; - for (int i = dims.nbDims - 1; i > axis_; 
--i) { - nx_ *= dims.d[i]; + inner_cols_ *= dims.d[axis_]; + d_segment_offsets_ = segment_offsets; + segment_offsets_ = std::move(segment_offsets); + d_output_ptrs_.resize(this->getNbOutputs(), nullptr); + return 0; +} + +template +inline void Split(cudaStream_t stream, const bool same_shape, + const int outer_rows, const int inner_cols, + const std::vector& segment_offsets, + const int* d_segment_offsets, const T* input, T** outputs) { + const int kThreadsPerBlock = 1024; + const int kMaxBlocks = 65535; + int block_cols = kThreadsPerBlock; + if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((inner_cols + 31) >> 5) << 5; } - ny_ = dims.d[axis_]; - nz_ = 1; - for (int i = axis_ - 1; i >= 0; --i) { - nz_ *= dims.d[i]; + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int grid_cols = + std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks); + int grid_rows = + std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (same_shape) { + SplitKernel<<>>( + input, outer_rows, inner_cols, segment_offsets[1], outputs); + } else { + SplitKernel<<>>( + input, outer_rows, inner_cols, d_segment_offsets, + static_cast(segment_offsets.size()), outputs); } - return 0; } int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { - auto const& input_dims = this->getInputDims(0); - int input_size = 0; - float const* idata = reinterpret_cast(inputs[0]); - float** odatas = reinterpret_cast(outputs); - - // kernel impl here. - int inputBatchOffset = nx_ * ny_ * nz_; - for (size_t i = 0; i < this->getNbOutputs(); i++) { - for (size_t j = 0; j < batchSize; j++) { - cudaMemcpyAsync( - odatas[i] + - j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * - sizeof(float), - inputs[0] + - (inputBatchOffset * j + segment_offsets_[i] * nx_) * - sizeof(float), - (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), - cudaMemcpyDeviceToDevice, stream); + float const* input_ptr = reinterpret_cast(inputs[0]); + if (axis_ == -1 && this->getNbOutputs() < 10) { + float** output_ptrs = reinterpret_cast(outputs); + int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) + ? 
sizeof(__half) + : sizeof(float); + for (int i = 0; i < this->getNbOutputs(); ++i) { + PADDLE_ENFORCE( + cudaMemcpyAsync( + output_ptrs[i], input_ptr + segment_offsets_[i], + (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size, + cudaMemcpyDeviceToDevice, stream) == cudaSuccess); + } + } else { + outer_rows_ *= batchSize; + const int* d_segment_offsets_ptr = + thrust::raw_pointer_cast(&d_segment_offsets_[0]); + float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); + PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs, + this->getNbOutputs() * sizeof(float*), + cudaMemcpyHostToDevice, + stream) == cudaSuccess); + if (this->getDataType() == nvinfer1::DataType::kFLOAT) { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, input_ptr, output_ptrs); + } else { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT + (__half**)output_ptrs); // NOLINT } } - return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index b5b6e69992..6f028d3d72 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,7 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) {} + : axis_(axis), same_shape_(true), output_length_(output_lengths) {} SplitPlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); @@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT { } int axis_; + int outer_rows_; + int inner_cols_; + bool same_shape_; std::vector output_length_; - int nx_, ny_, nz_; std::vector segment_offsets_; + thrust::device_vector d_segment_offsets_; + thrust::device_vector d_output_ptrs_; }; } // namespace plugin From 6eba5bd276a8d79d5611ec42db0c47273fb4950c Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 15:32:25 +0000 Subject: [PATCH 05/36] Fix direct copy and refine split ut test=develop --- .../tensorrt/convert/test_split_op.cc | 55 ++++++++++++++----- .../tensorrt/plugin/split_op_plugin.cu | 7 ++- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index f81d011552..23909378dd 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -20,30 +20,59 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(split_op, test) { +template +void TensorRTSplitTest(const std::vector &in_shape, + const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2)); - validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2)); - validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2)); + TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); + + auto make_dim = [](const std::vector &shape) { + nvinfer1::DimsCHW dim; + dim.c() = shape[0]; + dim.h() = shape[1]; + dim.w() = shape[2]; + 
return dim; + }; + validator.DeclInputVar("split_input", make_dim(in_shape)); + std::vector output_vars; + for (size_t i = 0; i < sections.size(); ++i) { + auto out_shape = in_shape; + out_shape[Axis - 1] = sections[i]; + std::string output_name = "split_out" + std::to_string(i); + validator.DeclOutputVar(output_name, make_dim(out_shape)); + output_vars.push_back(output_name); + } // Prepare Op description framework::OpDesc desc; desc.SetType("split"); desc.SetInput("X", {"split_input"}); - desc.SetOutput("Out", {"split_out1", "split_out2"}); + desc.SetOutput("Out", output_vars); - int num = 0; - int axis = 1; - std::vector output_lengths = {2, 1}; - desc.SetAttr("axis", axis); - desc.SetAttr("num", num); - desc.SetAttr("sections", output_lengths); + desc.SetAttr("axis", Axis); + desc.SetAttr("num", 0); + desc.SetAttr("sections", sections); validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(BatchSize); +} + +TEST(split_op, test_same_shape_batch1) { + TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch1) { + TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); +} + +TEST(split_op, test_same_shape_batch10) { + TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch10) { + TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 1ec0753e9f..de61ace59e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -138,11 +138,12 @@ inline void Split(cudaStream_t stream, const bool same_shape, int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { float const* input_ptr = reinterpret_cast(inputs[0]); - if (axis_ == -1 && this->getNbOutputs() < 10) { + if (((batchSize == 1 && axis_ == 0) || axis_ == -1) && + this->getNbOutputs() < 10) { float** output_ptrs = reinterpret_cast(outputs); int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) - ? sizeof(__half) - : sizeof(float); + ? 
sizeof(float) + : sizeof(__half); for (int i = 0; i < this->getNbOutputs(); ++i) { PADDLE_ENFORCE( cudaMemcpyAsync( From 3912545ffec3ea5a850420f0a804afadc9f0352a Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 22 Nov 2018 04:30:19 +0000 Subject: [PATCH 06/36] add dlpack support test=develop --- CMakeLists.txt | 1 + cmake/external/dlpack.cmake | 31 +++++ paddle/fluid/framework/CMakeLists.txt | 3 + paddle/fluid/framework/dlpack_tensor.cc | 127 +++++++++++++++++++ paddle/fluid/framework/dlpack_tensor.h | 45 +++++++ paddle/fluid/framework/dlpack_tensor_test.cc | 113 +++++++++++++++++ 6 files changed, 320 insertions(+) create mode 100644 cmake/external/dlpack.cmake create mode 100644 paddle/fluid/framework/dlpack_tensor.cc create mode 100644 paddle/fluid/framework/dlpack_tensor.h create mode 100644 paddle/fluid/framework/dlpack_tensor_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index c62cc9bfd7..b6ae241272 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,7 @@ include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) include(external/xxhash) # download xxhash +include(external/dlpack) if (NOT WIN32) # there is no official support of snappystream, warpctc, nccl, cupti in windows diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake new file mode 100644 index 0000000000..94d8fcc668 --- /dev/null +++ b/cmake/external/dlpack.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack) +set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include) + +include_directories(${DLPACK_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dlpack + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/dmlc/dlpack.git" + GIT_TAG "v0.2" + PREFIX ${DLPACK_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(dlpack STATIC ${dummyfile}) +else() + add_library(dlpack INTERFACE) +endif() + +add_dependencies(dlpack extern_dlpack) + +LIST(APPEND externl_project_dependencies dlpack) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cb9057672c..d7d7834b49 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -205,3 +205,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) + +cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) +cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc new file mode 100644 index 0000000000..04e3f78afe --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" + +namespace paddle { +namespace framework { + +namespace internal { +template +static ::DLDataType GetDLDataTypeCode() { + ::DLDataType dtype; + if (std::is_same::value || + std::is_floating_point::value) { + dtype.code = kDLFloat; + } else if (std::is_unsigned::value) { + dtype.code = kDLUInt; + } else if (std::is_integral::value) { + dtype.code = kDLInt; + } else { + PADDLE_THROW("Unsupported data type %s", typeid(T).name()); + } + dtype.bits = 8 * sizeof(T); + dtype.lanes = 1; + return dtype; +} + +static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { +#define REG_DL_DATA_TYPE(type) \ + { std::type_index(typeid(type)), GetDLDataTypeCode() } + static const std::unordered_map + type_to_dtype_map({ + REG_DL_DATA_TYPE(platform::float16), // NOLINT + REG_DL_DATA_TYPE(float), // NOLINT + REG_DL_DATA_TYPE(double), // NOLINT + REG_DL_DATA_TYPE(int), // NOLINT + REG_DL_DATA_TYPE(int64_t), // NOLINT + REG_DL_DATA_TYPE(bool), // NOLINT + REG_DL_DATA_TYPE(size_t), // NOLINT + REG_DL_DATA_TYPE(int16_t), // NOLINT + REG_DL_DATA_TYPE(uint8_t), // NOLINT + REG_DL_DATA_TYPE(int8_t) // NOLINT + }); + static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); + auto it = type_to_dtype_map.find(type); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", + type.name()); + return it->second; +#undef REG_DL_DATA_TYPE +} + +struct DLContextVisitor : public boost::static_visitor<::DLContext> { + inline ::DLContext operator()(const platform::CPUPlace &place) const { + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + return ctx; + } + + inline ::DLContext operator()(const platform::CUDAPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLGPU; + ctx.device_id = place.device; + return ctx; +#else + PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version"); +#endif + } + + inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLCPUPinned; + ctx.device_id = 0; + return ctx; +#else + PADDLE_THROW( + "platform::CUDAPinnedPlace is not supported in CPU only version"); +#endif + } +}; +} // namespace internal + +DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { + // init data, data buffer + t_.data = const_cast(tensor.data()); + + // init ctx, DLContext type with device_type and device_id + auto place = tensor.place(); + t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + + // init dtype + t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); + t_.dtype.lanes = lanes; + + // init ndim, tensor rank + auto &dims = tensor.dims(); + using DimType = decltype(t_.ndim); // int + t_.ndim = static_cast(dims.size()); + + // init shape, tensor dims + t_.shape = shape_; + for (DimType i = 0; i < t_.ndim; ++i) { + t_.shape[i] = dims[i]; + } + + // init strides, nullptr means the tensor is compact + t_.strides = nullptr; + + // init byte_offset + t_.byte_offset = 0; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h new file mode 100644 index 0000000000..0c52bce1ef --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class DLPackTensor { + public: + using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t + using ShapeType = + std::remove_reference::type; // int64_t + + // lanes is only used in CPU to enable vectorization + explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1); + + inline operator const ::DLTensor&() const { return t_; } + + inline operator ::DLTensor&() { return t_; } + + private: + ::DLTensor t_; + + // The shape in DLTensor is defined as int64_t* + // Add this member to make TVMTensor init without heap allocation + ShapeType shape_[9]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc new file mode 100644 index 0000000000..938b056350 --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +namespace { // NOLINT +template +constexpr uint8_t GetDLDataTypeCode() { + return std::is_same::value || + std::is_floating_point::value + ? static_cast(kDLFloat) + : (std::is_unsigned::value + ? static_cast(kDLUInt) + : (std::is_integral::value ? 
static_cast(kDLInt) + : static_cast(-1))); +} +} // NOLINT + +template +void TestMain(const platform::Place &place, uint16_t lanes) { + DDim dims{4, 5, 6, 7}; + Tensor tensor; + tensor.Resize(dims); + void *p = tensor.mutable_data(place); + + DLPackTensor dlpack_tensor(tensor, lanes); + ::DLTensor &dl_tensor = dlpack_tensor; + + CHECK_EQ(p, dl_tensor.data); + if (platform::is_cpu_place(place)) { + CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else if (platform::is_gpu_place(place)) { + CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(boost::get(place).device, + dl_tensor.ctx.device_id); + } else if (platform::is_cuda_pinned_place(place)) { + CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else { + CHECK_EQ(false, true); + } + + CHECK_EQ(dims.size(), dl_tensor.ndim); + for (auto i = 0; i < dims.size(); ++i) { + CHECK_EQ(dims[i], dl_tensor.shape[i]); + } + + CHECK_EQ(dl_tensor.strides == nullptr, true); + CHECK_EQ(static_cast(0), dl_tensor.byte_offset); + + CHECK_EQ(lanes, dl_tensor.dtype.lanes); + CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits); + + CHECK_EQ(GetDLDataTypeCode(), dl_tensor.dtype.code); +} + +template +void TestMainLoop() { +#ifdef PADDLE_WITH_CUDA + std::vector places{platform::CPUPlace(), + platform::CUDAPlace(0), + platform::CUDAPinnedPlace()}; + if (platform::GetCUDADeviceCount() > 1) { + places.emplace_back(platform::CUDAPlace(1)); + } +#else + std::vector places{platform::CPUPlace()}; +#endif + std::vector lanes{1, 2}; + for (auto &p : places) { + for (auto &l : lanes) { + TestMain(p, l); + } + } +} + +#define PADDLE_DLPACK_TEST(type) \ + TEST(dlpack, test_##type) { TestMainLoop(); } + +using float16 = platform::float16; +PADDLE_DLPACK_TEST(float16); +PADDLE_DLPACK_TEST(float); +PADDLE_DLPACK_TEST(double); +PADDLE_DLPACK_TEST(int); +PADDLE_DLPACK_TEST(int64_t); +PADDLE_DLPACK_TEST(bool); +PADDLE_DLPACK_TEST(size_t); +PADDLE_DLPACK_TEST(int16_t); +PADDLE_DLPACK_TEST(uint8_t); +PADDLE_DLPACK_TEST(int8_t); + +#undef PADDLE_DLPACK_TEST + +} // namespace framework +} // namespace paddle From 982e48922020e8d5f3ddcfc682068fcbdc5b7fe2 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 06:26:04 +0000 Subject: [PATCH 07/36] test=develop --- python/paddle/fluid/layers/nn.py | 5 +++-- python/paddle/fluid/tests/unittests/test_layers.py | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e308..32d411b830 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,8 +2139,9 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int): The side length of pooling windows. All pooling - windows are squares with pool_size on a side. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. pool_type: ${pooling_type_comment} pool_stride (int): stride of the pooling layer. pool_padding (int): padding size. 
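
A minimal usage sketch of the pool_size behavior documented above; the variable names and input shape here are illustrative assumptions, not part of the patch:

    import paddle.fluid as fluid

    # pool_size accepts either a single int (square window) or an
    # (H, W) pair, per the docstring change above.
    x = fluid.layers.data(name='x', shape=[3, 224, 224], dtype='float32')
    y = fluid.layers.pool2d(x, pool_size=[5, 3], pool_type='max')

The added unit test below exercises the same list-valued pool_size through the layers API.
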
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index a8fa5436c4..c4310fe006 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -202,6 +202,12 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) print(str(program)) + def test_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + def test_lstm_unit(self): program = Program() with program_guard(program): From 1adda8e06c075d55edcc6aa50804eab62b903f72 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Thu, 22 Nov 2018 06:53:16 +0000 Subject: [PATCH 08/36] Add more unit tests for split plugin test=develop --- .../inference/tensorrt/convert/split_op.cc | 13 ++--- .../tensorrt/convert/test_split_op.cc | 47 ++++++++++++++++--- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 871354267e..ae5b1b9806 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -19,9 +19,6 @@ namespace paddle { namespace inference { namespace tensorrt { -/* - * SplitOp. - */ class SplitOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, @@ -40,15 +37,11 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - // PADDLE_ENFORCE(axis != 0); - if (axis < 0) { - axis += input_dims.nbDims; - } else { - axis -= 1; - } + // split on batch is not supported in TensorRT + PADDLE_ENFORCE(axis != 0); + axis += (axis < 0) ? 
input_dims.nbDims : -1; PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index 23909378dd..5aacc5c600 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -59,21 +59,54 @@ void TensorRTSplitTest(const std::vector &in_shape, validator.Execute(BatchSize); } -TEST(split_op, test_same_shape_batch1) { +// batch = 0, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch1) { TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); } - -TEST(split_op, test_different_shape_batch1) { +// batch = 0, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch1) { TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); } - -TEST(split_op, test_same_shape_batch10) { +// batch = 10, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch10) { TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); } - -TEST(split_op, test_different_shape_batch10) { +// batch = 10, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch10) { TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } +// batch = 0, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2}); +} +// batch = 0, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1}); +} +// batch = 10, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2}); +} +// batch = 10, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1}); +} +// batch = 0, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2}); +} +// batch = 0, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1}); +} +// batch = 10, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2}); +} +// batch = 10, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1}); +} } // namespace tensorrt } // namespace inference From 510601b2793047858763032b7816af07ab2b2bc7 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 09:01:08 +0000 Subject: [PATCH 09/36] test=develop --- python/paddle/fluid/layers/nn.py | 10 +++++++--- python/paddle/fluid/tests/unittests/test_layers.py | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 32d411b830..27f83a60bd 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,12 +2139,16 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. 
pool_type: ${pooling_type_comment} - pool_stride (int): stride of the pooling layer. - pool_padding (int): padding size. + pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple, + it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width). + Otherwise, the pool padding size will be a square of an int. global_pooling (bool): ${global_pooling_comment} use_cudnn (bool): ${use_cudnn_comment} ceil_mode (bool): ${ceil_mode_comment} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index c4310fe006..559c9cda48 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -206,7 +206,12 @@ class TestBook(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + self.assertIsNotNone( + layers.pool2d( + x, + pool_size=[5, 3], + pool_stride=[1, 2], + pool_padding=(2, 1))) def test_lstm_unit(self): program = Program() From 83370576cd8f35e4155d94a789c886c8c264056d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 22 Nov 2018 18:52:54 +0800 Subject: [PATCH 10/36] Add sqlite3 support in Python3.6 test=develop --- tools/manylinux1/build_scripts/build_utils.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh index d97745ad2d..48cce15a14 100755 --- a/tools/manylinux1/build_scripts/build_utils.sh +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -50,6 +50,15 @@ function do_cpython_build { mkdir -p ${prefix}/lib # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 + if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then + wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz + tar -zxf sqlite-autoconf-3250300.tar.gz + cd sqlite-autoconf-3250300 + ./configure --prefix=/usr/local + make -j8 && make install + cd ../ && rm sqlite-autoconf-3250300.tar.gz + fi + # NOTE --enable-shared for generating libpython shared library needed for # linking of some of the nupic.core test executables. 
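 # The LD_LIBRARY_PATH prefixes in the pre-3.7 branch below let CPython's
 # configure and make steps find the SQLite just installed into /usr/local,
 # so the sqlite3 extension module is built against it.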
if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then @@ -59,9 +68,9 @@ function do_cpython_build { make -j8 > /dev/null make altinstall > /dev/null else - CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null - make -j8 > /dev/null - make install > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null fi popd echo "ZZZ looking for libpython" From 00b9e9a1357bb3fa6e6adceb4e650d9f6424aa2a Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 22 Nov 2018 20:40:56 +0800 Subject: [PATCH 11/36] Refine cublas to support CUBLAS_TENSOR_OP_MATH (#13929) * refine cublase test=develop * code refine * refine cublas * add GEMME_EX * add enable_cublas_tensor_op_math doc and add cublasCall test=develop * fix CublasCall for cuda version test=develop * fix error test=develop * fix GEMM_EX to be compatible with gcc 4.8 test=develop * add GEMM_EX test=develop * to compatiable with gcc4.8 test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 206 +++++++++++++++++---- paddle/fluid/platform/device_context.h | 47 +++++ paddle/fluid/platform/dynload/cublas.h | 26 ++- paddle/fluid/platform/gpu_info.cc | 20 ++ paddle/fluid/platform/gpu_info.h | 3 + python/paddle/fluid/__init__.py | 3 +- 6 files changed, 256 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d84c88cb3b..d35073029a 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -16,6 +16,9 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_bool(enable_cublas_tensor_op_math); namespace paddle { namespace operators { @@ -42,11 +45,44 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const float *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const float *beta, void *C, + cudaDataType_t Ctype, int ldc) { + // Because the gcc 4.8 doesn't expand template parameter pack that + // appears in a lambda-expression, I can not use template parameter pack + // here. + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (platform::TensorCoreAvailable() ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc)); +#else + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. 
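+    // CublasCall serializes access with a lock and restores the previous
+    // math mode after the callback returns (see ScopedCublasMathMode in
+    // device_context.h).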
+ dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -69,13 +105,18 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); #endif } + + template + static void GEMM_EX(ARGS... args) { + PADDLE_THROW("Currently there are not cublasDgemmEx."); + } }; template <> @@ -96,14 +137,16 @@ struct CUBlas { reinterpret_cast<__half *>(C), ldc)); } - static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const float16 *alpha, const float16 *A, int lda, - long long int strideA, const float16 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const float16 *beta, float16 *C, int ldc, - long long int strideC, // NOLINT - int batchCount) { + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float16 *alpha, const float16 *A, + int lda, long long int strideA, // NOLINT + const float16 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const float16 *beta, float16 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, @@ -114,6 +157,45 @@ struct CUBlas { ldc, strideC, batchCount)); #else PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const void *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const void *beta, void *C, + cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); +#else + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -133,8 +215,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, N); +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, N); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 } template <> @@ -157,30 +252,18 @@ inline void Blas::GEMM( PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, "cublas fp16 gemm requires GPU compute capability >= 53"); -#if CUDA_VERSION >= 8000 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - if (context_.GetComputeCapability() >= 70) { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH)); - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } else { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_DEFAULT_MATH)); - } -#endif // CUDA_VERSION >= 9000 - +#if CUDA_VERSION >= 8000 // cublasHgemm does true FP16 computation which is slow for non-Volta // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, - CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, - CUDA_R_32F, algo)); + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, @@ -199,8 +282,38 @@ void Blas::GEMM(bool transA, bool transB, int M, // the cblas convention. cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM( + bool transA, bool transB, int M, int N, int K, platform::float16 alpha, + const platform::float16 *A, int lda, const platform::float16 *B, int ldb, + platform::float16 beta, platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc); } template <> @@ -238,9 +351,34 @@ void Blas::BatchedGEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; - CUBlas::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, - strideC, batchCount); +#if CUDA_VERSION >= 9010 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto cublas_call = [&]() { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, + CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + }; + auto &dev_ctx = const_cast(context_); + dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + } else { +#endif // CUDA_VERSION >= 9010 + + CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, strideB, A, lda, + strideA, &beta, C, ldc, strideC, batchCount); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 } } // namespace math diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 9a9018cdea..3edd727978 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -143,6 +143,39 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; +#if CUDA_VERSION >= 9000 +class ScopedCublasMathMode { + public: + ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) + : handle_(handle) { + need_reset = false; + PADDLE_ENFORCE( + platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), + "Failed to get old cublas math mode"); + if (old_math_mode_ != new_math_mode) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, new_math_mode), + "Failed to set old cublas math mode"); + need_reset = true; + } + } + + ~ScopedCublasMathMode() { + if (need_reset) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, old_math_mode_), + "Failed to set old cublas math mode"); + } + } + + private: + cublasHandle_t handle_; + cublasMath_t old_math_mode_; + bool need_reset; +}; + +#endif + class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext { callback_manager_->Wait(); } +#if CUDA_VERSION >= 9000 + /*! \brief CublasCall may need to change cublas's config, + * but the cublas may be hold by multi-thread, so we should + * add lock here. 
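+   * The handle is switched to new_math only while the callback runs;
+   * ScopedCublasMathMode restores the previous math mode on scope exit.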
*/ + template + void CublasCall(Callback callback, cublasMath_t new_math) { + std::lock_guard guard(cublas_mtx_); + ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); + callback(); + } +#endif + private: CUDAPlace place_; @@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext { // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes mutable std::mutex callback_mtx_; std::unique_ptr callback_manager_; + + mutable std::mutex cublas_mtx_; }; template <> diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 4ea0cd7283..ff80bd525c 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -61,9 +61,6 @@ extern void *cublas_dso_handle; extern DynLoad__##__name __name #endif -#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ - DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) - #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasSaxpy_v2); \ __macro(cublasDaxpy_v2); \ @@ -93,22 +90,23 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(cublasGemmEx); \ - __macro(cublasSgemmStridedBatched); \ - __macro(cublasDgemmStridedBatched); \ - __macro(cublasCgemmStridedBatched); \ - __macro(cublasZgemmStridedBatched); \ - __macro(cublasHgemmStridedBatched); - -CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#endif -CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#if CUDA_VERSION >= 9010 +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c78f159ad2..833d48347f 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -26,6 +26,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_bool( + enable_cublas_tensor_op_math, false, + "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " + "but it may loss precision. Currently, There are two CUDA libraries that" + " use Tensor Cores, cuBLAS and cuDNN. 
cuBLAS uses Tensor Cores to speed up"
+    " GEMM computations (the matrices must be either half precision or single "
+    "precision); cuDNN uses Tensor Cores to speed up both convolutions (the "
+    "input and output must be half precision) and recurrent neural networks "
+    "(RNNs).");
+
 namespace paddle {
 namespace platform {
 
@@ -64,6 +74,16 @@ int GetCUDADriverVersion(int id) {
   return driver_version;
 }
 
+bool TensorCoreAvailable() {
+#if CUDA_VERSION >= 9000
+  int device = GetCurrentDeviceId();
+  int compute_capability = GetCUDAComputeCapability(device);
+  return compute_capability >= 70;
+#else
+  return false;
+#endif
+}
+
 int GetCUDAMultiProcessors(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must be less than GPU count");
   int count;
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index be44158431..6a0b3c8e02 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
 //! Get the driver version of the ith GPU
 int GetCUDADriverVersion(int id);
 
+//! Whether the current device supports Tensor Cores
+bool TensorCoreAvailable();
+
 //! Get the MultiProcessors of the ith GPU.
 int GetCUDAMultiProcessors(int i);
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 543acf2d34..3c092dee34 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -133,7 +133,8 @@ def __bootstrap__():
     if core.is_compiled_with_cuda():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
-            'conv_workspace_size_limit', 'cudnn_exhaustive_search'
+            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
+            'cudnn_exhaustive_search'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])

From 6cc6bf4074d69c5c0b02af612b94e438d596803a Mon Sep 17 00:00:00 2001
From: Krzysztof Binias 
Date: Thu, 22 Nov 2018 15:30:43 +0100
Subject: [PATCH 12/36] Bumped MKL-DNN version to 0.17

test=develop
---
 cmake/external/mkldnn.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 785148d4f9..b280db23b9 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -53,7 +53,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "21fb5f2af1dd14e132af4f1b79160977ee487818"
+    GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

From a902b8b0f811f6837330385b95fa2f552393197c Mon Sep 17 00:00:00 2001
From: minqiyang 
Date: Fri, 23 Nov 2018 01:07:58 +0800
Subject: [PATCH 13/36] Add sqlite3 support

test=develop
---
 tools/manylinux1/Dockerfile.x64 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index e91216a5b8..48fd145e5f 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
-    bash build_scripts/install_nccl2.sh && rm -r build_scripts
+    bash build_scripts/install_nccl2.sh && rm -rf build_scripts
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem

From 9ea1ce63192fee1a211aa5dcc6fecf4758434451 Mon Sep 17 00:00:00 2001 
From: Xin Pan 
Date: Fri, 23 Nov 2018 11:51:07 +0800
Subject: [PATCH 14/36] Update issue templates

---
 .github/ISSUE_TEMPLATE/---feature-request-.md | 27 +++++++++++++
 .github/ISSUE_TEMPLATE/---inference-issue-.md | 40 +++++++++++++++++++
 .../ISSUE_TEMPLATE/---installation-issue-.md  | 40 +++++++++++++++++++
 .github/ISSUE_TEMPLATE/---model-issue-.md     | 36 +++++++++++++++++
 .github/ISSUE_TEMPLATE/---others-.md          | 33 +++++++++++++++
 .github/ISSUE_TEMPLATE/---training-issue-.md  | 38 ++++++++++++++++++
 6 files changed, 214 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/---feature-request-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---inference-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---installation-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---model-issue-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---others-.md
 create mode 100644 .github/ISSUE_TEMPLATE/---training-issue-.md

diff --git a/.github/ISSUE_TEMPLATE/---feature-request-.md b/.github/ISSUE_TEMPLATE/---feature-request-.md
new file mode 100644
index 0000000000..57708855dc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---feature-request-.md
@@ -0,0 +1,27 @@
+---
+name: 建议(Feature request)
+about: 您可以提出您的建议。 You could use this template for reporting a suggestion issue.
+
+---
+
+欢迎您对PaddlePaddle提出建议,非常感谢您对PaddlePaddle的贡献!
+在留下您的建议时,辛苦您同步提供如下信息:
+- 版本、环境信息
+1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1
+2)CPU/GPU:您是否使用GPU进行训练,如是,请提供您的CUDA和cuDNN版本号
+3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 建议描述:请您详细描述,您认为需优化的功能
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+Please make sure that this is a feature request.
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe the feature and the current behavior/state.**
+**Any Other info.**
diff --git a/.github/ISSUE_TEMPLATE/---inference-issue-.md b/.github/ISSUE_TEMPLATE/---inference-issue-.md
new file mode 100644
index 0000000000..37bdc8889e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---inference-issue-.md
@@ -0,0 +1,40 @@
+---
+name: 预测(Inference Issue)
+about: 您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue.
+
+---
+
+为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
+- 标题:简洁、精准描述您的问题,例如“最新预测库的API文档在哪儿 ”
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID
+    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
+    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号
+    4)系统环境:请您描述系统类型、版本(如Mac OS 10.14),Python版本
+- 预测信息
+    1)C++预测:请您提供预测库安装包的版本信息,及其中的version.txt文件
+    2)CMake包含路径的完整命令
+    3)API信息(如调用请提供)
+    4)预测库来源:官网下载/特殊环境(如BCLOUD编译)
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle. 
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+If there is no solution, please make sure that this is an inference issue including the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Python version
+-Cmake orders
+-C++version.txt
+-API information
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---installation-issue-.md b/.github/ISSUE_TEMPLATE/---installation-issue-.md
new file mode 100644
index 0000000000..ce4ba58932
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---installation-issue-.md
@@ -0,0 +1,40 @@
+---
+name: 安装(Installation Issue)
+about: 您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation
+  issue.
+
+---
+
+为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+建立issue时,为快速解决问题,请您根据使用情况给出如下信息:
+- 标题:请包含关键词“安装错误”/“编译错误”,例如“Mac编译错误”
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID
+    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况
+    3)GPU:请提供GPU型号,CUDA和CUDNN版本号
+    4)系统环境:请说明系统类型、版本(如Mac OS 10.14)、Python版本
+- 安装方式信息:
+1)pip安装/docker安装
+2)本地编译:请提供cmake命令,编译命令
+3)docker编译:请提供docker镜像,编译命令            
+  特殊环境请注明:如离线安装等
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before.
+If there is no solution, please make sure that this is an installation issue including the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg. Mac OS 10.14)
+-Python version
+- Install method: pip install/install with docker/build from source(without docker)/build within docker
+- Other special cases that you think may be related to this problem, eg. offline install, special internet condition   
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---model-issue-.md b/.github/ISSUE_TEMPLATE/---model-issue-.md
new file mode 100644
index 0000000000..7cb52f37b9
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---model-issue-.md
@@ -0,0 +1,36 @@
+---
+name: 模型(Model Issue)
+about: 您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/
+  algorithm/dataset issue.
+
+---
+
+为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+建立issue时,为快速解决问题,请您根据使用情况给出如下信息:
+- 标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错  ”
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供PaddlePaddle版本号,例如1.1或CommitID
+    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况
+    3)GPU:请提供GPU型号,CUDA和CUDNN版本号
+    4)系统环境:请说明系统类型、版本(例如Mac OS 10.14),Python版本
+- 模型信息
+    1)模型名称 2)使用数据集名称 3)使用算法名称 4)模型链接
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github. Probably there was a similar issue submitted or resolved before. 
+If there is no solution, please make sure that this is an issue of models including the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Python version
+-Name of Models&Dataset/details of operator
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---others-.md b/.github/ISSUE_TEMPLATE/---others-.md
new file mode 100644
index 0000000000..6a291153e4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---others-.md
@@ -0,0 +1,33 @@
+---
+name: 其他(Others)
+about: 如上述分类未包含您的问题,可在此提出。 You could use this template for reporting other issues
+
+---
+
+为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
+- 标题:简洁、精准概括您的问题
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID
+    2)CPU/GPU:如果您使用GPU训练,请提供GPU驱动版本、CUDA和cuDNN版本号
+    3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14
+    4)Python版本号
+    5)显存信息
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
+If there is no solution, please provide us with the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/cuDNN version
+-OS Platform and Distribution (eg.Mac OS 10.14)
+-Python version
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**
diff --git a/.github/ISSUE_TEMPLATE/---training-issue-.md b/.github/ISSUE_TEMPLATE/---training-issue-.md
new file mode 100644
index 0000000000..29e8383d97
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---training-issue-.md
@@ -0,0 +1,38 @@
+---
+name: 训练(Training issue)
+about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting a training
+  issue.
+
+---
+
+为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
+
+如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
+- 标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx”
+- 版本、环境信息:
+    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID
+    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
+    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号
+    4)系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本
+- 训练信息
+    1)单机/多机,单卡/多卡
+    2)显存信息
+    3)Operator信息
+- 复现信息:如为报错,请给出复现环境、复现步骤
+- 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段
+
+Thank you for contributing to PaddlePaddle.
+Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. 
+If there is no solution, please make sure that this is a training issue including the following details:
+**System information**
+-PaddlePaddle version (eg.1.1) or CommitID
+-CPU: including CPUMKL/OpenBlas/MKLDNN version
+-GPU: including CUDA/CUDNN version
+-OS Platform (eg.Mac OS 10.14)
+-Other information: Distributed training/information of operator/
+Graphics card storage
+**To Reproduce**
+Steps to reproduce the behavior
+**Describe your current behavior**
+**Code to reproduce the issue**
+**Other info / logs**

From 36f08eef3b466001f339e2c33f47dac60bbc6821 Mon Sep 17 00:00:00 2001
From: qingqing01 
Date: Fri, 23 Nov 2018 13:04:41 +0800
Subject: [PATCH 15/36] CUDA kernel for density_prior_box_op. (#14513)

* CUDA kernel for density_prior_box_op.

* Support flatten to 2D.
---
 paddle/fluid/API.spec                         |   2 +-
 paddle/fluid/framework/op_desc.cc             |   6 +
 .../fluid/operators/detection/CMakeLists.txt  |   2 +-
 .../detection/density_prior_box_op.cc         |  36 ++--
 .../detection/density_prior_box_op.cu         | 170 ++++++++++++++++++
 .../detection/density_prior_box_op.h          |  73 ++++----
 python/paddle/fluid/layers/detection.py       |  43 +++--
 python/paddle/fluid/tests/test_detection.py   |  60 ++++---
 .../unittests/test_density_prior_box_op.py    |  30 ++--
 9 files changed, 305 insertions(+), 117 deletions(-)
 create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.cu

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 541c4db1fa..50114bf3df 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
 paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
-paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None))
+paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None))
 paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
 paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git 
a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index fbaa169df6..362cda3f23 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = std::vector(); break; } + case proto::AttrType::LONGS: { + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from LONGS to LONGS"; + this->attrs_[name] = std::vector(); + break; + } case proto::AttrType::FLOATS: { VLOG(110) << "SetAttr: " << Type() << ", " << name << " from INTS to FLOATS"; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 58f6f48467..6c85f1577e 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,7 +22,7 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) -detection_library(density_prior_box_op SRCS density_prior_box_op.cc) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 99df15c322..1012ba3652 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); auto densities = ctx->Attrs().Get>("densities"); + bool flatten = ctx->Attrs().Get("flatten_to_2d"); PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), "The number of fixed_sizes and densities must be equal."); size_t num_priors = 0; - if ((fixed_sizes.size() > 0) && (densities.size() > 0)) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + if (!flatten) { + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } else { + int64_t dim0 = input_dims[2] * input_dims[3] * num_priors; + ctx->SetOutputDim("Boxes", {dim0, 4}); + ctx->SetOutputDim("Variances", {dim0, 4}); } - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_priors; - dim_vec[3] = 4; - ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); - ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); } protected: @@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + ctx.GetPlace()); } }; @@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public 
framework::OpProtoAndCheckerMaker { }); AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") .SetDefault(true); - + AddAttr("flatten_to_2d", + "(bool) Whether to flatten to 2D and " + "the second dim is 4.") + .SetDefault(false); AddAttr( "step_w", "Density prior boxes step across width, 0.0 for auto calculation.") diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu new file mode 100644 index 0000000000..3b7c781795 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +template +static __device__ inline T Clip(T in) { + return min(max(in, 0.), 1.); +} + +template +static __global__ void GenDensityPriorBox( + const int height, const int width, const int im_height, const int im_width, + const T offset, const T step_width, const T step_height, + const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, + const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { + int gidx = blockIdx.x * blockDim.x + threadIdx.x; + int gidy = blockIdx.y * blockDim.y + threadIdx.y; + int step_x = blockDim.x * gridDim.x; + int step_y = blockDim.y * gridDim.y; + + const T* width_ratio = ratios_shift; + const T* height_ratio = ratios_shift + num_priors; + const T* width_shift = ratios_shift + 2 * num_priors; + const T* height_shift = ratios_shift + 3 * num_priors; + + for (int j = gidy; j < height; j += step_y) { + for (int i = gidx; i < width * num_priors; i += step_x) { + int h = j; + int w = i / num_priors; + int k = i % num_priors; + + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + + T center_x_temp = center_x + width_shift[k]; + T center_y_temp = center_y + height_shift[k]; + + T box_width_ratio = width_ratio[k] / 2.; + T box_height_ratio = height_ratio[k] / 2.; + + T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.); + T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.); + T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.); + T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.); + + int out_offset = (j * width * num_priors + i) * 4; + out[out_offset] = is_clip ? Clip(xmin) : xmin; + out[out_offset + 1] = is_clip ? Clip(ymin) : ymin; + out[out_offset + 2] = is_clip ? Clip(xmax) : xmax; + out[out_offset + 3] = is_clip ? 
Clip(ymax) : ymax;
+
+      var[out_offset] = var_xmin;
+      var[out_offset + 1] = var_ymin;
+      var[out_offset + 2] = var_xmax;
+      var[out_offset + 3] = var_ymax;
+    }
+  }
+}
+
+template
+class DensityPriorBoxOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input("Input");
+    auto* image = ctx.Input("Image");
+    auto* boxes = ctx.Output("Boxes");
+    auto* vars = ctx.Output("Variances");
+
+    auto variances = ctx.Attr>("variances");
+    auto is_clip = ctx.Attr("clip");
+
+    auto fixed_sizes = ctx.Attr>("fixed_sizes");
+    auto fixed_ratios = ctx.Attr>("fixed_ratios");
+    auto densities = ctx.Attr>("densities");
+
+    T step_w = static_cast(ctx.Attr("step_w"));
+    T step_h = static_cast(ctx.Attr("step_h"));
+    T offset = static_cast(ctx.Attr("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast(img_width) / feature_width;
+      step_height = static_cast(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = 0;
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
+    }
+    int step_average = static_cast((step_width + step_height) * 0.5);
+
+    framework::Tensor h_temp;
+    T* tdata = h_temp.mutable_data({num_priors * 4}, platform::CPUPlace());
+    int idx = 0;
+    for (size_t s = 0; s < fixed_sizes.size(); ++s) {
+      auto fixed_size = fixed_sizes[s];
+      int density = densities[s];
+      for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+        float ar = fixed_ratios[r];
+        int shift = step_average / density;
+        float box_width_ratio = fixed_size * sqrt(ar);
+        float box_height_ratio = fixed_size / sqrt(ar);
+        for (int di = 0; di < density; ++di) {
+          for (int dj = 0; dj < density; ++dj) {
+            float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
+            float center_y_temp = shift / 2. + di * shift - step_average / 2.;
+            tdata[idx] = box_width_ratio;
+            tdata[num_priors + idx] = box_height_ratio;
+            tdata[2 * num_priors + idx] = center_x_temp;
+            tdata[3 * num_priors + idx] = center_y_temp;
+            idx++;
+          }
+        }
+      }
+    }
+
+    boxes->mutable_data(ctx.GetPlace());
+    vars->mutable_data(ctx.GetPlace());
+
+    framework::Tensor d_temp;
+    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
+
+    // At least use 32 threads, at most 512 threads.
+    // blockx is a multiple of 32. 
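+    // Worked example (editor's note; figures borrowed from the unit test
+    // later in this patch: feature_width = 17, num_priors = 21):
+    // 17 * 21 = 357, ((357 + 31) >> 5) << 5 = 384, so blockx =
+    // min(384, 512) = 384 and gridx = (357 + 383) / 384 = 1.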
+    int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
+    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
+    dim3 threads(blockx, 1);
+    dim3 grids(gridx, feature_height);
+
+    auto stream =
+        ctx.template device_context().stream();
+    GenDensityPriorBox<<>>(
+        feature_height, feature_width, img_height, img_width, offset,
+        step_width, step_height, num_priors, d_temp.data(), is_clip,
+        variances[0], variances[1], variances[2], variances[3],
+        boxes->data(), vars->data());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(density_prior_box,
+                        ops::DensityPriorBoxOpCUDAKernel,
+                        ops::DensityPriorBoxOpCUDAKernel);
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index 9a52077e9c..ed2f5df80c 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel {
       step_height = step_h;
     }
     int num_priors = 0;
-    if (fixed_sizes.size() > 0 && densities.size() > 0) {
-      for (size_t i = 0; i < densities.size(); ++i) {
-        if (fixed_ratios.size() > 0) {
-          num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
-        }
-      }
+    for (size_t i = 0; i < densities.size(); ++i) {
+      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
     }
 
     boxes->mutable_data(ctx.GetPlace());
     vars->mutable_data(ctx.GetPlace());
 
-    auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0);
+    auto box_dim = vars->dims();
+    boxes->Resize({feature_height, feature_width, num_priors, 4});
+    auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0);
 
     int step_average = static_cast((step_width + step_height) * 0.5);
 
     for (int h = 0; h < feature_height; ++h) {
@@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel {
           auto fixed_size = fixed_sizes[s];
           int density = densities[s];
           // Generate density prior boxes with fixed ratios.
-          if (fixed_ratios.size() > 0) {
-            for (size_t r = 0; r < fixed_ratios.size(); ++r) {
-              float ar = fixed_ratios[r];
-              int shift = step_average / density;
-              float box_width_ratio = fixed_size * sqrt(ar);
-              float box_height_ratio = fixed_size / sqrt(ar);
-              for (int di = 0; di < density; ++di) {
-                for (int dj = 0; dj < density; ++dj) {
-                  float center_x_temp =
-                      center_x - step_average / 2. + shift / 2. + dj * shift;
-                  float center_y_temp =
-                      center_y - step_average / 2. + shift / 2. + di * shift;
-                  e_boxes(h, w, idx, 0) =
-                      (center_x_temp - box_width_ratio / 2.) / img_width >= 0
-                          ? (center_x_temp - box_width_ratio / 2.) / img_width
-                          : 0;
-                  e_boxes(h, w, idx, 1) =
-                      (center_y_temp - box_height_ratio / 2.) / img_height >= 0
-                          ? (center_y_temp - box_height_ratio / 2.) / img_height
-                          : 0;
-                  e_boxes(h, w, idx, 2) =
-                      (center_x_temp + box_width_ratio / 2.) / img_width <= 1
-                          ? (center_x_temp + box_width_ratio / 2.) / img_width
-                          : 1;
-                  e_boxes(h, w, idx, 3) =
-                      (center_y_temp + box_height_ratio / 2.) / img_height <= 1
-                          ? (center_y_temp + box_height_ratio / 2.) / img_height
-                          : 1;
-                  idx++;
-                }
+          for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+            float ar = fixed_ratios[r];
+            int shift = step_average / density;
+            float box_width_ratio = fixed_size * sqrt(ar);
+            float box_height_ratio = fixed_size / sqrt(ar);
+            for (int di = 0; di < density; ++di) {
+              for (int dj = 0; dj < density; ++dj) {
+                float center_x_temp =
+                    center_x - step_average / 2. + shift / 2. + dj * shift;
+                float center_y_temp =
+                    center_y - step_average / 2. + shift / 2. + di * shift;
+                e_boxes(h, w, idx, 0) =
+                    (center_x_temp - box_width_ratio / 2.) / img_width >= 0
+                        ? (center_x_temp - box_width_ratio / 2.) / img_width
+                        : 0;
+                e_boxes(h, w, idx, 1) =
+                    (center_y_temp - box_height_ratio / 2.) / img_height >= 0
+                        ? (center_y_temp - box_height_ratio / 2.) / img_height
+                        : 0;
+                e_boxes(h, w, idx, 2) =
+                    (center_x_temp + box_width_ratio / 2.) / img_width <= 1
+                        ? (center_x_temp + box_width_ratio / 2.) / img_width
+                        : 1;
+                e_boxes(h, w, idx, 3) =
+                    (center_y_temp + box_height_ratio / 2.) 
/ img_height
+                        ? (center_y_temp + box_height_ratio / 2.) / img_height
+                        : 1;
+                idx++;
               }
             }
           }
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel {
 
     e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1));
     vars->Resize(var_dim);
+    boxes->Resize(box_dim);
   }
 };  // namespace operators
 
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3f17400a14..4843af8340 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
                       clip=False,
                       steps=[0.0, 0.0],
                       offset=0.5,
+                      flatten_to_2d=False,
                       name=None):
     """
     **Density Prior Box Operator**
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
             height/width of the input will be automatically calculated.
             Default: [0., 0.]
         offset(float): Prior boxes center offset. Default: 0.5
+        flatten_to_2d(bool): Whether to flatten output prior boxes and variance
+           to 2D shape, where the second dim is 4. Default: False.
        name(str): Name of the density prior box op. Default: None.
 
     Returns:
         tuple: A tuple with two Variable (boxes, variances)
 
         boxes: the output density prior boxes of PriorBox.
-            The layout is [H, W, num_priors, 4].
-            H is the height of input, W is the width of input,
-            num_priors is the total
-            box count of each position of input.
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input,
+            num_priors is the total box count of each position of input.
 
         variances: the expanded variances of PriorBox.
-            The layout is [H, W, num_priors, 4].
-            H is the height of input, W is the width of input
-            num_priors is the total
-            box count of each position of input
+            The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
+            The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
+            H is the height of input, W is the width of input,
+            num_priors is the total box count of each position of input. 
Examples: @@ -1089,14 +1092,11 @@ def density_prior_box(input, box, var = fluid.layers.density_prior_box( input=conv1, image=images, - min_sizes=[100.], - max_sizes=[200.], - aspect_ratios=[1.0, 1.0 / 2.0, 2.0], - densities=[3, 4], - fixed_sizes=[50., 60.], - fixed_ratios=[1.0, 3.0, 1.0 / 3.0], - flip=True, - clip=True) + densities=[4, 2, 1], + fixed_sizes=[32.0, 64.0, 128.0], + fixed_ratios=[1.], + clip=True, + flatten_to_2d=True) """ helper = LayerHelper("density_prior_box", **locals()) dtype = helper.input_dtype() @@ -1127,14 +1127,11 @@ def density_prior_box(input, 'step_w': steps[0], 'step_h': steps[1], 'offset': offset, + 'densities': densities, + 'fixed_sizes': fixed_sizes, + 'fixed_ratios': fixed_ratios, + 'flatten_to_2d': flatten_to_2d, } - if densities is not None and len(densities) > 0: - attrs['densities'] = densities - if fixed_sizes is not None and len(fixed_sizes) > 0: - attrs['fixed_sizes'] = fixed_sizes - if fixed_ratios is not None and len(fixed_ratios) > 0: - attrs['fixed_ratios'] = fixed_ratios - box = helper.create_variable_for_type_inference(dtype) var = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 982d291801..a2eca5541a 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -112,38 +112,42 @@ class TestDetection(unittest.TestCase): class TestPriorBox(unittest.TestCase): def test_prior_box(self): - data_shape = [3, 224, 224] - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d(images, 3, 3, 2) - box, var = layers.prior_box( - input=conv1, - image=images, - min_sizes=[100.0], - aspect_ratios=[1.], - flip=True, - clip=True) - assert len(box.shape) == 4 - assert box.shape == var.shape - assert box.shape[3] == 4 + program = Program() + with program_guard(program): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.prior_box( + input=conv1, + image=images, + min_sizes=[100.0], + aspect_ratios=[1.], + flip=True, + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[3] == 4 class TestDensityPriorBox(unittest.TestCase): def test_density_prior_box(self): - data_shape = [3, 224, 224] - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d(images, 3, 3, 2) - box, var = layers.density_prior_box( - input=conv1, - image=images, - densities=[3, 4], - fixed_sizes=[50., 60.], - fixed_ratios=[1.0], - clip=True) - assert len(box.shape) == 4 - assert box.shape == var.shape - assert box.shape[3] == 4 + program = Program() + with program_guard(program): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.density_prior_box( + input=conv1, + image=images, + densities=[3, 4], + fixed_sizes=[50., 60.], + fixed_ratios=[1.0], + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[-1] == 4 class TestAnchorGenerator(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py index 79d1fd3d71..4b0bc1dcf8 100644 --- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py 
+++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
             'offset': self.offset,
             'densities': self.densities,
             'fixed_sizes': self.fixed_sizes,
-            'fixed_ratios': self.fixed_ratios
+            'fixed_ratios': self.fixed_ratios,
+            'flatten_to_2d': self.flatten_to_2d
         }
         self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
 
@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
         self.set_data()
 
     def set_density(self):
-        self.densities = []
-        self.fixed_sizes = []
-        self.fixed_ratios = []
+        self.densities = [4, 2, 1]
+        self.fixed_sizes = [32.0, 64.0, 128.0]
+        self.fixed_ratios = [1.0]
+        self.layer_w = 17
+        self.layer_h = 17
+        self.image_w = 533
+        self.image_h = 533
+        self.flatten_to_2d = False
 
     def init_test_params(self):
-        self.layer_w = 32
-        self.layer_h = 32
-
-        self.image_w = 40
-        self.image_h = 40
+        self.set_density()
 
         self.step_w = float(self.image_w) / float(self.layer_w)
         self.step_h = float(self.image_h) / float(self.layer_h)
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
         self.variances = [0.1, 0.1, 0.2, 0.2]
         self.variances = np.array(self.variances, dtype=np.float).flatten()
 
-        self.set_density()
-
         self.clip = True
         self.num_priors = 0
         if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
             (self.layer_h, self.layer_w, self.num_priors, 1))
         self.out_boxes = out_boxes.astype('float32')
         self.out_var = out_var.astype('float32')
+        if self.flatten_to_2d:
+            self.out_boxes = self.out_boxes.reshape((-1, 4))
+            self.out_var = self.out_var.reshape((-1, 4))
 
 
 class TestDensityPriorBox(TestDensityPriorBoxOp):
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
         self.densities = [3, 4]
         self.fixed_sizes = [1.0, 2.0]
         self.fixed_ratios = [1.0]
+        self.layer_w = 32
+        self.layer_h = 32
+        self.image_w = 40
+        self.image_h = 40
+        self.flatten_to_2d = True
 
 
 if __name__ == '__main__':

From 61c5f13fcf92c18f30c05a90e3d3badd884f9340 Mon Sep 17 00:00:00 2001
From: sabreshao 
Date: Fri, 23 Nov 2018 14:27:39 +0800
Subject: [PATCH 16/36] Fix cmake for AMDGPU platform (#13801)

* HIP cmake.
Enable whole archive build for pybind library.
Disable two warnings.
Rollback to C++11.
Link RCCL to work around a gpu kernel loading issue.
Update eigen to fix build failure.
Add more include directories.
Fix O3 build failure.
Update eigen.
fix tensor_util_test segmentation fault issue
add more macro checks in hip.cmake.
we may consider refining hip.cmake to inherit all add_definitions() in parent scope, in the future.
Fix rocRAND load.
Update eigen to fix gru_unit_op and reduce_op.
Add HIP support to testing.
Update eigen to support int16 and int8 in arg min and arg max.

* add rocprim as cub library used by nv implementation

* Reduce build time in rocprim.

* Add rocprim introduction, remove useless cmake code.

* Remove useless flags and format cmake file. 
--- CMakeLists.txt | 1 + cmake/external/eigen.cmake | 2 +- cmake/external/rocprim.cmake | 44 +++++++++++++++++++++++++++++ cmake/flags.cmake | 3 ++ cmake/generic.cmake | 26 +++++++++-------- cmake/hip.cmake | 32 +++++++++++++++++---- paddle/fluid/pybind/CMakeLists.txt | 4 +-- paddle/testing/paddle_gtest_main.cc | 2 +- 8 files changed, 94 insertions(+), 20 deletions(-) create mode 100644 cmake/external/rocprim.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 3059ab7e0e..8dcf9786e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,6 +204,7 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) +include(external/rocprim) include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f0..6aef97f212 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -17,7 +17,7 @@ if(WITH_AMD_GPU) extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake new file mode 100644 index 0000000000..914c064918 --- /dev/null +++ b/cmake/external/rocprim.cmake @@ -0,0 +1,44 @@ +if (NOT WITH_AMD_GPU) + return() +endif() + +# rocprim is "ROCm Parallel Primitives" for short. +# It is a header-only library providing HIP and HC parallel primitives +# for developing performant GPU-accelerated code on AMD ROCm platform. + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +INCLUDE(ExternalProject) + +SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim) +SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim) +SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_rocprim + GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git" + GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc + PREFIX ${ROCPRIM_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc + CMAKE_ARGS -DONLY_INSTALL=ON + CMAKE_ARGS -DBUILD_TEST=OFF + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR} + + INSTALL_DIR ${ROCPRIM_INSTALL_DIR} + ${EXTERNAL_PROJECT_LOG_ARGS} +) + +INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR}) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";") + add_library(rocprim STATIC ${dummyfile}) +else() + add_library(rocprim INTERFACE) +endif() + +add_dependencies(rocprim extern_rocprim) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 343e44ab4b..c4472040ce 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -129,6 +129,9 @@ set(COMMON_FLAGS -Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE + -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 + -Wimplicit-fallthrough=0 # Warning in tinyformat.h + -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 ) set(GPU_COMMON_FLAGS diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 111627a932..7d803d00ef 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -454,25 +454,29 @@ 
function(hip_library TARGET_NAME)
     else()
       add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
       set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
-      target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
-      find_fluid_modules(${TARGET_NAME})
+      target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so)
+      find_fluid_modules(${TARGET_NAME})
     endif()
-    if (hip_library_DEPS)
-      add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
-      target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+    if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
+      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+      # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+      target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
+    else()
+      target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
     endif()
     # cpplint code style
     foreach(source_file ${hip_library_SRCS})
-      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-      endif()
+      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+      endif()
     endforeach()
   else(hip_library_SRCS)
     if (hip_library_DEPS)
-      merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
     else()
-      message(FATAL "Please specify source file or library in nv_library.")
+      message(FATAL_ERROR "Please specify source file or library in hip_library.")
     endif()
   endif(hip_library_SRCS)
   endif()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index bfe491bd6b..4276bc5b08 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
 endif()
 
 include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hip/include")
+include_directories("/opt/rocm/miopen/include")
 include_directories("/opt/rocm/hipblas/include")
 include_directories("/opt/rocm/hiprand/include")
 include_directories("/opt/rocm/rocrand/include")
@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")
 
 list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
 
-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
 
 if(WITH_DSO)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
 endif(WITH_DSO)
 
-if(WITH_DOUBLE)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
-endif(WITH_DOUBLE)
-
 if(WITH_TESTING)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
 endif(WITH_TESTING)
 
+if(WITH_DISTRIBUTE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
+endif(WITH_DISTRIBUTE)
+
+if(WITH_GRPC)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
+endif(WITH_GRPC)
+
+if(NOT WITH_GOLANG)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
+endif(NOT WITH_GOLANG)
+
+if(WITH_MKLDNN)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
+endif(WITH_MKLDNN)
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
+
+if(NOT WITH_RDMA)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
+endif(NOT WITH_RDMA)
+
+
+ 
if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb6ee2f4a5..25d241d976 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -5,8 +5,8 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + DEPS ARCHIVE_START ${PYBIND_DEPS} + ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END) else() cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 598f435461..babb862122 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,7 +28,7 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); #else From e21edb26f6e7fb364597c31a26f128c3c2710516 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 22 Nov 2018 17:53:13 +0800 Subject: [PATCH 17/36] add Set/GetCPUNumThreads api --- paddle/fluid/inference/api/analysis_config.cc | 1 + paddle/fluid/inference/api/analysis_predictor.cc | 3 +-- paddle/fluid/inference/api/api_impl.cc | 3 +-- paddle/fluid/inference/api/paddle_api.h | 9 +++++++++ .../inference/tests/api/analyzer_resnet50_tester.cc | 1 + paddle/fluid/inference/tests/api/config_printer.h | 2 ++ paddle/fluid/inference/tests/api/tester_helper.h | 1 + paddle/fluid/operators/math/fc_compute.h | 4 +--- paddle/fluid/platform/cpu_helper.cc | 2 +- 9 files changed, 18 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 5ccd2dc5ab..100ee0c9d3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_num_threads_ = other.cpu_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cb14d2a260..9162ccefd8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -35,7 +35,6 @@ #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); -DECLARE_int32(paddle_num_threads); namespace paddle { @@ -67,7 +66,7 @@ bool AnalysisPredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); if (!PrepareScope(parent_scope)) { return false; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index fcbc3803d0..c3d17edea4 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -28,7 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); -DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -76,7 +75,7 @@ bool NativePaddlePredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 0a2a2a1a23..b7f7781d06 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -186,6 +186,15 @@ struct NativeConfig : public PaddlePredictor::Config { // Specify the variable's name of each input if input tensors don't follow the // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; + + // Set and get the number of cpu threads. + void SetCPUNumThreads(int cpu_num_threads) { + cpu_num_threads_ = cpu_num_threads; + } + int GetCPUNumThreads() const { return cpu_num_threads_; } + + protected: + int cpu_num_threads_{1}; // number of cpu threads for each instance. }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 2b936175ed..308a794ca3 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; + cfg->SetCPUNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index aa0c4b1d04..a803f5b3f4 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; + os << GenSpaces(num_spaces) + << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7b686045a5..fdadd59049 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index b072b4c20a..5b9953a5aa 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -17,8 +17,6 @@ limitations under the License. 
*/
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/jit_kernel.h"
 
-DECLARE_int32(paddle_num_threads);
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT& blas, const int M,
           .template Get>(N);
 
 #ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#pragma omp parallel for
 #endif
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index f2d691b293..b737a6c38d 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
 #elif defined(PADDLE_WITH_MKLML)
   int real_num_threads = num_threads > 1 ? num_threads : 1;
   platform::dynload::MKL_Set_Num_Threads(real_num_threads);
-  omp_set_num_threads(num_threads);
+  omp_set_num_threads(real_num_threads);
 #else
   PADDLE_ENFORCE(false, "To be implemented.");
 #endif

From a5c4b463c962bed48fba89d459adf82f4899d6c3 Mon Sep 17 00:00:00 2001
From: luotao1 
Date: Thu, 22 Nov 2018 18:37:33 +0800
Subject: [PATCH 18/36] add SetMKLDNNThreadId api

---
 paddle/fluid/inference/api/analysis_predictor.cc    | 8 ++++++++
 paddle/fluid/inference/api/analysis_predictor.h     | 2 ++
 paddle/fluid/inference/api/paddle_analysis_config.h | 2 +-
 paddle/fluid/inference/tests/api/tester_helper.h    | 9 ++++++---
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9162ccefd8..4633a75e5e 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -159,6 +159,14 @@ bool AnalysisPredictor::PrepareExecutor() {
   return true;
 }
 
+void AnalysisPredictor::SetMKLDNNThreadId(int tid) {
+#ifdef PADDLE_WITH_MKLDNN
+  platform::set_cur_thread_id(tid);
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
+#endif
+}
+
 bool AnalysisPredictor::Run(const std::vector &inputs,
                             std::vector *output_data,
                             int batch_size) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index cf81b7db73..9191970a3a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
   framework::Scope *scope() { return scope_.get(); }
   framework::ProgramDesc &program() { return *inference_program_; }
 
+  void SetMKLDNNThreadId(int tid);
+
 protected:
   bool PrepareProgram(const std::shared_ptr &program);
   bool PrepareScope(const std::shared_ptr &parent_scope);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2ac736df7c..a09bd1cac2 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
                      int max_batch_size = 1);
   bool use_tensorrt() const { return use_tensorrt_; }
 
+  void EnableMKLDNN();
   // NOTE this is just for internal development, please do not use it.
   // NOT stable yet. 
-  void EnableMKLDNN();
   bool use_mkldnn() const { return use_mkldnn_; }
 
   friend class ::paddle::AnalysisPredictor;
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index fdadd59049..72703bc80b 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -216,13 +216,16 @@ void TestMultiThreadPrediction(
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
-#ifdef PADDLE_WITH_MKLDNN
-      platform::set_cur_thread_id(static_cast(tid) + 1);
-#endif
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector outputs_tid;
       auto &predictor = predictors[tid];
+#ifdef PADDLE_WITH_MKLDNN
+      if (use_analysis) {
+        static_cast(predictor.get())
+            ->SetMKLDNNThreadId(static_cast(tid) + 1);
+      }
+#endif
 
       // warmup run
       LOG(INFO) << "Running thread " << tid << ", warm up run...";

From e66b4c6bff74231898cbbb013627b0eb86eced0f Mon Sep 17 00:00:00 2001
From: luotao1 
Date: Thu, 22 Nov 2018 18:49:59 +0800
Subject: [PATCH 19/36] adjust tester_helper to make multi-instance multi-thread
 work

test=develop
---
 paddle/fluid/inference/tests/api/tester_helper.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 72703bc80b..d21567ac19 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -207,11 +207,7 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector threads;
-  std::vector> predictors;
-  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
-  for (int tid = 1; tid < num_threads; ++tid) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
+  auto main_predictor = CreateTestPredictor(config, use_analysis);
 
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
@@ -219,7 +215,9 @@
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
      std::vector outputs_tid;
-      auto &predictor = predictors[tid];
+      // To ensure that threads bind correctly,
+      // clone the predictor inside the thread pool. 
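+      // (Editor's note, illustrative rationale: cloning per thread gives
+      // each thread a private predictor, so the per-thread MKL-DNN id set
+      // via SetMKLDNNThreadId below stays isolated per instance.)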
+ auto predictor = main_predictor->Clone(); #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) From 116979a40adf7fe7788f8cd50b9f03c57bcbba7b Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 23 Nov 2018 16:17:56 +0800 Subject: [PATCH 20/36] refine api name test=develop --- paddle/fluid/inference/api/analysis_config.cc | 3 ++- paddle/fluid/inference/api/analysis_predictor.cc | 4 ++-- paddle/fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/api/paddle_api.h | 14 +++++++++----- .../tests/api/analyzer_resnet50_tester.cc | 2 +- paddle/fluid/inference/tests/api/config_printer.h | 2 +- paddle/fluid/inference/tests/api/tester_helper.h | 2 +- 8 files changed, 18 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 100ee0c9d3..dd75f0d9a6 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,7 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; - cpu_num_threads_ = other.cpu_num_threads_; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; @@ -73,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4633a75e5e..c132ce326c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -66,7 +66,7 @@ bool AnalysisPredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (!PrepareScope(parent_scope)) { return false; @@ -159,7 +159,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -void AnalysisPredictor::SetMKLDNNThreadId(int tid) { +void AnalysisPredictor::SetMkldnnThreadID(int tid) { #ifdef PADDLE_WITH_MKLDNN platform::set_cur_thread_id(tid); #else diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9191970a3a..db57812bc3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -69,7 +69,7 @@ class AnalysisPredictor : public PaddlePredictor { framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } - void SetMKLDNNThreadId(int tid); + void SetMkldnnThreadID(int tid); protected: bool PrepareProgram(const std::shared_ptr &program); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c3d17edea4..66a8e51396 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -75,7 +75,7 @@ bool NativePaddlePredictor::Init( #endif // no matter with or without MKLDNN - 
paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index b7f7781d06..1513a4b3b4 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -187,14 +187,18 @@ struct NativeConfig : public PaddlePredictor::Config { // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; - // Set and get the number of cpu threads. - void SetCPUNumThreads(int cpu_num_threads) { - cpu_num_threads_ = cpu_num_threads; + // Set and get the number of cpu math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + } + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; } - int GetCPUNumThreads() const { return cpu_num_threads_; } protected: - int cpu_num_threads_{1}; // number of cpu threads for each instance. + // number of cpu math library (such as MKL, OpenBlas) threads for each + // instance. + int cpu_math_library_num_threads_{1}; }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 308a794ca3..abc63577b7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,7 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; - cfg->SetCPUNumThreads(FLAGS_paddle_num_threads); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index a803f5b3f4..4231eef722 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -54,7 +54,7 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; os << GenSpaces(num_spaces) - << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n"; + << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d21567ac19..1dc1678406 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -221,7 +221,7 @@ void TestMultiThreadPrediction( #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) - ->SetMKLDNNThreadId(static_cast(tid) + 1); + ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif From c35bf3d34b43dd6cc5b96e963f8990d60a68d749 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 16:36:54 +0800 Subject: [PATCH 21/36] Fix multiclass_nms_op unit test fail in python3.6 test=develop --- .../fluid/tests/unittests/test_multiclass_nms_op.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py 
b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index df0562dcc7..e35be54b63 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -145,10 +145,16 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, lod.append(nmsed_num) if nmsed_num == 0: continue + tmp_det_out = [] for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + tmp_det_out.append( + [c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + sorted_det_out = sorted( + tmp_det_out, key=lambda tup: tup[0], reverse=False) + det_outs.extend(sorted_det_out) return det_outs, lod @@ -210,7 +216,7 @@ class TestMulticlassNMSOp(OpTest): class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): def set_argument(self): # Here set 2.0 to test the case there is no outputs. - # In practical use, 0.0 < score_threshold < 1.0 + # In practical use, 0.0 < score_threshold < 1.0 self.score_threshold = 2.0 From 5ca56cad1f3fdb50f7d019ae5e658b538f98aecc Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 23 Nov 2018 08:54:09 +0000 Subject: [PATCH 22/36] test=develop --- python/paddle/fluid/layers/control_flow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 9730fbf510..05138bf945 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table): def increment(x, value=1.0, in_place=True): """ - This function performs an operation that increments each value in the + This function performs an operation that increments the value in the input :math:`x` by an amount: :math:`value` as mentioned in the input - parameter. This operation is performed in-place by default. + parameter. This operation is performed in-place by default. Notice that + the number of elements in :math:`x` must be equal to 1. Args: x (Variable|list): The tensor that has the input values. @@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True): Examples: .. 
code-block:: python - data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32') + data = fluid.layers.data(name='data', shape=[1], dtype='float32', + append_batch_size=False) data = fluid.layers.increment(x=data, value=3.0, in_place=True) """ helper = LayerHelper("increment", **locals()) From f7847ca6a304a649982c04bff4f3eec846a06c5d Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 23 Nov 2018 17:09:14 +0800 Subject: [PATCH 23/36] fix cublas warp error test=develop --- paddle/fluid/platform/dynload/cublas.cc | 3 +++ paddle/fluid/platform/dynload/cublas.h | 30 ++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc index 361d3439b8..41648c32fe 100644 --- a/paddle/fluid/platform/dynload/cublas.cc +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); #endif +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 +CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index ff80bd525c..ced789b90d 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -90,23 +90,33 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); +#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(cublasGemmEx); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasHgemmStridedBatched); + +CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + __macro(cublasSetMathMode); \ + __macro(cublasGetMathMode); + +CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif +// APIs available after CUDA 9.1 #if CUDA_VERSION >= 9010 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); +#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + __macro(cublasGemmBatchedEx); \ + __macro(cublasGemmStridedBatchedEx); + +CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP From 5431d5c471d32a5ea3be049a339e57262bd3b483 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 18:15:06 +0800 Subject: [PATCH 24/36] Polish code test=develop --- python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index e35be54b63..9778bd694d 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ 
-149,7 +149,6 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] - det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) tmp_det_out.append( [c, scores[n][c][idx], xmin, ymin, xmax, ymax]) sorted_det_out = sorted( From 3d100b0c927b6326e75b3e493d545ee2b0ff4f4b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 19:14:46 +0800 Subject: [PATCH 25/36] Add Python3.6 Python3.7 compile process test=develop --- Dockerfile | 75 +++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/Dockerfile b/Dockerfile index b36102175c..eb7bb2549e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,44 +33,49 @@ RUN apt-get update && \ automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ + build-essential checkinstall \ + libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y -# Install Go and glide -RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - tar -xz -C /usr/local && \ - mkdir /root/gopath && \ - mkdir /root/gopath/bin && \ - mkdir /root/gopath/src -ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin -# install glide -RUN curl -s -q https://glide.sh/get | sh - -# Install TensorRT -# following TensorRT.tar.gz is not the default official one, we do two miny changes: -# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, -# and its size is only one-third of the official one. -# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. -# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - tar -xz -C /usr/local && \ - cp -rf /usr/local/TensorRT/include /usr && \ - cp -rf /usr/local/TensorRT/lib /usr - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter -# version util jupyter fixes this issue. - -# specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# version(1.7.1 for now), which causes building documentation failed. +COPY tools/manylinux1/build_scripts/* /root/python/ +RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0 + +# # Install Go and glide +# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + # tar -xz -C /usr/local && \ + # mkdir /root/gopath && \ + # mkdir /root/gopath/bin && \ + # mkdir /root/gopath/src +# ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
+# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# # install glide +# RUN curl -s -q https://glide.sh/get | sh + +# # Install TensorRT +# # following TensorRT.tar.gz is not the default official one, we do two miny changes: +# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# # and its size is only one-third of the official one. +# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ + # tar -xz -C /usr/local && \ + # cp -rf /usr/local/TensorRT/include /usr && \ + # cp -rf /usr/local/TensorRT/lib /usr + +# # git credential to skip password typing +# RUN git config --global credential.helper store + +# # Fix locales to en_US.UTF-8 +# RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# # version util jupyter fixes this issue. + +# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# # version(1.7.1 for now), which causes building documentation failed. # RUN pip3 install -U wheel && \ # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ From 64ca3d176cc1348f0735e3e6f4fd2c18e902f43b Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 23 Nov 2018 19:18:20 +0800 Subject: [PATCH 26/36] Add bias_attr in sequence_conv_pool API. (#14553) --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/nets.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 50114bf3df..8397ae093b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) -paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, 
keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 00d33b36fc..fb75ef62d0 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -250,7 +250,8 @@ def sequence_conv_pool(input, filter_size, param_attr=None, act="sigmoid", - pool_type="max"): + pool_type="max", + bias_attr=None): """ The sequence_conv_pool is composed with Sequence Convolution and Pooling. @@ -266,6 +267,11 @@ def sequence_conv_pool(input, pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling. Default :math:`max`. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, sequence_conv + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. Return: Variable: The final result after Sequence Convolution and Pooling. @@ -289,6 +295,7 @@ def sequence_conv_pool(input, num_filters=num_filters, filter_size=filter_size, param_attr=param_attr, + bias_attr=bias_attr, act=act) pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type) From e8a8f2626cc65b8dc3a91507eb233581e8e7e0e2 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 16:58:57 +0800 Subject: [PATCH 27/36] Add Python3.6 and Python3.7 support in Ubuntu Dockerfile test=develop --- Dockerfile | 200 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 118 insertions(+), 82 deletions(-) diff --git a/Dockerfile b/Dockerfile index eb7bb2549e..6f45c79f3a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,27 @@ ENV HOME /root # Add bash enhancements COPY ./paddle/scripts/docker/root/ /root/ +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ + wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + RUN apt-get update && \ apt-get install -y --allow-downgrades patchelf \ python3 python3-dev python3-pip \ @@ -34,88 +55,103 @@ RUN apt-get update && \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ build-essential checkinstall \ - libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ + libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools 
libtool ccache && \ apt-get clean -y -COPY tools/manylinux1/build_scripts/* /root/python/ -RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0 - -# # Install Go and glide -# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - # tar -xz -C /usr/local && \ - # mkdir /root/gopath && \ - # mkdir /root/gopath/bin && \ - # mkdir /root/gopath/src -# ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin -# # install glide -# RUN curl -s -q https://glide.sh/get | sh - -# # Install TensorRT -# # following TensorRT.tar.gz is not the default official one, we do two miny changes: -# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, -# # and its size is only one-third of the official one. -# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. -# # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - # tar -xz -C /usr/local && \ - # cp -rf /usr/local/TensorRT/include /usr && \ - # cp -rf /usr/local/TensorRT/lib /usr - -# # git credential to skip password typing -# RUN git config --global credential.helper store - -# # Fix locales to en_US.UTF-8 -# RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter -# # version util jupyter fixes this issue. - -# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# # version(1.7.1 for now), which causes building documentation failed. -# RUN pip3 install -U wheel && \ - # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - # easy_install -U pip && \ - # pip install -U pip setuptools wheel && \ - # pip install -U docopt PyYAML sphinx==1.5.6 && \ - # pip install sphinx-rtd-theme==0.1.9 recommonmark - -# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - # pip3 install opencv-python && \ - # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - # pip install opencv-python - -# #For docstring checker -# RUN pip3 install pylint pytest astroid isort -# RUN pip install pylint pytest astroid isort LinkChecker - -# COPY ./python/requirements.txt /root/ -# RUN pip3 install -r /root/requirements.txt -# RUN pip install -r /root/requirements.txt - -# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -# RUN apt-get install -y libssl-dev libffi-dev -# RUN pip3 install certifi urllib3[secure] -# RUN pip install certifi urllib3[secure] - - -# # Install woboq_codebrowser to /woboq -# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - # (cd /woboq \ - # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - # -DCMAKE_BUILD_TYPE=Release . \ - # make) - -# # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service -# RUN mkdir /var/run/sshd -# RUN echo 'root:root' | chpasswd -# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -# EXPOSE 22 +# Install Go and glide +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# Install TensorRT +# following TensorRT.tar.gz is not the default official one, we do two miny changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ + tar -xz -C /usr/local && \ + cp -rf /usr/local/TensorRT/include /usr && \ + cp -rf /usr/local/TensorRT/lib /usr + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# version util jupyter fixes this issue. + +# specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# version(1.7.1 for now), which causes building documentation failed. 
+RUN pip3.5 install -U wheel && \ + pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 install -U wheel && \ + pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 install -U wheel && \ + pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ + easy_install -U pip && \ + pip install -U pip setuptools wheel && \ + pip install -U docopt PyYAML sphinx==1.5.6 && \ + pip install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.5 install opencv-python && \ + pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 install opencv-python && \ + pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 install opencv-python && \ + pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python + +#For docstring checker +RUN pip3.5 install pylint pytest astroid isort +RUN pip3.6 install pylint pytest astroid isort +RUN pip3.7 install pylint pytest astroid isort +RUN pip install pylint pytest astroid isort LinkChecker + +COPY ./python/requirements.txt /root/ +RUN pip3.5 install -r /root/requirements.txt +RUN pip3.6 install -r /root/requirements.txt +RUN pip3.7 install -r /root/requirements.txt +RUN pip install -r /root/requirements.txt + +# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +RUN apt-get install -y libssl-dev libffi-dev +RUN pip3.5 install certifi urllib3[secure] +RUN pip3.6 install certifi urllib3[secure] +RUN pip3.7 install certifi urllib3[secure] +RUN pip install certifi urllib3[secure] + + +# Install woboq_codebrowser to /woboq +RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ + (cd /woboq \ + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release . \ + make) + +# Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd +RUN echo 'root:root' | chpasswd +RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +EXPOSE 22 From 155a0f78e6f8688a68dae765dee4d25e08bc614b Mon Sep 17 00:00:00 2001 From: Min Date: Sat, 24 Nov 2018 17:17:14 +0800 Subject: [PATCH 28/36] Polish code test=develop --- Dockerfile | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6f45c79f3a..9459552890 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,13 +34,13 @@ RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-a ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null # Install Python3.7 RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null RUN apt-get update && \ @@ -54,8 +54,6 @@ RUN apt-get update && \ automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - build-essential checkinstall \ - libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y @@ -94,9 +92,9 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. 
-RUN pip3.5 install -U wheel && \ - pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 install -U wheel && \ + pip3 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.6 install -U wheel && \ pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ @@ -108,9 +106,9 @@ RUN pip3.5 install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.5 install opencv-python && \ +RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 install opencv-python && \ pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.6 install opencv-python && \ @@ -122,13 +120,13 @@ RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip install opencv-python #For docstring checker -RUN pip3.5 install pylint pytest astroid isort +RUN pip3 install pylint pytest astroid isort RUN pip3.6 install pylint pytest astroid isort RUN pip3.7 install pylint pytest astroid isort RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3.5 install -r /root/requirements.txt +RUN pip3 install -r /root/requirements.txt RUN pip3.6 install -r /root/requirements.txt RUN pip3.7 install -r /root/requirements.txt RUN pip install -r /root/requirements.txt @@ -136,7 +134,7 @@ RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev -RUN pip3.5 install certifi urllib3[secure] +RUN pip3 install certifi urllib3[secure] RUN pip3.6 install certifi urllib3[secure] RUN pip3.7 install certifi urllib3[secure] RUN pip install certifi urllib3[secure] From 05e6a7141717f1eb1e73836c7aec67faea4d4db5 Mon Sep 17 00:00:00 2001 From: Min Date: Sat, 24 Nov 2018 17:19:22 +0800 Subject: [PATCH 29/36] Polish code test=develop --- .dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 49adfe4f0a..2b2e74053d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,5 @@ *.DS_Store build/ -build* *.user .vscode .idea From c1bf9664cdf1187c3e01750ef68cfbbd8d788b7d Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 25 Nov 2018 14:11:23 +0800 Subject: [PATCH 30/36] Add options to disable SO_REUSEPORT of grpc. (#14269) --- .../operators/distributed/grpc_client.cc | 5 +++++ .../operators/distributed/grpc_server.cc | 20 +++++++++++++++++++ .../operators/distributed/sendrecvop_utils.cc | 2 ++ python/paddle/fluid/__init__.py | 1 + 4 files changed, 28 insertions(+) diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index c28f86146d..0bd76b3f6c 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -22,6 +22,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(rpc_disable_reuse_port); + namespace paddle { namespace operators { namespace distributed { @@ -383,6 +385,9 @@ std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { // Channel configurations: grpc::ChannelArguments args; args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); + if (FLAGS_rpc_disable_reuse_port) { + args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); + } args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index ffd2b1707b..77bf67be25 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -20,6 +20,8 @@ limitations under the License. */ using ::grpc::ServerAsyncResponseWriter; +DECLARE_bool(rpc_disable_reuse_port); + namespace paddle { namespace operators { namespace distributed { @@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() { VLOG(40) << "AsyncGRPCServer WaitSeverReady"; } +// Define an option subclass in order to disable SO_REUSEPORT for the +// server socket. +// Come from: +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +class NoReusePortOption : public ::grpc::ServerBuilderOption { + public: + void UpdateArguments(::grpc::ChannelArguments* args) override { + args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); + } + + void UpdatePlugins(std::vector>* + plugins) override {} +}; + void AsyncGRPCServer::StartServer() { ::grpc::ServerBuilder builder; builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), @@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() { builder.SetMaxSendMessageSize(std::numeric_limits::max()); builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + if (FLAGS_rpc_disable_reuse_port) { + builder.SetOption( + std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); + } builder.RegisterService(&service_); for (auto t : rpc_call_map_) { diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 374fa680e3..df5af3476f 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -22,6 +22,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" +DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); + namespace paddle { namespace operators { namespace distributed { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 3c092dee34..d851b9dfaa 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -129,6 +129,7 @@ def __bootstrap__(): read_env_flags.append('rpc_send_thread_num') read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') + read_env_flags.append('rpc_disable_reuse_port') if core.is_compiled_with_cuda(): read_env_flags += [ From a7188d5bc7a8990ce228902e0361ae7880bb1927 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Sun, 25 Nov 2018 17:01:49 +0800 Subject: [PATCH 31/36] fix executor transfer cache bug (#14518) --- paddle/fluid/framework/CMakeLists.txt | 8 ++- paddle/fluid/framework/executor.cc | 1 + paddle/fluid/framework/naive_executor.cc | 1 + paddle/fluid/framework/operator.cc | 49 +++++-------- paddle/fluid/framework/operator.h | 4 ++ .../fluid/framework/transfer_scope_cache.cc | 72 +++++++++++++++++++ paddle/fluid/framework/transfer_scope_cache.h | 41 +++++++++++ 7 files changed, 143 insertions(+), 33 deletions(-) create mode 100644 paddle/fluid/framework/transfer_scope_cache.cc create mode 100644 paddle/fluid/framework/transfer_scope_cache.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 281d073166..83c8478685 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -116,8 +116,14 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) +if (NOT WIN32) +cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler transfer_scope_cache) +else() cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) + shape_inference data_transform lod_tensor) +endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 7ce08b728d..f6c82995e1 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index e8e53f988f..e829563952 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -83,6 +83,7 @@ void NaiveExecutor::Run() { for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() << " on scope " << scope_; + op->SetIsCalledByExecutor(false); op->Run(*scope_, place_); } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 1ec170b6f6..0084573cd0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false, namespace paddle { namespace framework { -// Combine two hash values to a single hash. -inline size_t CombineHash(size_t seed, size_t a) { - return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); -} - std::vector> kKernelPriority = { std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), @@ -797,17 +793,6 @@ void OperatorWithKernel::TransferInplaceVarsBack( Scope* OperatorWithKernel::TryTransferData( const Scope& scope, const OpKernelType& expected_kernel_key, std::vector* transfered_inplace_vars) const { -// In the inference scenerio, the scopes will be reused across the batches, so -// the `new_scope` here will result in GPU memroy explosion over the running of -// operators. -// We use a thread_local cache to fix that issue, the key in the cache is the -// combination of the `scope` argument, from_kernel_type, target_kernel_type. -// Have a discussion with @Superjomn or the inference developers if some changes -// on this logic for this macro might not tested on the other scenerios. -#ifdef PADDLE_ON_INFERENCE - thread_local std::unordered_map infer_transfer_scope_cache; -#endif - Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { for (auto& var_name : var_name_item.second) { @@ -838,23 +823,23 @@ Scope* OperatorWithKernel::TryTransferData( VLOG(30) << "Transform Variable " << var_name << " from " << kernel_type_for_var << " to " << expected_kernel_key; -#ifdef PADDLE_ON_INFERENCE - size_t infer_cache_key = - CombineHash(OpKernelType::Hash()(kernel_type_for_var), - OpKernelType::Hash()(expected_kernel_key)); - infer_cache_key = - CombineHash(infer_cache_key, std::hash()(&scope)); - - auto it = infer_transfer_scope_cache.find(infer_cache_key); - if (it != infer_transfer_scope_cache.end()) { - new_scope = infer_transfer_scope_cache[infer_cache_key]; - } else { - new_scope = &scope.NewScope(); - infer_transfer_scope_cache[infer_cache_key] = new_scope; + // In the inference scenerio, the scopes will be reused across the + // batches, so the `new_scope` here will result in GPU memroy explosion + // over the running of operators. 
+      // We use a thread_local cache to fix that issue, the key in the cache is
+      // the combination of the `scope` argument, from_kernel_type,
+      // target_kernel_type.
+      // Have a discussion with @Superjomn or the inference developers if some
+      // changes on this logic for this macro might not be tested on the other
+      // scenarios.
+      // If this op is not called by an Executor or ParallelExecutor, it should
+      // be called by a NaiveExecutor; the NaiveExecutor caches the scopes and
+      // variables, which behaves quite differently.
+      if (!run_by_executor_) {
+        new_scope = TryCreateTransferScope(kernel_type_for_var,
+                                           expected_kernel_key, &scope);
       }
-#endif
-
-      if (new_scope == nullptr) {
+      if (!new_scope) {
         new_scope = &scope.NewScope();
       }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index ef83833217..bfdfdc56b3 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -127,6 +127,8 @@ class OperatorBase {
   //! Get all outputs variable names
   virtual std::vector<std::string> OutputVars(bool has_intermediate) const;

+  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
+
 protected:
   std::string type_;
   // NOTE: in case of OpGrad, inputs_ contains:
@@ -139,6 +141,8 @@ class OperatorBase {
   // IG (Inputs Gradients)
   VariableNameMap outputs_;
   AttributeMap attrs_;
+  // Whether this operator executes in an Executor.
+  bool run_by_executor_{true};

 private:
  void GenerateTemporaryNames();
diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc
new file mode 100644
index 0000000000..e52a8317e2
--- /dev/null
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/transfer_scope_cache.h"
+
+namespace paddle {
+namespace framework {
+
+std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
+  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
+  return *x;
+}
+
+std::unordered_set<Scope*>& global_transfer_scope_cache() {
+  thread_local auto* x = new std::unordered_set<Scope*>;
+  return *x;
+}
+
+Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
+                              const Scope* scope) {
+  Scope* new_scope{nullptr};
+  size_t infer_cache_key =
+      CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
+  infer_cache_key =
+      CombineHash(infer_cache_key, std::hash<const Scope*>()(scope));
+
+  auto it = global_transfer_data_cache().find(infer_cache_key);
+  if (it != global_transfer_data_cache().end()) {
+    new_scope = global_transfer_data_cache()[infer_cache_key];
+  } else {
+    new_scope = &scope->NewScope();
+    global_transfer_data_cache()[infer_cache_key] = new_scope;
+  }
+  global_transfer_scope_cache().insert(new_scope);
+  return new_scope;
+}
+
+void RemoveKidsFromTransferScopeCache(Scope* scope) {
+  auto it = global_transfer_scope_cache().find(scope);
+  if (it != global_transfer_scope_cache().end()) {
+    global_transfer_scope_cache().erase(it);
+  }
+  for (auto* s : scope->kids()) {
+    auto it = global_transfer_scope_cache().find(s);
+    if (it != global_transfer_scope_cache().end()) {
+      global_transfer_scope_cache().erase(it);
+    }
+  }
+
+  // remove global transfer data cache
+  auto& cache = global_transfer_data_cache();
+  for (auto it = cache.begin(); it != cache.end();) {
+    if (it->second == scope)
+      it = cache.erase(it);
+    else
+      it++;
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/transfer_scope_cache.h b/paddle/fluid/framework/transfer_scope_cache.h
new file mode 100644
index 0000000000..86fc0bf529
--- /dev/null
+++ b/paddle/fluid/framework/transfer_scope_cache.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <mutex>  // NOLINT
+#include <unordered_map>
+#include <unordered_set>
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+std::unordered_map<size_t, Scope*>& global_transfer_data_cache();
+
+std::unordered_set<Scope*>& global_transfer_scope_cache();
+
+// Combine two hash values to a single hash.
+static size_t CombineHash(size_t seed, size_t a) { + return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, + const Scope* scope); + +void RemoveKidsFromTransferScopeCache(Scope* scope); + +} // namespace framework +} // namespace paddle From c52f65e0711cf5bb061df9743a631e9e63694c68 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Sun, 25 Nov 2018 17:33:31 +0800 Subject: [PATCH 32/36] Fix the random fail in test_image_classification.py (#14551) test=develop --- python/paddle/fluid/tests/book/test_image_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index cba486cf59..c91bd27895 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None): assert len(results[0]) == len(transpiler_results[0]) for i in range(len(results[0])): np.testing.assert_almost_equal( - results[0][i], transpiler_results[0][i], decimal=5) + results[0][i], transpiler_results[0][i], decimal=4) print("infer results: ", results[0]) From 923c8e33325690d304cf6f0e13f29a4ea1611544 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Sun, 25 Nov 2018 22:40:24 +0800 Subject: [PATCH 33/36] add benchmark for inference (#14571) --- paddle/fluid/inference/CMakeLists.txt | 1 + paddle/fluid/inference/utils/CMakeLists.txt | 2 + paddle/fluid/inference/utils/benchmark.cc | 49 +++++++++++++++++ paddle/fluid/inference/utils/benchmark.h | 52 +++++++++++++++++++ .../fluid/inference/utils/benchmark_tester.cc | 39 ++++++++++++++ 5 files changed, 143 insertions(+) create mode 100644 paddle/fluid/inference/utils/CMakeLists.txt create mode 100644 paddle/fluid/inference/utils/benchmark.cc create mode 100644 paddle/fluid/inference/utils/benchmark.h create mode 100644 paddle/fluid/inference/utils/benchmark_tester.cc diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 2c5364b724..058a5b5f46 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -4,6 +4,7 @@ endif() # analysis and tensorrt must be added before creating static library, # otherwise, there would be undefined reference to them in static library. add_subdirectory(analysis) +add_subdirectory(utils) if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt new file mode 100644 index 0000000000..2104e4ac72 --- /dev/null +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(benchmark SRCS benchmark.cc DEPS enforce) +cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc new file mode 100644 index 0000000000..021edc2de5 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/utils/benchmark.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +std::string Benchmark::SerializeToString() const { + std::stringstream ss; + ss << "-----------------------------------------------------\n"; + ss << "name\t"; + ss << "batch_size\t"; + ss << "num_threads\t"; + ss << "latency\t"; + ss << "qps"; + ss << '\n'; + + ss << name_ << "\t"; + ss << batch_size_ << "\t"; + ss << num_threads_ << "\t"; + ss << latency_ << "\t"; + ss << 1000 / latency_; + ss << '\n'; + return ss.str(); +} +void Benchmark::PersistToFile(const std::string &path) const { + std::ofstream file(path, std::ios::app); + PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path); + file << SerializeToString(); + file.flush(); + file.close(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h new file mode 100644 index 0000000000..80e8f77adb --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace paddle { +namespace inference { + +/* + * Helper class to calculate the performance. + */ +struct Benchmark { + int batch_size() const { return batch_size_; } + void SetBatchSize(int x) { batch_size_ = x; } + + int num_threads() const { return num_threads_; } + void SetNumThreads(int x) { num_threads_ = x; } + + bool use_gpu() const { return use_gpu_; } + void SetUseGpu() { use_gpu_ = true; } + + int latency() const { return latency_; } + void SetLatency(int x) { latency_ = x; } + + const std::string& name() const { return name_; } + void SetName(const std::string& name) { name_ = name; } + + std::string SerializeToString() const; + void PersistToFile(const std::string& path) const; + + private: + bool use_gpu_{false}; + int batch_size_{0}; + int latency_; + int num_threads_{1}; + std::string name_; +}; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc new file mode 100644 index 0000000000..eb25547408 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/utils/benchmark.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+using namespace paddle::inference;
+TEST(Benchmark, basic) {
+  Benchmark benchmark;
+  benchmark.SetName("key0");
+  benchmark.SetBatchSize(10);
+  benchmark.SetUseGpu();
+  benchmark.SetLatency(220);
+  LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString();
+}
+
+TEST(Benchmark, PersistToFile) {
+  Benchmark benchmark;
+  benchmark.SetName("key0");
+  benchmark.SetBatchSize(10);
+  benchmark.SetUseGpu();
+  benchmark.SetLatency(220);
+
+  benchmark.PersistToFile("1.log");
+  benchmark.PersistToFile("1.log");
+  benchmark.PersistToFile("1.log");
+}
\ No newline at end of file
From 840c1b29adb1168f8ff84d1151462b0d171c741b Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Mon, 26 Nov 2018 10:30:25 +0800
Subject: [PATCH 34/36] test=develop (#14562)

* test=develop remove code.
* test=develop
---
 paddle/fluid/inference/api/demo_ci/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 49683eab07..8fb464c0f5 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -46,8 +46,6 @@ if(WITH_GPU)
     endif()
   endif(NOT WIN32)
 endif()
-
-include_directories("D:/Paddle/")
 include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
From bf222f197d0ac5f7416d5e70d5ddca1009b96559 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Mon, 26 Nov 2018 11:21:26 +0800
Subject: [PATCH 35/36] Use sub scope in tensor_array_to_tensor op. (#14524)

test=develop
---
 paddle/fluid/framework/executor.cc             |  4 +-
 .../fluid/inference/api/analysis_predictor.cc  |  1 -
 paddle/fluid/inference/api/api_impl.cc         |  1 -
 .../fluid/inference/tests/api/CMakeLists.txt   | 50 +++++++++----------
 .../operators/tensor_array_to_tensor_op.cc     | 10 ++--
 5 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index f6c82995e1..3dc571d757 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -392,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector<Tensor>> gc;
-  // WhileOp would set keep_kids to false
-  // WhileGradOp would need the scopes created in WhileOp
+  // WhileOp would set keep_kids to true,
+  // because WhileGradOp needs the scopes created in WhileOp.
   // Perhaps, we should not perform eager deletion in WhileOp
   // The scopes and variables created by WhileOp would be deleted
   // in WhileGradOp.
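The changes in this patch all follow the same scope-lifetime rule: an operator that needs scratch variables should create them in a child scope rather than const_cast the scope it was handed, so the temporaries can be dropped as a unit without mutating the caller's state. The standalone sketch below illustrates the idea with a toy scope tree; ToyScope and its NewScope/Var/DropKids methods are simplified stand-ins invented for this example, not Paddle's actual framework::Scope API.

// Minimal stand-in for a scope hierarchy; names and structure are
// illustrative only and do not match paddle::framework::Scope.
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct ToyScope {
  // Create a child scope owned by this scope.
  ToyScope* NewScope() {
    kids_.emplace_back(new ToyScope);
    return kids_.back().get();
  }
  // Create (or fetch) a named variable in this scope.
  int* Var(const std::string& name) { return &vars_[name]; }
  size_t NumVars() const { return vars_.size(); }
  // Drop all children and the temporaries they own.
  void DropKids() { kids_.clear(); }

 private:
  std::map<std::string, int> vars_;
  std::vector<std::unique_ptr<ToyScope>> kids_;
};

// An "operator" that needs a scratch variable. Creating it in a child
// scope keeps the parent scope's variable map unchanged, with no
// const_cast needed on the parent.
void RunToyOp(ToyScope* parent) {
  ToyScope* local = parent->NewScope();
  int* tmp = local->Var("out_index");  // temporary lives in the child
  *tmp = 42;
  parent->DropKids();  // scratch state is released as one unit
}

int main() {
  ToyScope root;
  RunToyOp(&root);
  std::cout << "vars in root after op: " << root.NumVars() << "\n";  // 0
}

In the operator diff further down, the local_scope created by scope.NewScope() plays the role of `local` here: the out_index temporary and the inner concat op both run against it instead of the shared inference scope.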
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index c132ce326c..d111c0cfd9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -174,7 +174,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   inference::Timer timer;
   timer.tic();
   // set feed variable
-  std::vector<framework::LoDTensor> feeds;
   framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 66a8e51396..ff375c73a7 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -138,7 +138,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   Timer timer;
   timer.tic();
   // set feed variable
-  std::vector<framework::LoDTensor> feeds;
   framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index e8bd13037e..7dc88d9dd0 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/"  "inference-vis-demos%2Focr.tar.gz")
+  inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
@@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
-    # anakin rnn1
-    set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
-    set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
-    inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
-    inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
-    cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
-            ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
-            --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
-            DEPS inference_anakin_api_shared SERIAL)
-    # anakin mobilenet
-    if(WITH_GPU)
-        set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
-        inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
-        cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
-                ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
-                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
-    endif()
+  # anakin rnn1
+  set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
+  set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
+  inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
+  inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
+  cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
+          ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
+          --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
+          DEPS inference_anakin_api_shared SERIAL)
+  # anakin mobilenet
+  if(WITH_GPU)
+    set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
+    inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
+    cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
+            ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
+            DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+  endif()
 endif()
 
 if(WITH_GPU AND TENSORRT_FOUND)
-    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
-    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
-        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
-    endif()
-    inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+  set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+  if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+    inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+  endif()
+  inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
+      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+      ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
index 96dc123f6a..58a74ec2c1 100644
--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
     out_inx_dim[0] = inx.size();
     out_inx.Resize(out_inx_dim);
 
+    auto &local_scope = scope.NewScope();
     std::string var_name = "out_index";
-    framework::Variable *tmp_index_var =
-        const_cast<framework::Scope &>(scope).Var(var_name);
+    framework::Variable *tmp_index_var = local_scope.Var(var_name);
     auto &tmp_index_tensor = *(tmp_index_var->GetMutable<framework::LoDTensor>());
     tmp_index_tensor.Resize(out_inx_dim);
@@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
     out_dims[axis] = out_dim_sum;
     out.Resize(out_dims);
 
-    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
-    // Invoke Reshape Op
+    LodTensorArray2LodTensorVector(local_scope, base_name, Input("X"), &names);
+    // Invoke concat Op
     auto concat_op = framework::OpRegistry::CreateOp(
         "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
 
-    concat_op->Run(scope, place);
+    concat_op->Run(local_scope, place);
   }
 };
From 39ec80def42ef48112ee0025e2a318138f1992b7 Mon Sep 17 00:00:00 2001
From: qingqing01
Date: Mon, 26 Nov 2018 12:49:17 +0800
Subject: [PATCH 36/36] Remove the memory copy of feeding data in C++ inference API (#14577)

* Remove the memory copy for feeding data in C++ inference API
* Fix compiling dependence
* Fix compiling in ONLY_CPU mode
---
 paddle/fluid/inference/api/CMakeLists.txt      |  4 +++-
 .../fluid/inference/api/analysis_predictor.cc  | 23 ++++++++++++++----
 paddle/fluid/inference/api/api_impl.cc         | 24 +++++++++++++++----
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index e9969b84f3..eda251c534 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
+    lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
+    analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
 
 cc_test(test_paddle_inference_api
     SRCS api_tester.cc
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d111c0cfd9..72ac534384 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -31,6 +31,7 @@
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #endif
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -214,17 +215,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }
 
-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
 
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index ff375c73a7..0f88ad14b0 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -193,17 +194,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }
 
-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
+
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
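Both SetFeed changes above implement one pattern: allocate the feed tensor's buffer directly on the destination place, then pick the copy primitive by inspecting that place, so each input moves exactly once instead of being staged on the CPU first. The standalone sketch below models only that dispatch; Place, DeviceBuffer, CopyToDevice, and FeedInput are hypothetical stand-ins written for this example, not Paddle APIs (the real code calls LoDTensor::mutable_data and memory::Copy on stream 0).

// Illustrative model of the feed copy dispatch; every type here is a
// made-up stand-in, not Paddle's framework or memory API.
#include <cstring>
#include <iostream>
#include <vector>

enum class Place { kCPU, kGPU };

// Pretend device allocation: for the sketch both places live in host memory.
struct DeviceBuffer {
  explicit DeviceBuffer(size_t n) : bytes(n) {}
  std::vector<unsigned char> bytes;
};

// Stand-in for a synchronous host-to-device copy (memory::Copy in the patch).
void CopyToDevice(DeviceBuffer* dst, const void* src, size_t n) {
  std::memcpy(dst->bytes.data(), src, n);
}

// One copy either way: the buffer already lives on the target place,
// so no intermediate CPU staging tensor is needed.
void FeedInput(Place place, DeviceBuffer* dst, const void* src, size_t n) {
  if (place == Place::kCPU) {
    std::memcpy(dst->bytes.data(), src, n);  // plain host copy
  } else {
    CopyToDevice(dst, src, n);  // sync H2D copy (stream 0 in the patch)
  }
}

int main() {
  const float feed[4] = {1.f, 2.f, 3.f, 4.f};
  DeviceBuffer buf(sizeof(feed));
  FeedInput(Place::kGPU, &buf, feed, sizeof(feed));
  std::cout << "copied " << buf.bytes.size() << " bytes\n";
}

With this structure the CPU path stays a single std::memcpy, and supporting another device type means adding one branch rather than a second staging buffer.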