Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into crop_op

Conflicts:
	paddle/pybind/pybind.cc
	python/paddle/v2/framework/tests/op_test_util.py
update-doc-pybind
wanghaoshuang 8 years ago
commit b299d07fbe

@ -67,6 +67,9 @@ endif()
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# TODO: support glog for Android api 16 ~ 19 in the future
message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
endif()
set(WITH_GPU OFF CACHE STRING

@ -6,13 +6,14 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
# ENV variables
ARG ANDROID_ABI
ARG ANDROID_API
ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV ANDROID_API=${ANDROID_API:-21}
ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
ANDROID_TOOLCHAINS_DIR=/opt/toolchains
RUN apt-get update && \
apt-get install -y \
@ -42,14 +43,12 @@ RUN pip install --upgrade pip && \
pip install pre-commit
# Android NDK
RUN mkdir /opt/android-ndk-tmp && \
RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
mkdir -p /opt/android-ndk-tmp && \
cd /opt/android-ndk-tmp && \
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
rm -rf /opt/android-ndk-tmp && \
rm -rf ${ANDROID_NDK_HOME}
rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]

@ -18,9 +18,9 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
IF(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
ELSE(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
@ -56,3 +56,12 @@ SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
IF(ANDROID)
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
ENDIF()
ENDIF()

@ -19,9 +19,9 @@ SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
IF(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
ELSE(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
@ -56,3 +56,12 @@ ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
IF(ANDROID)
INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib)
ENDIF()
ENDIF()

@ -73,6 +73,26 @@ IF(NOT ${CBLAS_FOUND})
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
IF(WITH_C_API)
INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
# Because libopenblas.a is a symbolic link of another library, thus need to
# install the whole directory.
IF(ANDROID)
SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI})
ELSE()
SET(TMP_INSTALL_DIR third_party/openblas/lib)
ENDIF()
INSTALL(CODE "execute_process(
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
destination ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
)"
)
INSTALL(CODE "MESSAGE(STATUS \"Installing: \"
\"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
)"
)
ENDIF()
ENDIF(NOT ${CBLAS_FOUND})
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")

@ -223,6 +223,15 @@ IF(NOT PROTOBUF_FOUND)
SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
CACHE FILEPATH "protoc library." FORCE)
IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID)
INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
ENDIF()
ENDIF()
IF(CMAKE_CROSSCOMPILING)
PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
ELSE()

@ -49,3 +49,12 @@ ExternalProject_Add(
)
LIST(APPEND external_project_dependencies zlib)
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
IF(ANDROID)
INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib)
ENDIF()
ENDIF()

@ -262,7 +262,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
- 生成库
无需修改 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动添加链接到生成的lib库中。
`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动添加链接到生成的lib库中。
## 实现单元测试
@ -354,11 +354,7 @@ class TestMulGradOp(GradientChecker):
### 编译和执行单元测试
单元测试编写完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)中添加以下内容,将单元测试加入工程:
```
py_test(test_mul_op SRCS test_mul_op.py)
```
`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
请注意,**不同于Op的编译测试运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试:

@ -5,15 +5,13 @@
PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc````doc_cn`` 两个子目录下。
如何构建PaddlePaddle的文档
==========================
如何构建文档
============
PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式我们提供了一个构建脚本build_docs.sh来进行构建。
PaddlePaddle文档需要准备的环境相对较复杂所以我们推荐使用基于Docker来构建PaddlePaddle的文档。
PaddlePaddle的文档构建有两种方式。
使用Docker构建PaddlePaddle的文档
--------------------------------
使用Docker构建
--------------
使用Docker构建PaddlePaddle的文档需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档
@ -21,58 +19,46 @@ PaddlePaddle文档需要准备的环境相对较复杂所以我们推荐使
cd TO_YOUR_PADDLE_CLONE_PATH
cd paddle/scripts/tools/build_docs
bash build_docs.sh with_docker
编译完成后,会在当前目录生成两个子目录\:
* doc 英文文档目录
* doc_cn 中文文档目录
sh build_docs.sh
编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
打开浏览器访问对应目录下的index.html即可访问本地文档。
直接构建PaddlePaddle的文档
--------------------------
因为PaddlePaddle的v2 api文档生成过程依赖于py_paddle Python包用户需要首先确认py_paddle包已经安装。
.. code-block:: bash
python -c "import py_paddle"
如果提示错误那么用户需要在本地编译安装PaddlePaddle请参考 `源码编译文档 <http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_
注意用户在首次编译安装PaddlePaddle时请将WITH_DOC选项关闭。在编译安装正确之后请再次确认py_paddle包已经安装即可进行下一步操作。
直接构建
--------
如果提示正确,可以执行以下命令编译生成文档,即
.. code-block:: bash
cd TO_YOUR_PADDLE_CLONE_PATH
cd paddle/scripts/tools/build_docs
bash build_docs.sh local
编译完成之后,会在当前目录生成两个子目录\:
* doc 英文文档目录
* doc_cn 中文文档目录
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
make gen_proto_py
make paddle_docs paddle_docs_cn
编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
打开浏览器访问对应目录下的index.html即可访问本地文档。
如何书写PaddlePaddle的文档
==========================
如何书写文档
============
PaddlePaddle文档使用 `sphinx`_ 自动生成用户可以参考sphinx教程进行书写。
如何更新www.paddlepaddle.org文档
================================
如何更新文档主题
================
PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下,包含所有和前端网页设计相关的文件。
开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中提交方式可参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_
如何更新doc.paddlepaddle.org
============================
更新的文档以PR的形式提交到github中提交方式参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_
目前PaddlePaddle的develop分支的文档是自动触发更新的用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_
`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_
.. _cmake: https://cmake.org/
.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/

@ -64,9 +64,29 @@ link_paddle_exe(paddle_capi_shared)
install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
if(ANDROID)
execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
OUTPUT_VARIABLE GIT_COMMITS_LIST
RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${GIT_COMMITS_LIST_RESULT})
set(GIT_COMMITS_LIST "No commits.")
endif()
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library}
DESTINATION lib/${ANDROID_ABI})
install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI})
install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
\"Compiler:\n\"
\"\\t${CMAKE_C_COMPILER}\\n\"
\"\\t${CMAKE_CXX_COMPILER}\\n\"
\"Compiler Flags:\\n\"
\"\\t${CMAKE_F_FLAGS}\\n\"
\"\\t${CMAKE_CXX_FLAGS}\\n\"
\"Android API: ${CMAKE_SYSTEM_VERSION}\\n\"
\"Lastest commit:\\n\"
\"\\t${GIT_COMMITS_LIST}\\n\"
)"
)
else(ANDROID)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
install(TARGETS paddle_capi_shared DESTINATION lib)

@ -9,6 +9,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_test(variable_test SRCS variable_test.cc)

@ -2,11 +2,22 @@
## Motivation
In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the gradient operators/expressions together with the chain rule. Every forward network needs a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass.
In Neural Network, many model is solved by the the backpropagation algorithm(known as BP) at present. Technically it caculates the gradient of the loss function, then distributed back through the networks. Follows the chain rule, so we need a module chains the gradient operators/expressions together with to construct the backward pass. Every forward network needs a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass.
## Backward Operator Registry
## Implementation
A backward network is built up with several backward operators. Backward operators take forward operators' inputs outputs, and output gradients and then calculate its input gradients.
In this design doc, we exported only one API for generating the backward pass.
```c++
std::unique_ptr<OperatorBase> Backward(const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars);
```
The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**.
### Backward Operator Registry
A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients.
| | forward operator | backward operator
| ---------------------- | ---------------- |------------------------- |
@ -25,7 +36,7 @@ REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
## Backward Opeartor Creating
### Backward Opeartor Creating
Given a certain forward operator, we can get its corresponding backward operator by calling:
@ -43,40 +54,47 @@ The function `BuildGradOp` will sequentially execute following processes:
4. Building backward operator with `inputs`, `outputs` and forward operator's attributes.
## Backward Network Building
A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and put them together.
### Backward Network Building
In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network.
given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`, `InputGradients`.
A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and append them together one by one. There is some corner case need to process specially.
1. Op
when the input forward network is an Op, return its gradient Operator Immediately.
When the input forward network is an Op, return its gradient Operator Immediately. If all of its outputs are in no gradient set, then return a special `NOP`.
2. NetOp
when the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp.
In our design, the network itself is also a kind of operator(**NetOp**). So the operators contained by a big network may be some small network. When the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp.
3. RnnOp
RnnOp is a nested stepnet operator. Backward module need to recusively call `Backward` for every stepnet.
4. Sharing Variables
**sharing variables**. As illustrated in the pictures, two operator's share the same variable name of W@GRAD, which will overwrite their sharing input variable.
<p align="center">
<img src="./images/duplicate_op.png" width="50%" ><br/>
**shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwrite their shared input variable.
pic 1. Sharing variables in operators.
<p align="center">
<img src="./images/duplicate_op.png" width="50%" ><br/>
</p>
1. Shared variable in operators.
Sharing variable between operators or same input variable used in multiple operators leads to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively and add a generic add operator to replace the overwrite links.
</p>
<p align="center">
<img src="images/duplicate_op2.png" width="40%" ><br/>
Share variable between operators or same input variable used in multiple operators leads to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively and add a generic add operator replace the overwrite links.
pic 2. Replace sharing variable's gradient with `Add` operator.
<p align="center">
<img src="images/duplicate_op2.png" width="50%" ><br/>
</p>
2. Replace shared variable's gradient with `Add` operator.
Because our framework finds variables accord to their names, we need to rename the output links. We add a suffix of number to represent its position in clockwise.
</p>
5. Part of Gradient is Zero.
In the whole graph, there is some case of that one operator's gradient is not needed, but its input's gradient is a dependency link of other operator, we need to fill a same shape gradient matrix in the position. In our implement, we insert a special `fillZeroLike` operator.
Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
Follow these rules above, then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 24 KiB

@ -18,8 +18,10 @@
#ifndef PADDLE_ONLY_CPU
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif
#include <glog/logging.h>
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/enforce.h"
@ -32,7 +34,8 @@ template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<T>;
using Vector = thrust::host_vector<
T, thrust::system::cuda::experimental::pinned_allocator<T>>;
#endif
using LoD = std::vector<Vector<size_t>>;

@ -0,0 +1,52 @@
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h"
#include <gtest/gtest.h>
__global__ void test(size_t* a, int size) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
i += blockDim.x * gridDim.x) {
a[i] *= 2;
}
}
TEST(LoDTensor, LoDInGPU) {
paddle::framework::Tensor tensor;
paddle::framework::LoDTensor lod_tensor;
paddle::platform::GPUPlace place(0);
paddle::framework::LoD src_lod;
src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
tensor.Resize({14, 16});
tensor.mutable_data<float>(place);
lod_tensor.set_lod(src_lod);
lod_tensor.set_tensor(&tensor);
CHECK_EQ(lod_tensor.lod_element(0, 2), 4);
CHECK_EQ(lod_tensor.lod_element(0, 4), 8);
auto lod = lod_tensor.lod();
test<<<1, 8>>>(lod[0].data(), lod[0].size());
cudaDeviceSynchronize();
for (size_t i = 0; i < src_lod[0].size(); ++i) {
CHECK_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
}
}

@ -81,6 +81,9 @@ class Tensor {
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
/*! Return the numel of the memory block. */
inline int64_t numel() const;
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
@ -162,6 +165,12 @@ class Tensor {
/*! points to dimensions of memory block. */
DDim dims_;
/**
* A cache of the number of elements in a tensor.
* Would be 0 for an uninitialized tensor.
*/
int64_t numel_;
/**
* @brief A PlaceHolder may be shared by more than one tensor.
*

@ -24,7 +24,7 @@ inline void Tensor::check_memory_size() const {
PADDLE_ENFORCE_NOT_NULL(
holder_, "Tenosr holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE_GE(
holder_->size(), product(dims_) * sizeof(T) + offset_,
holder_->size(), numel() * sizeof(T) + offset_,
"Tensor's dims_ is out of bound. Call Tensor::mutable_data "
"first to re-allocate memory.\n"
"or maybe the required data-type mismatches the data already stored.");
@ -54,11 +54,11 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
template <typename T>
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
PADDLE_ENFORCE_GT(product(dims_), 0,
PADDLE_ENFORCE_GT(numel(), 0,
"Tensor's numel must be larger than zero to call "
"Tensor::mutable_data. Call Tensor::set_dim first.");
/* some versions of boost::variant don't have operator!= */
int64_t size = product(dims_) * sizeof(T);
int64_t size = numel() * sizeof(T);
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
@ -97,7 +97,7 @@ inline void Tensor::CopyFrom(const Tensor& src,
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
auto size = product(src.dims_) * sizeof(T);
auto size = src.numel() * sizeof(T);
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@ -131,7 +131,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
PADDLE_ENFORCE_LT(begin_idx, end_idx,
"Begin index must be less than end index.");
PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1.");
size_t base = product(dims_) / dims_[0];
size_t base = numel() / dims_[0];
Tensor dst;
dst.holder_ = holder_;
DDim dst_dims = dims_;
@ -143,11 +143,14 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
inline Tensor& Tensor::Resize(const DDim& dims) {
dims_ = dims;
numel_ = product(dims_);
return *this;
}
inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return numel_; }
template <typename T>
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;

@ -594,7 +594,7 @@ struct StridePadding {
float32x4_t s1 = vdupq_n_f32(0.f);
for (int s = 0; s < step; s++) {
float32x4_t s0 = vld1q_f32(input);
float32x4x2_t v = {s0, s1};
float32x4x2_t v = {{s0, s1}};
vst2q_f32(inputPadding, v);
input += 4;
inputPadding += 8;

@ -53,27 +53,27 @@ bool DeConv3DLayer::init(const LayerMap &layerMap,
size_t DeConv3DLayer::getSize() {
CHECK_NE(inputLayers_.size(), 0UL);
outputH_.clear();
outputW_.clear();
outputD_.clear();
imgSizeW_.clear();
imgSizeH_.clear();
imgSizeD_.clear();
N_.clear();
NOut_.clear();
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); ++i) {
outputW_.push_back(
imageSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
outputH_.push_back(imageSize(
imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
outputD_.push_back(imageSize(
imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
NOut_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
N_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
imgSizeW_.push_back(
imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true));
imgSizeH_.push_back(imageSize(
outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
imgSizeD_.push_back(imageSize(
outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
layerSize += NOut_[i] * numFilters_;
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
getOutput().setFrameDepth(outputD_[0]);
getOutput().setFrameHeight(imgSizeH_[0]);
getOutput().setFrameWidth(imgSizeW_[0]);
getOutput().setFrameDepth(imgSizeD_[0]);
return layerSize;
}
@ -103,9 +103,9 @@ void DeConv3DLayer::forward(PassType passType) {
}
colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
numFilters_,
outputD_[i],
outputH_[i],
outputW_[i],
imgSizeD_[i],
imgSizeH_[i],
imgSizeW_[i],
filterSizeZ_[i],
filterSizeY_[i],
filterSize_[i],
@ -144,9 +144,9 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) {
colBuf_->vol2Col(
getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
numFilters_,
outputD_[i],
outputH_[i],
outputW_[i],
imgSizeD_[i],
imgSizeH_[i],
imgSizeW_[i],
filterSizeZ_[i],
filterSizeY_[i],
filterSize_[i],

@ -49,6 +49,12 @@ struct LayerState {
};
typedef std::shared_ptr<LayerState> LayerStatePtr;
/// Paddle device ID, MKLDNN is -2, CPU is -1
enum PADDLE_DEVICE_ID {
MKLDNN_DEVICE = -2,
CPU_DEVICE = -1,
};
/**
* @brief Base class for layer.
* Define necessary variables and functions for every layer.
@ -59,11 +65,6 @@ protected:
LayerConfig config_;
/// whether to use GPU
bool useGpu_;
/// Paddle device ID, MKLDNN is -2, CPU is -1
enum PADDLE_DEVICE_ID {
MKLDNN_DEVICE = -2,
CPU_DEVICE = -1,
};
/// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
int deviceId_;
/// Input layers

File diff suppressed because it is too large Load Diff

@ -45,35 +45,28 @@ public:
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void convertWeightsFromPaddle() override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void convertWeightsToPaddle() override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void forward(PassType passType) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void backward(const UpdateCallback& callback) override;
void updateInputData() override;
protected:
/**
* reshape the input image sizes
* and reset output buffer size
* and reset mkldnn forward
*/
void reshape();
/**
* reset the forward primitve and memory
* only would be called when input size changes
*/
void resetFwd();
/**
* reset the backward primitve and memory for mkldnn fc
* only would be called when needed
*/
void resetBwd();
void convertOutputToOtherDevice() override;
void updateWeights(const UpdateCallback& callback) override;
void convertWeightsFromPaddle() override;
void convertWeightsToPaddle() override;
};
} // namespace paddle

File diff suppressed because it is too large Load Diff

@ -83,8 +83,7 @@ void SwitchOrderLayer::forward(PassType passType) {
setOutDims();
resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
if (heightAxis_.size() > 0) {
getOutputValue()->reshape(reshapeHeight_, reshapeWidth_);
getOutputGrad()->reshape(reshapeHeight_, reshapeWidth_);
resetOutput(reshapeHeight_, reshapeWidth_);
}
// switch NCHW to NHWC

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save