gpu support, fix build issue:

1. Non utf-8 characters within comments of OPs may lead to protobuf fail to parse_from_string
2. comment out some ops which not supported on windows
3. cuda libs may not be correctly linked to target on windows
panyx0718-patch-1
peizhilin 6 years ago
parent 71d7980f69
commit 1f12ba6192

@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO) if(NOT WITH_DSO)
# TODO(panyx0718): CUPTI only allows DSO? # TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32)
endif(NOT WITH_DSO) endif(NOT WITH_DSO)
# setting nvcc arch flags # setting nvcc arch flags

@ -13,10 +13,14 @@ cc_library(paddle_fluid_api
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
# paddle_fluid_origin exclude inference api interface # paddle_fluid_origin exclude inference api interface
if(WIN32) if(WIN32)
sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32) else(WIN32)
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
endif(WIN32) endif(WIN32)
@ -36,6 +40,9 @@ endif()
# Create static library # Create static library
if(WIN32) if(WIN32)
sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32) else(WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
endif(WIN32) endif(WIN32)
@ -50,6 +57,9 @@ endif()
if(WIN32) if(WIN32)
sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32) else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)

@ -74,7 +74,7 @@ PadConstantLikeOp Operator.
Pad input(Y) with a pad_value, the number of values padded to the edges of each Pad input(Y) with a pad_value, the number of values padded to the edges of each
axis is specified by the difference of the shape of X and Y. axis is specified by the difference of the shape of X and Y.
((0, shape_x_0 - shape_y_0), (0, shape_x_n - shape_y_n)) unique pad widths for ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for
each axis. each axis.
The input should be a k-D tensor(k > 0 and k < 7). As an example: The input should be a k-D tensor(k > 0 and k < 7). As an example:

@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor), " "(Tensor), "
"Argmaxes corresponding to indices in X used " "Argmaxes corresponding to indices in X used "
"for gradient computation. Only output " "for gradient computation. Only output "
"if arg “is_test” is false.") "if arg \"is_test\" is false.")
.AsIntermediate(); .AsIntermediate();
AddAttr<float>("spatial_scale", AddAttr<float>("spatial_scale",
"(float, default 1.0), " "(float, default 1.0), "

@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
$(N, C_{out}, H_{out}, W_{out})$, where $(N, C_{out}, H_{out}, W_{out})$, where
$$ $$
H_{out} = (H_{in}1) * strides[0] 2 * paddings[0] + ksize[0] \\ H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\
W_{out} = (W_{in}1) * strides[1] 2 * paddings[1] + ksize[1] W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1]
$$ $$
Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
)DOC"); )DOC");

@ -22,6 +22,10 @@ if(WITH_PYTHON)
endif(WITH_AMD_GPU) endif(WITH_AMD_GPU)
if(WIN32) if(WIN32)
if(WITH_GPU AND NOT WITH_DSO)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
target_link_libraries(paddle_pybind ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_pybind shlwapi) target_link_libraries(paddle_pybind shlwapi)
endif(WIN32) endif(WIN32)

@ -61,12 +61,13 @@ IF(WIN32)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
# COMMAND ${CMAKE_COMMAND} -E touch stub.cc # COMMAND ${CMAKE_COMMAND} -E touch stub.cc
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/libs
COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle
COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python # COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/libs
COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/lib-python # COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/libs
DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
ELSE(WIN32) ELSE(WIN32)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp

@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
# import all class inside framework into fluid module # import all class inside framework into fluid module
from . import framework from . import framework
from .framework import * from .framework import *
@ -43,16 +44,17 @@ from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip from . import clip
from . import profiler from . import profiler
from . import unique_name from . import unique_name
from . import recordio_writer if os.name != 'nt':
from . import parallel_executor from . import recordio_writer
from .parallel_executor import * from . import parallel_executor
from .parallel_executor import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + \ __all__ = framework.__all__ + executor.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + [ lod_tensor.__all__ + [
'io', 'io',
'initializer', 'initializer',
'layers', 'layers',
@ -78,7 +80,8 @@ __all__ = framework.__all__ + executor.__all__ + \
'recordio_writer', 'recordio_writer',
'Scope', 'Scope',
] ]
if os.name != 'nt':
__all__ += parallel_executor.__all__
def __bootstrap__(): def __bootstrap__():
""" """
@ -110,12 +113,16 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = [ read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'use_pinned_memory', 'check_nan_inf', 'benchmark',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb', 'dist_threadpool_size', 'eager_delete_tensor_gb',
'reader_queue_speed_test_mode' 'reader_queue_speed_test_mode'
] ]
if os.name != 'nt':
read_env_flags.append('warpctc_dir')
read_env_flags.append('cpu_deterministic')
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_server_profile_period') read_env_flags.append('rpc_server_profile_period')

@ -15,13 +15,15 @@
from __future__ import print_function from __future__ import print_function
import contextlib import contextlib
import os
from .. import core from .. import core
from .. import executor from .. import executor
from .. import framework from .. import framework
from .. import io from .. import io
from .. import parallel_executor if os.name != 'nt':
from .. import parallel_executor
from .. import unique_name from .. import unique_name
from .trainer import check_and_get_place from .trainer import check_and_get_place

@ -28,7 +28,8 @@ from .. import framework
from .. import io from .. import io
# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
from .. import optimizer as opt_module from .. import optimizer as opt_module
from .. import parallel_executor if os.name != 'nt':
from .. import parallel_executor
from ..transpiler import distribute_transpiler from ..transpiler import distribute_transpiler
__all__ = [ __all__ = [

@ -536,7 +536,7 @@ class Operator(object):
OP_WITHOUT_KERNEL_SET = { OP_WITHOUT_KERNEL_SET = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'listen_and_serv', 'parallel_do', 'save_combine', 'loadload_combine',
'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
} }

@ -15,6 +15,7 @@
from __future__ import print_function from __future__ import print_function
import contextlib import contextlib
import multiprocessing import multiprocessing
import os
import six import six
import threading import threading
@ -344,70 +345,71 @@ def _copy_reader_create_op_(block, op):
return new_op return new_op
@templatedoc(op_type='create_recordio_file_reader') if os.name != 'nt':
def open_recordio_file(filename, @templatedoc(op_type='create_recordio_file_reader')
shapes, def open_recordio_file(filename,
lod_levels, shapes,
dtypes, lod_levels,
pass_num=1, dtypes,
for_parallel=True): pass_num=1,
""" for_parallel=True):
${comment} """
${comment}
Args:
filename(${filename_type}): ${filename_comment}. Args:
shapes(list): List of tuples which declaring data shapes. filename(${filename_type}): ${filename_comment}.
lod_levels(${lod_levels_type}): ${lod_levels_comment}. shapes(list): List of tuples which declaring data shapes.
dtypes(list): List of strs which declaring data type. lod_levels(${lod_levels_type}): ${lod_levels_comment}.
pass_num(int): Number of passes to run. dtypes(list): List of strs which declaring data type.
for_parallel(Bool): Set it as True if you are going to run pass_num(int): Number of passes to run.
subsequent operators in parallel. for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel.
Returns:
${out_comment}. Returns:
${out_comment}.
Examples:
Examples:
>>> import paddle.fluid as fluid
>>> reader = fluid.layers.io.open_recordio_file( >>> import paddle.fluid as fluid
>>> filename='./data.recordio', >>> reader = fluid.layers.io.open_recordio_file(
>>> shapes=[(3,224,224), (1)], >>> filename='./data.recordio',
>>> lod_levels=[0, 0], >>> shapes=[(3,224,224), (1)],
>>> dtypes=['float32', 'int64']) >>> lod_levels=[0, 0],
>>> # Via the reader, we can use 'read_file' layer to get data: >>> dtypes=['float32', 'int64'])
>>> image, label = fluid.layers.io.read_file(reader) >>> # Via the reader, we can use 'read_file' layer to get data:
""" >>> image, label = fluid.layers.io.read_file(reader)
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] """
shape_concat = [] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
ranks = [] shape_concat = []
ranks = []
for shape in shapes:
shape_concat.extend(shape) for shape in shapes:
ranks.append(len(shape)) shape_concat.extend(shape)
ranks.append(len(shape))
var_name = unique_name('open_recordio_file')
var_name = unique_name('open_recordio_file')
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_blk = default_startup_program().current_block()
startup_blk.append_op( startup_var = startup_blk.create_var(name=var_name)
type='create_recordio_file_reader', startup_blk.append_op(
outputs={'Out': [startup_var]}, type='create_recordio_file_reader',
attrs={ outputs={'Out': [startup_var]},
'shape_concat': shape_concat, attrs={
'lod_levels': lod_levels, 'shape_concat': shape_concat,
'filename': filename, 'lod_levels': lod_levels,
'ranks': ranks 'filename': filename,
}) 'ranks': ranks
})
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
if pass_num > 1: if pass_num > 1:
main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
return monkey_patch_reader_methods(main_prog_var) return monkey_patch_reader_methods(main_prog_var)
def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):

File diff suppressed because it is too large Load Diff

@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
from .. import core from .. import core
from ..framework import convert_np_dtype_to_dtype_ from ..framework import convert_np_dtype_to_dtype_
@ -99,27 +100,28 @@ Examples:
>>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
""" """
__all__ += ['cumsum'] if os.name != 'nt':
__all__ += ['cumsum']
_cum_sum_ = generate_layer_fn('cumsum') _cum_sum_ = generate_layer_fn('cumsum')
def cumsum(x, axis=None, exclusive=None, reverse=None): def cumsum(x, axis=None, exclusive=None, reverse=None):
locals_var = locals().keys() locals_var = locals().keys()
kwargs = dict() kwargs = dict()
for name in locals_var: for name in locals_var:
val = locals()[name] val = locals()[name]
if val is not None: if val is not None:
kwargs[name] = val kwargs[name] = val
return _cum_sum_(**kwargs) return _cum_sum_(**kwargs)
cumsum.__doc__ = _cum_sum_.__doc__ + """
Examples:
>>> data = fluid.layers.data(name="input", shape=[32, 784]) cumsum.__doc__ = _cum_sum_.__doc__ + """
>>> result = fluid.layers.cumsum(data, axis=0) Examples:
"""
>>> data = fluid.layers.data(name="input", shape=[32, 784])
>>> result = fluid.layers.cumsum(data, axis=0)
"""
__all__ += ['thresholded_relu'] __all__ += ['thresholded_relu']

@ -180,7 +180,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
package_data['paddle.libs']+=['libmkldnn.so.0'] package_data['paddle.libs']+=['libmkldnn.so.0']
shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
# remove unused paddle/libs/__init__.py # remove unused paddle/libs/__init__.py
os.remove(libs_path+'/__init__.py') if os.path.isfile(libs_path+'/__init__.py'):
os.remove(libs_path+'/__init__.py')
package_dir['paddle.libs']=libs_path package_dir['paddle.libs']=libs_path
# change rpath of core.so, add $ORIGIN/../libs/ to it. # change rpath of core.so, add $ORIGIN/../libs/ to it.

Loading…
Cancel
Save