diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9385943da9..90c25e4350 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,18 +7,14 @@
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
- sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
+ sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
-# TODO(yuyang): trailing whitespace has some bugs on markdown
-# files now, please not add it to pre-commit hook now
-# - id: trailing-whitespace
-#
-# TODO(yuyang): debug-statements not fit for Paddle, because
-# not all of our python code is runnable. Some are used for
-# documenation
-# - id: debug-statements
+- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+ sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+ hooks:
+ - id: clang-formater
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b62f29787..af193c27ae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
-set(PADDLE_MINOR_VERSION 8)
-set(PADDLE_PATCH_VERSION 0b3)
+set(PADDLE_MINOR_VERSION 9)
+set(PADDLE_PATCH_VERSION 0a0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
diff --git a/README.md b/README.md
index 81ff8c7122..8a8e158415 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
# PaddlePaddle
-[](https://travis-ci.org/baidu/Paddle)
-[](https://coveralls.io/github/baidu/Paddle?branch=develop)
-[](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-[](LICENSE)
+[](https://travis-ci.org/PaddlePaddle/Paddle)
+[](http://www.paddlepaddle.org/)
+[](http://www.paddlepaddle.org/cn/index.html)
+[](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
+[](https://github.com/PaddlePaddle/Paddle/releases)
+[](LICENSE)
+
Welcome to the PaddlePaddle GitHub.
@@ -14,7 +17,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
+Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
## Features
@@ -26,15 +29,15 @@ Please refer to our [release announcement](https://github.com/baidu/Paddle/relea
connection.
- **Efficiency**
-
+
In order to unleash the power of heterogeneous computing resource,
optimization occurs at different levels of PaddlePaddle, including
computing, memory, architecture and communication. The following are some
examples:
- Optimized math operations through SSE/AVX intrinsics, BLAS libraries
- (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
- - Highly optimized recurrent networks which can handle **variable-length**
+ (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+ - Highly optimized recurrent networks which can handle **variable-length**
sequence without padding.
- Optimized local and distributed training for models with high dimensional
sparse data.
@@ -57,41 +60,39 @@ Please refer to our [release announcement](https://github.com/baidu/Paddle/relea
## Installation
Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from
-pre-built packages (**docker image**, **deb package**) or
+pre-built packages (**docker image**, **deb package**) or
directly build on **Linux** and **Mac OS X** from the source code.
-
+
## Documentation
Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en)
You can follow the quick start tutorial to learn how use PaddlePaddle
step-by-step.
-
+
- [Example and Demo](http://paddlepaddle.org/doc/demo/)
We provide five demos, including: image classification, sentiment analysis,
- sequence to sequence model, recommendation, semantic role labeling.
-
+ sequence to sequence model, recommendation, semantic role labeling.
+
- [Distributed Training](http://paddlepaddle.org/doc/cluster)
This system supports training deep learning models on multiple machines
with data parallelism.
-
+
- [Python API](http://paddlepaddle.org/doc/ui/)
PaddlePaddle supports using either Python interface or C++ to build your
system. We also use SWIG to wrap C++ source code to create a user friendly
interface for Python. You can also use SWIG to create interface for your
favorite programming language.
-
+
- [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html)
We sincerely appreciate your interest and contributions. If you would like to
- contribute, please read the contribution guide.
+ contribute, please read the contribution guide.
- [Source Code Documents](http://paddlepaddle.org/doc/source/)
## Ask Questions
-Please join the [**gitter chat**](https://gitter.im/PaddlePaddle/Deep_Learning) or send email to
-**paddle-dev@baidu.com** to ask questions and talk about methods and models.
-Framework development discussions and
-bug reports are collected on [Issues](https://github.com/baidu/paddle/issues).
+
+You are welcome to submit questions and bug reports as [GitHub Issues](https://github.com/PaddlePaddle/Paddle/issues).
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py
index 2982e54c66..daca5f01cf 100644
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -17,24 +17,15 @@ import os
from optparse import OptionParser
-def extract_dict_features(pair_file, feature_file, src_dict_file,
- tgt_dict_file):
- src_dict = set()
- tgt_dict = set()
-
- with open(pair_file) as fin, open(feature_file, 'w') as feature_out, open(
- src_dict_file, 'w') as src_dict_out, open(tgt_dict_file,
- 'w') as tgt_dict_out:
+def extract_dict_features(pair_file, feature_file):
+
+ with open(pair_file) as fin, open(feature_file, 'w') as feature_out:
for line in fin:
- sentence, labels = line.strip().split('\t')
+ sentence, predicate, labels = line.strip().split('\t')
sentence_list = sentence.split()
labels_list = labels.split()
- src_dict.update(sentence_list)
- tgt_dict.update(labels_list)
-
verb_index = labels_list.index('B-V')
- verb_feature = sentence_list[verb_index]
mark = [0] * len(labels_list)
if verb_index > 0:
@@ -42,47 +33,50 @@ def extract_dict_features(pair_file, feature_file, src_dict_file,
ctx_n1 = sentence_list[verb_index - 1]
else:
ctx_n1 = 'bos'
- ctx_n1_feature = ctx_n1
+
+ if verb_index > 1:
+ mark[verb_index - 2] = 1
+ ctx_n2 = sentence_list[verb_index - 2]
+ else:
+ ctx_n2 = 'bos'
mark[verb_index] = 1
- ctx_0_feature = sentence_list[verb_index]
+ ctx_0 = sentence_list[verb_index]
if verb_index < len(labels_list) - 2:
mark[verb_index + 1] = 1
ctx_p1 = sentence_list[verb_index + 1]
else:
ctx_p1 = 'eos'
- ctx_p1_feature = ctx_p1
+
+ if verb_index < len(labels_list) - 3:
+ mark[verb_index + 2] = 1
+ ctx_p2 = sentence_list[verb_index + 2]
+ else:
+ ctx_p2 = 'eos'
+
feature_str = sentence + '\t' \
- + verb_feature + '\t' \
- + ctx_n1_feature + '\t' \
- + ctx_0_feature + '\t' \
- + ctx_p1_feature + '\t' \
+ + predicate + '\t' \
+ + ctx_n2 + '\t' \
+ + ctx_n1 + '\t' \
+ + ctx_0 + '\t' \
+ + ctx_p1 + '\t' \
+ + ctx_p2 + '\t' \
+                          + ' '.join([str(i) for i in mark]) + '\t' \
+                          + labels
feature_out.write(feature_str + '\n')
- src_dict_out.write('\n')
- src_dict_out.write('\n'.join(list(src_dict)))
-
- tgt_dict_out.write('\n'.join(list(tgt_dict)))
if __name__ == '__main__':
- usage = '-p pair_file -f feature_file -s source dictionary -t target dictionary '
+ usage = '-p pair_file -f feature_file'
parser = OptionParser(usage)
parser.add_option('-p', dest='pair_file', help='the pair file')
- parser.add_option(
- '-f', dest='feature_file', help='the file to store feature')
- parser.add_option(
- '-s', dest='src_dict', help='the file to store source dictionary')
- parser.add_option(
- '-t', dest='tgt_dict', help='the file to store target dictionary')
+ parser.add_option('-f', dest='feature_file', help='the feature file')
(options, args) = parser.parse_args()
- extract_dict_features(options.pair_file, options.feature_file,
- options.src_dict, options.tgt_dict)
+ extract_dict_features(options.pair_file, options.feature_file)
diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py
index 4d1bef8f95..86ab00ce41 100644
--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ b/demo/semantic_role_labeling/data/extract_pairs.py
@@ -51,7 +51,7 @@ def read_sentences(words_file):
for line in fin:
line = line.strip()
if line == '':
- sentences.append(s.lower())
+ sentences.append(s)
s = ''
else:
s += line + ' '
@@ -64,6 +64,11 @@ def transform_labels(sentences, labels):
if len(labels[i]) == 1:
continue
else:
+ verb_list = []
+ for x in labels[i][0]:
+                if x != '-':
+                    verb_list.append(x)
+
for j in xrange(1, len(labels[i])):
label_list = labels[i][j]
current_tag = 'O'
@@ -88,8 +93,7 @@ def transform_labels(sentences, labels):
is_in_bracket = True
else:
print 'error:', ll
-
- sen_lab_pair.append((sentences[i], label_seq))
+ sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq))
return sen_lab_pair
@@ -97,9 +101,9 @@ def write_file(sen_lab_pair, output_file):
with open(output_file, 'w') as fout:
for x in sen_lab_pair:
sentence = x[0]
- label_seq = ' '.join(x[1])
- assert len(sentence.split()) == len(x[1])
- fout.write(sentence + '\t' + label_seq + '\n')
+ label_seq = ' '.join(x[2])
+ assert len(sentence.split()) == len(x[2])
+            fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n')
if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
index 268c0995e2..55e33f4685 100644
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
@@ -14,6 +14,10 @@
# limitations under the License.
set -e
wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
tar -xzvf conll05st-tests.tar.gz
rm conll05st-tests.tar.gz
cp ./conll05st-release/test.wsj/words/test.wsj.words.gz .
@@ -22,4 +26,4 @@ gunzip test.wsj.words.gz
gunzip test.wsj.props.gz
python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair
-python extract_dict_feature.py -p test.wsj.seq_pair -f feature -s src.dict -t tgt.dict
+python extract_dict_feature.py -p test.wsj.seq_pair -f feature
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
index 5c003584a5..d4c137ef42 100644
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -17,11 +17,15 @@ from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
-def hook(settings, word_dict, label_dict, **kwargs):
+def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
settings.word_dict = word_dict
settings.label_dict = label_dict
+ settings.predicate_dict = predicate_dict
+
#all inputs are integral and sequential type
settings.slots = [
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(predicate_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
@@ -31,27 +35,33 @@ def hook(settings, word_dict, label_dict, **kwargs):
]
-@provider(init_hook=hook)
-def process(obj, file_name):
+def get_batch_size(yield_data):
+    # With calc_batch_size, the batch size is counted in words: the size of a
+    # sample is the length of its first slot (the sentence).
+    return len(yield_data[0])
+
+
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+ can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
with open(file_name, 'r') as fdata:
for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = \
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
line.strip().split('\t')
-
+
words = sentence.split()
sen_len = len(words)
- word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
+ word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
- predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
- ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
- ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX)] * sen_len
- ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+ ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+ ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+ ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+ ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
label_list = label.split()
- label_slot = [obj.label_dict.get(w) for w in label_list]
-
- yield word_slot, predicate_slot, ctx_n1_slot, \
- ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
+ label_slot = [settings.label_dict.get(w) for w in label_list]
+ yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py
index e3f6edad69..54ceff0e72 100644
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -18,8 +18,9 @@ import sys
from paddle.trainer_config_helpers import *
+import math
#file paths
-word_dict_file = './data/src.dict'
-label_dict_file = './data/tgt.dict'
+word_dict_file = './data/wordDict.txt'
+label_dict_file = './data/targetDict.txt'
+predicate_file = './data/verbDict.txt'
train_list_file = './data/train.list'
test_list_file = './data/test.list'
@@ -30,8 +31,10 @@ if not is_predict:
#load dictionaries
word_dict = dict()
label_dict = dict()
+ predicate_dict = dict()
with open(word_dict_file, 'r') as f_word, \
- open(label_dict_file, 'r') as f_label:
+ open(label_dict_file, 'r') as f_label, \
+ open(predicate_file, 'r') as f_pre:
for i, line in enumerate(f_word):
w = line.strip()
word_dict[w] = i
@@ -40,6 +43,11 @@ if not is_predict:
w = line.strip()
label_dict[w] = i
+ for i, line in enumerate(f_pre):
+ w = line.strip()
+ predicate_dict[w] = i
+
+
if is_test:
train_list_file = None
@@ -50,91 +58,157 @@ if not is_predict:
module='dataprovider',
obj='process',
args={'word_dict': word_dict,
- 'label_dict': label_dict})
+ 'label_dict': label_dict,
+                  'predicate_dict': predicate_dict})
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
+ pred_len = len(predicate_dict)
else:
word_dict_len = get_config_arg('dict_len', int)
label_dict_len = get_config_arg('label_len', int)
+ pred_len = get_config_arg('pred_len', int)
+############################## Hyper-parameters ##################################
mark_dict_len = 2
word_dim = 32
mark_dim = 5
-hidden_dim = 128
+hidden_dim = 512
depth = 8
-emb_lr = 1e-2
-fc_lr = 1e-2
-lstm_lr = 2e-2
+
+########################### Optimizer #######################################
+
settings(
batch_size=150,
- learning_method=AdamOptimizer(),
- learning_rate=1e-3,
+ learning_method=MomentumOptimizer(momentum=0),
+ learning_rate=2e-2,
regularization=L2Regularization(8e-4),
- gradient_clipping_threshold=25)
+ is_async=False,
+ model_average=ModelAverage(average_window=0.5,
+ max_average_window=10000),
+)
-#6 features
+
+####################################### network ##############################
+#8 features and 1 target
word = data_layer(name='word_data', size=word_dict_len)
-predicate = data_layer(name='verb_data', size=word_dict_len)
+predicate = data_layer(name='verb_data', size=pred_len)
+
+ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len)
ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len)
ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len)
ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
+ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
mark = data_layer(name='mark_data', size=mark_dict_len)
+
if not is_predict:
target = data_layer(name='target', size=label_dict_len)
-ptt = ParameterAttribute(name='src_emb', learning_rate=emb_lr)
-layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-fc_para_attr = ParameterAttribute(learning_rate=fc_lr)
-lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=lstm_lr)
-para_attr = [fc_para_attr, lstm_para_attr]
-word_embedding = embedding_layer(size=word_dim, input=word, param_attr=ptt)
-predicate_embedding = embedding_layer(
- size=word_dim, input=predicate, param_attr=ptt)
-ctx_n1_embedding = embedding_layer(size=word_dim, input=ctx_n1, param_attr=ptt)
-ctx_0_embedding = embedding_layer(size=word_dim, input=ctx_0, param_attr=ptt)
-ctx_p1_embedding = embedding_layer(size=word_dim, input=ctx_p1, param_attr=ptt)
-mark_embedding = embedding_layer(size=mark_dim, input=mark)
+default_std = 1 / math.sqrt(hidden_dim) / 3.0
+
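+# Note: the 'emb' parameter is expected to be initialized from the pre-trained
+# ./data/emb file (downloaded by get_data.sh; train.sh passes
+# --init_model_path=./data) and is kept fixed here (learning_rate=0.).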
+emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
+std_0 = ParameterAttribute(initial_std=0.)
+std_default = ParameterAttribute(initial_std=default_std)
+
+predicate_embedding = embedding_layer(
+    size=word_dim, input=predicate,
+    param_attr=ParameterAttribute(name='vemb', initial_std=default_std))
+mark_embedding = embedding_layer(
+    name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
+
+word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para)
+              for x in word_input]
+emb_layers.append(predicate_embedding)
+emb_layers.append(mark_embedding)
hidden_0 = mixed_layer(
+ name='hidden0',
size=hidden_dim,
- input=[
- full_matrix_projection(input=word_embedding),
- full_matrix_projection(input=predicate_embedding),
- full_matrix_projection(input=ctx_n1_embedding),
- full_matrix_projection(input=ctx_0_embedding),
- full_matrix_projection(input=ctx_p1_embedding),
- full_matrix_projection(input=mark_embedding),
- ])
+ bias_attr=std_default,
+    input=[full_matrix_projection(input=emb, param_attr=std_default)
+           for emb in emb_layers])
+
-lstm_0 = lstmemory(input=hidden_0, layer_attr=layer_attr)
+mix_hidden_lr = 1e-3
+lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
+hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
+
+lstm_0 = lstmemory(name='lstm0',
+ input=hidden_0,
+ act=ReluActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=SigmoidActivation(),
+ bias_attr=std_0,
+ param_attr=lstm_para_attr)
#stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
+
for i in range(1, depth):
- fc = fc_layer(input=input_tmp, size=hidden_dim, param_attr=para_attr)
+ mix_hidden = mixed_layer(name='hidden'+str(i),
+ size=hidden_dim,
+ bias_attr=std_default,
+ input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
+ full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
+ ]
+ )
+
+ lstm = lstmemory(name='lstm'+str(i),
+ input=mix_hidden,
+ act=ReluActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=SigmoidActivation(),
+ reverse=((i % 2)==1),
+ bias_attr=std_0,
+ param_attr=lstm_para_attr)
+
+ input_tmp = [mix_hidden, lstm]
+
+feature_out = mixed_layer(name='output',
+ size=label_dict_len,
+ bias_attr=std_default,
+ input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
+ full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
+                      ])
- lstm = lstmemory(
- input=fc,
- act=ReluActivation(),
- reverse=(i % 2) == 1,
- layer_attr=layer_attr)
- input_tmp = [fc, lstm]
-prob = fc_layer(
- input=input_tmp,
- size=label_dict_len,
- act=SoftmaxActivation(),
- param_attr=para_attr)
if not is_predict:
- cls = classification_cost(input=prob, label=target)
- outputs(cls)
+    crf_l = crf_layer(name='crf',
+                      size=label_dict_len,
+                      input=feature_out,
+                      label=target,
+                      param_attr=ParameterAttribute(name='crfw',
+                                                    initial_std=default_std,
+                                                    learning_rate=mix_hidden_lr))
+
+    crf_dec_l = crf_decoding_layer(name='crf_dec_l',
+                                   size=label_dict_len,
+                                   input=feature_out,
+                                   label=target,
+                                   param_attr=ParameterAttribute(name='crfw'))
+
+
+ eval = sum_evaluator(input=crf_dec_l)
+
+ outputs(crf_l)
+
else:
- outputs(prob)
+    crf_dec_l = crf_decoding_layer(name='crf_dec_l',
+                                   size=label_dict_len,
+                                   input=feature_out,
+                                   param_attr=ParameterAttribute(name='crfw'))
+
+ outputs(crf_dec_l)
+
diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py
index f051d4175c..2761814e18 100644
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@@ -26,7 +26,7 @@ UNK_IDX = 0
class Prediction():
- def __init__(self, train_conf, dict_file, model_dir, label_file):
+ def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
@@ -35,26 +35,41 @@ class Prediction():
self.dict = {}
self.labels = {}
+ self.predicate_dict={}
self.labels_reverse = {}
- self.load_dict_label(dict_file, label_file)
+ self.load_dict_label(dict_file, label_file, predicate_dict_file)
len_dict = len(self.dict)
len_label = len(self.labels)
-
- conf = parse_config(train_conf, 'dict_len=' + str(len_dict) +
- ',label_len=' + str(len_label) + ',is_predict=True')
+ len_pred = len(self.predicate_dict)
+
+ conf = parse_config(
+ train_conf,
+ 'dict_len=' + str(len_dict) +
+ ',label_len=' + str(len_label) +
+ ',pred_len=' + str(len_pred) +
+ ',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(model_dir)
slots = [
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_pred),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(2)
+ ]
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(2)
-        ]
self.converter = DataProviderConverter(slots)
- def load_dict_label(self, dict_file, label_file):
+ def load_dict_label(self, dict_file, label_file, predicate_dict_file):
"""
Load dictionary from self.dict_file.
"""
@@ -65,39 +80,42 @@ class Prediction():
self.labels[line.strip()] = line_count
self.labels_reverse[line_count] = line.strip()
+ for line_count, line in enumerate(open(predicate_dict_file, 'r')):
+ self.predicate_dict[line.strip()] = line_count
def get_data(self, data_file):
"""
Get input data of paddle format.
"""
with open(data_file, 'r') as fdata:
for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip(
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip(
).split('\t')
words = sentence.split()
sen_len = len(words)
-
+
word_slot = [self.dict.get(w, UNK_IDX) for w in words]
- predicate_slot = [self.dict.get(predicate, UNK_IDX)] * sen_len
+ predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
+ ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
+
+ yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot
- yield word_slot, predicate_slot, ctx_n1_slot, \
- ctx_0_slot, ctx_p1_slot, mark_slot
-
- def predict(self, data_file):
+ def predict(self, data_file, output_file):
"""
data_file: file name of input data.
"""
input = self.converter(self.get_data(data_file))
output = self.network.forwardTest(input)
- prob = output[0]["value"]
- lab = list(np.argsort(-prob)[:, 0])
+ lab = output[0]["id"].tolist()
- with open(data_file, 'r') as fin, open('predict.res', 'w') as fout:
+ with open(data_file, 'r') as fin, open(output_file, 'w') as fout:
index = 0
for line in fin:
sen = line.split('\t')[0]
@@ -109,8 +127,8 @@ class Prediction():
def option_parser():
- usage = ("python predict.py -c config -w model_dir "
- "-d word dictionary -l label_file -i input_file")
+ usage = ("python predict.py -c config -w model_dir "
+ "-d word dictionary -l label_file -i input_file -p pred_dict_file")
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option(
"-c",
@@ -131,6 +149,13 @@ def option_parser():
dest="label_file",
default=None,
help="label file")
+ parser.add_option(
+ "-p",
+ "--predict_dict_file",
+ action="store",
+ dest="predict_dict_file",
+ default=None,
+ help="predict_dict_file")
parser.add_option(
"-i",
"--data",
@@ -144,6 +169,14 @@ def option_parser():
dest="model_path",
default=None,
help="model path")
+
+ parser.add_option(
+ "-o",
+ "--output_file",
+ action="store",
+ dest="output_file",
+ default=None,
+ help="output file")
return parser.parse_args()
@@ -154,10 +187,12 @@ def main():
dict_file = options.dict_file
model_path = options.model_path
label_file = options.label_file
+ predict_dict_file = options.predict_dict_file
+ output_file = options.output_file
swig_paddle.initPaddle("--use_gpu=0")
- predict = Prediction(train_conf, dict_file, model_path, label_file)
- predict.predict(data_file)
+ predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
+    predict.predict(data_file, output_file)
if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh
index a545b9a5d5..d0acdb0bd0 100644
--- a/demo/semantic_role_labeling/predict.sh
+++ b/demo/semantic_role_labeling/predict.sh
@@ -26,15 +26,18 @@ LOG=`get_best_pass $log`
LOG=(${LOG})
best_model_path="output/pass-${LOG[1]}"
-
config_file=db_lstm.py
-dict_file=./data/src.dict
-label_file=./data/tgt.dict
+dict_file=./data/wordDict.txt
+label_file=./data/targetDict.txt
+predicate_dict_file=./data/verbDict.txt
input_file=./data/feature
+output_file=predict.res
python predict.py \
-c $config_file \
-w $best_model_path \
-l $label_file \
+ -p $predicate_dict_file \
-d $dict_file \
- -i $input_file
+ -i $input_file \
+ -o $output_file
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
index 844649e8c0..c4ab44f5ca 100644
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -36,4 +36,5 @@ paddle train \
--job=test \
--use_gpu=false \
--config_args=is_test=1 \
+ --test_all_data_in_one_period=1 \
2>&1 | tee 'test.log'
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
index c3a22b644b..420768bb2b 100644
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -16,11 +16,14 @@
set -e
paddle train \
--config=./db_lstm.py \
+ --use_gpu=0 \
+ --log_period=5000 \
+ --trainer_count=1 \
+ --show_parameter_stats_period=5000 \
--save_dir=./output \
- --trainer_count=4 \
- --log_period=10 \
- --num_passes=500 \
- --use_gpu=false \
- --show_parameter_stats_period=10 \
+ --num_passes=10000 \
+ --average_test_period=10000000 \
+ --init_model_path=./data \
+ --load_missing_parameter_strategy=rand \
--test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
+ 2>&1 | tee 'train.log'
diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py
index 894070e7c9..114a9138eb 100644
--- a/demo/sentiment/trainer_config.py
+++ b/demo/sentiment/trainer_config.py
@@ -29,6 +29,7 @@ settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
+ average_window=0.5,
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
diff --git a/doc/algorithm/rnn/rnn.rst b/doc/algorithm/rnn/rnn.rst
index 399c5da5ff..01d2caefb5 100644
--- a/doc/algorithm/rnn/rnn.rst
+++ b/doc/algorithm/rnn/rnn.rst
@@ -17,7 +17,7 @@ PaddlePaddle does not need any preprocessing to sequence data, such as padding.
.. code-block:: python
- settings.slots = [
+ settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))]
diff --git a/doc/demo/semantic_role_labeling/curve.jpg b/doc/demo/semantic_role_labeling/curve.jpg
new file mode 100644
index 0000000000..baa35ae7f0
Binary files /dev/null and b/doc/demo/semantic_role_labeling/curve.jpg differ
diff --git a/doc/demo/semantic_role_labeling/semantic_role_labeling.md b/doc/demo/semantic_role_labeling/semantic_role_labeling.md
index 890f731458..e2793b2b34 100644
--- a/doc/demo/semantic_role_labeling/semantic_role_labeling.md
+++ b/doc/demo/semantic_role_labeling/semantic_role_labeling.md
@@ -1,183 +1,200 @@
-# Semantic Role labeling Tutorial #
-
-Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]:
-
- [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ].
-
-- V: verb
-- A0: acceptor
-- A1: thing accepted
-- A2: accepted-from
-- A3: Attribute
-- AM-MOD: modal
-- AM-NEG: negation
-
-Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank.
-
-To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem.
-
-## Data Description
-The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license, the demo adopts the test data set of CoNLL-2005, which can be reached on website.
-
-To download and process the original data, user just need to execute the following command:
-
-```bash
-cd data
-./get_data.sh
-```
-Several new files appear in the `data `directory as follows.
-```bash
-conll05st-release:the test data set of CoNll-2005 shared task
-test.wsj.words:the Wall Street Journal data sentences
-test.wsj.props: the propositional arguments
-src.dict:the dictionary of words in sentences
-tgt.dict:the labels dictionary
-feature: the extracted features from data set
-```
-
-## Training
-### DB-LSTM
-Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit.
-
-Unlike Bidirectional-LSTM that used in Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model.
-
-The following figure shows a temporal expanded 2-layer DB-LSTM network.
-
-
-
-
-### Features
-Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote the argument position if it locates in the predicate context region, or mr = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
-
-
-
-
-In this sample, the coresponding labelled sentence is:
-
-[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] .
-
-In the demo, we adopt the feature template as above, consists of : `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`.
-
-### Data Provider
-
-`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The Six features and label are all IndexSlots.
-```
-def hook(settings, word_dict, label_dict, **kwargs):
- settings.word_dict = word_dict
- settings.label_dict = label_dict
- #all inputs are integral and sequential type
- settings.slots = [
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(2),
- integer_value_sequence(len(label_dict))]
-```
-The corresponding data iterator is as following:
-```
-@provider(use_seq=True, init_hook=hook)
-def process(obj, file_name):
- with open(file_name, 'r') as fdata:
- for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip().split('\t')
- words = sentence.split()
- sen_len = len(words)
- word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
-
- predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
- ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX) ] * sen_len
- ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX) ] * sen_len
- ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX) ] * sen_len
-
- marks = mark.split()
- mark_slot = [int(w) for w in marks]
-
- label_list = label.split()
- label_slot = [obj.label_dict.get(w) for w in label_list]
-
- yield word_slot, predicate_slot, ctx_n1_slot, ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
-```
-The `process`function yield 7 lists which are six features and labels.
-
-### Neural Network Config
-`db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure.
-
-Seven `data_layer` load instances from data provider. Six features are transformed into embedddings respectively, and mixed by `mixed_layer` . Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels.
-
-### Run Training
-The script for training is `train.sh`, user just need to execute:
-```bash
- ./train.sh
-```
-The content in `train.sh`:
-```
-paddle train \
- --config=./db_lstm.py \
- --save_dir=./output \
- --trainer_count=4 \
- --log_period=10 \
- --num_passes=500 \
- --use_gpu=false \
- --show_parameter_stats_period=10 \
- --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
-- \--config=./db_lstm.py : network config file.
-- \--save_di=./output: output path to save models.
-- \--trainer_count=4 : set thread number (or GPU count).
-- \--log_period=10 : print log every 20 batches.
-- \--num_passes=500: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
-- \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train.
-- \--show_parameter_stats_period=10: show parameter statistic every 100 batches.
-- \--test_all_data_in_one_period=1: test all data in every testing.
-
-
-After training, the models will be saved in directory `output`.
-
-### Run testing
-The script for testing is `test.sh`, user just need to execute:
-```bash
- ./test.sh
-```
-The main part in `tesh.sh`
-```
-paddle train \
- --config=./db_lstm.py \
- --model_list=$model_list \
- --job=test \
- --config_args=is_test=1 \
-```
-
- - \--config=./db_lstm.py: network config file
- - \--model_list=$model_list.list: model list file
- - \--job=test: indicate the test job
- - \--config_args=is_test=1: flag to indicate test
-
-
-### Run prediction
-The script for prediction is `predict.sh`, user just need to execute:
-```bash
- ./predict.sh
-
-```
-In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file
-```
-python predict.py
- -c $config_file
- -w $model_path
- -l $label_file
- -d $dict_file
- -i $input_file
-```
-
-`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
-
-After prediction, the result is saved in `predict.res`.
-
-## Reference
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005.
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
+# Semantic Role Labeling Tutorial #
+
+Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction, automatic document categorization and question answering. An instance is as follows [1]:
+
+ [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ].
+
+- V: verb
+- A0: acceptor
+- A1: thing accepted
+- A2: accepted-from
+- A3: Attribute
+- AM-MOD: modal
+- AM-NEG: negation
+
+Given the verb "accept", the chunks in the sentence play certain semantic roles. Here, the label scheme is from the Penn Proposition Bank.
+
+To date, most of the successful SRL systems have been built on top of some form of parsing results, where pre-defined feature templates over the syntactic structure are used. This tutorial presents an end-to-end system using a deep bidirectional long short-term memory (DB-LSTM) network [2] to solve the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards the SRL task as a sequence labeling problem.
+
+## Data Description
+The relevant paper [2] uses the data sets of the CoNLL-2005 & 2012 Shared Tasks for training and testing. Due to the data license, this demo adopts the test data set of CoNLL-2005, which is publicly available on the task website.
+
+To download and process the original data, simply execute the following commands:
+
+```bash
+cd data
+./get_data.sh
+```
+Several new files appear in the `data` directory, as follows.
+```bash
+conll05st-release: the test data set of the CoNLL-2005 shared task
+test.wsj.words: the Wall Street Journal data sentences
+test.wsj.props: the propositional arguments
+feature: the features extracted from the data set
+wordDict.txt: the word dictionary
+targetDict.txt: the label dictionary
+verbDict.txt: the predicate dictionary
+emb: the pre-trained word embeddings
+```
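+
+The three dictionary files map one token per line to its line index. Below is a minimal, runnable sketch of loading them (`load_dict` is a hypothetical helper; `db_lstm.py` performs the same enumerate-based loop inline):
+
+```python
+def load_dict(path):
+    # One entry per line; a token's id is its line number.
+    with open(path, 'r') as f:
+        return dict((line.strip(), i) for i, line in enumerate(f))
+
+word_dict = load_dict('./data/wordDict.txt')
+label_dict = load_dict('./data/targetDict.txt')
+predicate_dict = load_dict('./data/verbDict.txt')
+```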
+
+## Training
+### DB-LSTM
+Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit.
+
+Unlike the bidirectional LSTM used in the Sentiment Analysis demo, DB-LSTM stacks LSTM layers in a different way. First, a standard LSTM processes the sequence in the forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, which processes the sequence in the reverse direction. These two standard LSTM layers compose a pair. We then stack LSTM layers pair after pair to obtain a deep LSTM model, as sketched below.
+
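+In `db_lstm.py` (shown in full earlier in this diff), this alternation is driven by the loop index via `reverse=((i % 2) == 1)`. A trivial, runnable sketch of the resulting direction pattern:
+
+```python
+depth = 8  # number of stacked LSTM layers, as in db_lstm.py
+# Layer 0 (lstm_0) runs forward; layer i runs in reverse when i % 2 == 1,
+# matching reverse=((i % 2) == 1) in the config.
+directions = ['backward' if i % 2 else 'forward' for i in range(depth)]
+print(directions)
+```
+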
+The following figure shows a temporally expanded 2-layer DB-LSTM network.
+
+
+
+
+### Features
+Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features, predicate context (ctx-p) and region mark (mr), are also adopted. A single predicate word cannot fully describe the predicate information, especially when the same word appears more than once in a sentence; with the predicate context, this ambiguity can be largely eliminated. Similarly, we use the region mark mr = 1 to denote that an argument position is inside the predicate context region, and mr = 0 otherwise. These four simple features are all we need for our SRL system. The features of one sample with the context size set to 1 are shown as follows [2]:
+
+
+
+
+In this sample, the corresponding labeled sentence is:
+
+[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] .
+
+In the demo, we adopt a feature template similar to the one above, consisting of `argument`, `predicate`, `ctx-p (p=-2,-1,0,1,2)` and `mark`, and use the `B/I/O` scheme to label each argument. These features and labels are stored in the `feature` file, separated by `\t`.
+
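+A simplified, self-contained sketch of the region mark computation for the sample sentence (`region_mark` is a hypothetical helper; `extract_dict_feature.py` inlines this logic with per-position boundary checks):
+
+```python
+def region_mark(words, verb_index, window=2):
+    # mr = 1 for positions inside the predicate context region, 0 otherwise.
+    mark = [0] * len(words)
+    lo = max(0, verb_index - window)
+    hi = min(len(words) - 1, verb_index + window)
+    for i in range(lo, hi + 1):
+        mark[i] = 1
+    return mark
+
+words = "A record date has n't been set .".split()
+print(region_mark(words, words.index('set')))  # [0, 0, 0, 0, 1, 1, 1, 1]
+```
+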
+### Data Provider
+
+`dataprovider.py` is the Python file that wraps the data. The `hook()` function defines the data slots for the network. The 8 features and the label are all index slots (integer sequences).
+```
+def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
+    settings.word_dict = word_dict
+    settings.label_dict = label_dict
+    settings.predicate_dict = predicate_dict
+    #all inputs are integral and sequential type
+ settings.slots = [
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(predicate_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(2),
+ integer_value_sequence(len(label_dict))]
+```
+The corresponding data iterator is as follows:
+```
+def get_batch_size(yield_data):
+    return len(yield_data[0])
+
+
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+ can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
+ with open(file_name, 'r') as fdata:
+ for line in fdata:
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
+ line.strip().split('\t')
+
+ words = sentence.split()
+ sen_len = len(words)
+ word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
+
+ predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+ ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+ ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+ ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+ ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+ marks = mark.split()
+ mark_slot = [int(w) for w in marks]
+
+ label_list = label.split()
+ label_slot = [settings.label_dict.get(w) for w in label_list]
+ yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
+```
+The `process` function yields 9 lists: 8 features and the label.
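+
+For example, with a toy word dictionary (ids made up purely for illustration), the first yielded slot looks like:
+
+```python
+UNK_IDX = 0
+word_dict = {'set': 5, 'record': 9, 'date': 11}  # toy ids, illustration only
+words = "A record date has n't been set .".split()
+word_slot = [word_dict.get(w, UNK_IDX) for w in words]
+print(word_slot)  # [0, 9, 11, 0, 0, 0, 5, 0]
+```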
+
+### Neural Network Config
+`db_lstm.py` is the neural network config file. It loads the dictionaries and defines the data provider module and the network architecture used during training.
+
+Nine `data_layer`s load instances from the data provider. The eight features are transformed into embeddings respectively and combined by a `mixed_layer`. Deep bidirectional LSTM layers then extract features for the output layer, and a `crf_layer` computes the cost over the label sequence.
+
+### Run Training
+The script for training is `train.sh`; simply execute:
+```bash
+ ./train.sh
+```
+The content in `train.sh`:
+```
+paddle train \
+ --config=./db_lstm.py \
+ --use_gpu=0 \
+ --log_period=5000 \
+ --trainer_count=1 \
+ --show_parameter_stats_period=5000 \
+ --save_dir=./output \
+ --num_passes=10000 \
+ --average_test_period=10000000 \
+ --init_model_path=./data \
+ --load_missing_parameter_strategy=rand \
+ --test_all_data_in_one_period=1 \
+2>&1 | tee 'train.log'
+```
+
+- \--config=./db_lstm.py: network config file.
+- \--use_gpu=0: train on CPU; set it to 1 if you have installed the GPU version of PaddlePaddle and want to train on GPU. Note that `crf_layer` does not support GPU yet.
+- \--log_period=5000: print a log every 5000 batches.
+- \--trainer_count=1: set the thread number (or GPU count).
+- \--show_parameter_stats_period=5000: show parameter statistics every 5000 batches.
+- \--save_dir=./output: output path to save models.
+- \--num_passes=10000: set the number of passes; one pass in PaddlePaddle means training on all samples in the dataset once.
+- \--average_test_period=10000000: test on the averaged parameters every average_test_period batches.
+- \--init_model_path=./data: parameter initialization path.
+- \--load_missing_parameter_strategy=rand: randomly initialize parameters missing from the initial model.
+- \--test_all_data_in_one_period=1: test all data in one period.
+
+
+After training, the models will be saved in the directory `output`. Our training curve is shown below:
+
+
+
+
+### Run testing
+The script for testing is `test.sh`; simply execute:
+```bash
+ ./test.sh
+```
+The main part of `test.sh`:
+```
+paddle train \
+ --config=./db_lstm.py \
+ --model_list=$model_list \
+ --job=test \
+    --config_args=is_test=1 \
+    --test_all_data_in_one_period=1 \
+```
+
+ - \--config=./db_lstm.py: network config file
+ - \--model_list=$model_list: model list file
+ - \--job=test: indicate the test job
+ - \--config_args=is_test=1: flag to indicate testing
+ - \--test_all_data_in_one_period=1: test all data in one period
+
+
+### Run prediction
+The script for prediction is `predict.sh`; simply execute:
+```bash
+  ./predict.sh
+```
+In `predict.sh`, the user should provide the network config file, model path, label file, predicate dictionary file, word dictionary file, input feature file and output file:
+```
+python predict.py \
+ -c $config_file \
+ -w $best_model_path \
+ -l $label_file \
+ -p $predicate_dict_file \
+ -d $dict_file \
+ -i $input_file \
+ -o $output_file
+```
+
+`predict.py` is the main executable Python script, which loads the model, loads the data and runs prediction. The network ends with a CRF decoding layer (`crf_decoding_layer`), so it directly outputs the best label sequence for each sentence instead of a probability matrix.
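+
+A small sketch of mapping decoded label ids back to tag strings (mirroring `load_dict_label` in `predict.py`; the ids below are made up for illustration):
+
+```python
+labels_reverse = {}
+for line_count, line in enumerate(open('./data/targetDict.txt', 'r')):
+    labels_reverse[line_count] = line.strip()
+
+decoded_ids = [0, 3, 7]  # example ids from the CRF decoding layer
+print([labels_reverse[i] for i in decoded_ids])
+```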
+
+After prediction, the result is saved in `predict.res`.
+
+## Reference
+[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005.
+
+[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/demo/sentiment_analysis/sentiment_analysis.md b/doc/demo/sentiment_analysis/sentiment_analysis.md
index 385f49891d..c53952c544 100644
--- a/doc/demo/sentiment_analysis/sentiment_analysis.md
+++ b/doc/demo/sentiment_analysis/sentiment_analysis.md
@@ -6,7 +6,7 @@ Sentiment analysis is also used to monitor social media based on large amount of
On the other hand, grabbing the user comments of products and analyzing their sentiment are useful to understand user preferences for companies, products, even competing products.
-This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the [Internet Movie Database (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy.
+This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy.
## Data Preparation
@@ -39,7 +39,7 @@ imdbEr.txt imdb.vocab README test train
* imdbEr.txt: expected rating for each token in imdb.vocab.
* README: data documentation.
-Both train and test set directory contains:
+The files in the train set directory are listed below. The test set contains the same files except `unsup` and `urls_unsup.txt`.
```
labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt
@@ -151,6 +151,7 @@ settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
+ average_window=0.5,
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
@@ -163,17 +164,18 @@ stacked_lstm_net(dict_dim, class_dim=class_dim,
* **Data Definition**:
* get\_config\_arg(): get arguments setted by `--config_args=xx` in commandline argument.
- * Define TrainData and TestData provider, here using Python interface (PyDataProviderWrapper) of PaddlePaddle to load data. For details, you can refer to the document of PyDataProvider.
+   * Define the data provider, here using the Python interface to load data. For details, you can refer to the PyDataProvider2 documentation.
* **Algorithm Configuration**:
- * use sgd algorithm.
- * use adam optimization.
* set batch size of 128.
- * set average sgd window.
* set global learning rate.
+ * use adam optimization.
+ * set average sgd window.
+ * set L2 regularization.
+ * set gradient clipping threshold.
* **Network Configuration**:
- * dict_dim: get dictionary dimension.
- * class_dim: set category number, IMDB has two label, namely positive and negative label.
+ * dict_dim: dictionary dimension.
+  * class_dim: the number of categories; IMDB has two labels, positive and negative.
* `stacked_lstm_net`: predefined network as shown in Figure 3, use this network by default.
* `bidirectional_lstm_net`: predefined network as shown in Figure 2.
diff --git a/doc/dev/new_layer/new_layer.rst b/doc/dev/new_layer/new_layer.rst
index 2fa0073048..af8b76a307 100644
--- a/doc/dev/new_layer/new_layer.rst
+++ b/doc/dev/new_layer/new_layer.rst
@@ -60,7 +60,7 @@ Implement C++ Class
The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below.
-It needs to derive the base class :code:`paddle::BaseLayer`, and it needs to override the following functions:
+It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions:
- constructor and destructor.
- :code:`init` function. It is used to initialize the parameters and settings.
diff --git a/doc/optimization/gpu_profiling.rst b/doc/optimization/gpu_profiling.rst
index 44ecb34885..667bf1364e 100644
--- a/doc/optimization/gpu_profiling.rst
+++ b/doc/optimization/gpu_profiling.rst
@@ -53,7 +53,7 @@ above profilers.
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
- :lines: 107-121
+ :lines: 111-124
:linenos:
The above code snippet includes two methods, you can use any of them to profile the regions of interest.
@@ -75,12 +75,12 @@ To enable built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_
Then, all information could be stamped in the console via :code:`printStatus` or :code:`printAllStatus` function.
As a simple example, consider the following:
-1. Add :code:`REGISTER_TIMER_INFO` and :code:`printStatus` functions (see the emphasize-lines).
+1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
- :lines: 107-121
- :emphasize-lines: 10-11,14
+ :lines: 111-124
+ :emphasize-lines: 8-10,13
:linenos:
2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
@@ -126,8 +126,8 @@ To use this command line profiler **nvprof**, you can simply issue the following
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
- :lines: 107-121
- :emphasize-lines: 7-8
+ :lines: 111-124
+ :emphasize-lines: 6-7
:linenos:
2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle.
diff --git a/doc/source/api/api.rst b/doc/source/api.rst
similarity index 90%
rename from doc/source/api/api.rst
rename to doc/source/api.rst
index 6fc450202d..30396c26b6 100644
--- a/doc/source/api/api.rst
+++ b/doc/source/api.rst
@@ -1,5 +1,5 @@
API
-========
+===
.. doxygenfile:: paddle/api/PaddleAPI.h
.. doxygenfile:: paddle/api/Internal.h
diff --git a/doc/source/cuda/cuda/cuda.rst b/doc/source/cuda/cuda/cuda.rst
deleted file mode 100644
index 52f17c2b2e..0000000000
--- a/doc/source/cuda/cuda/cuda.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-Cuda
-=============
-
-Dynamic Link Libs
---------------------------
-
-hl_dso_loader.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
-
-GPU Resources
-----------------
-
-hl_cuda.ph
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
-
-hl_cuda.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.h
-
-CUDA Wrapper
---------------
-
-hl_cuda_cublas.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
-
-hl_cuda_cudnn.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
-
-hl_cuda_cudnn.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
-
-
-
-
diff --git a/doc/source/cuda/cuda/index.rst b/doc/source/cuda/cuda/index.rst
deleted file mode 100644
index 5fa38ff0fc..0000000000
--- a/doc/source/cuda/cuda/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-CUDA
-====================
-
-.. toctree::
- :maxdepth: 3
-
- cuda.rst
diff --git a/doc/source/cuda/index.rst b/doc/source/cuda/index.rst
new file mode 100644
index 0000000000..b0fed2e7f7
--- /dev/null
+++ b/doc/source/cuda/index.rst
@@ -0,0 +1,9 @@
+CUDA
+====
+
+.. toctree::
+ :maxdepth: 2
+
+ matrix.rst
+ nn.rst
+ utils.rst
diff --git a/doc/source/cuda/matrix/matrix.rst b/doc/source/cuda/matrix.rst
similarity index 76%
rename from doc/source/cuda/matrix/matrix.rst
rename to doc/source/cuda/matrix.rst
index dd4f06599c..b7699c83ed 100644
--- a/doc/source/cuda/matrix/matrix.rst
+++ b/doc/source/cuda/matrix.rst
@@ -1,61 +1,59 @@
Matrix
-=======
+======
-Base Matrix
--------------
+Base
+----
hl_matrix.h
-``````````````````
+```````````
.. doxygenfile:: paddle/cuda/include/hl_matrix.h
hl_matrix_base.h
-``````````````````
+````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_base.cuh
hl_matrix_apply.cuh
-``````````````````````
+```````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_apply.cuh
hl_matrix_ops.cuh
-``````````````````````
+`````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_ops.cuh
hl_matrix_type.cuh
-``````````````````````
+``````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_type.cuh
hl_sse_matrix_kernel.cuh
-``````````````````````````
+````````````````````````
.. doxygenfile:: paddle/cuda/include/hl_sse_matrix_kernel.cuh
+Matrix Function
+---------------
+
hl_batch_transpose.h
-``````````````````````````
+````````````````````
.. doxygenfile:: paddle/cuda/include/hl_batch_transpose.h
-Sparse Matrix
---------------
-
-hl_sparse.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.h
-
-hl_sparse.ph
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.ph
-
-Others
----------------
-
hl_aggregate.h
-``````````````````
+``````````````
.. doxygenfile:: paddle/cuda/include/hl_aggregate.h
+hl_top_k.h
+``````````
+.. doxygenfile:: paddle/cuda/include/hl_top_k.h
+
hl_table_apply.h
-``````````````````
+````````````````
.. doxygenfile:: paddle/cuda/include/hl_table_apply.h
-hl_top_k.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_top_k.h
+Sparse Matrix
+-------------
+hl_sparse.h
+```````````
+.. doxygenfile:: paddle/cuda/include/hl_sparse.h
+hl_sparse.ph
+````````````
+.. doxygenfile:: paddle/cuda/include/hl_sparse.ph
diff --git a/doc/source/cuda/matrix/index.rst b/doc/source/cuda/matrix/index.rst
deleted file mode 100644
index 63f95eb466..0000000000
--- a/doc/source/cuda/matrix/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Matrix
-====================
-
-.. toctree::
- :maxdepth: 3
-
- matrix.rst
diff --git a/doc/source/cuda/rnn/rnn.rst b/doc/source/cuda/nn.rst
similarity index 79%
rename from doc/source/cuda/rnn/rnn.rst
rename to doc/source/cuda/nn.rst
index ce8ed96692..5577d01e72 100644
--- a/doc/source/cuda/rnn/rnn.rst
+++ b/doc/source/cuda/nn.rst
@@ -1,36 +1,39 @@
-Neural Networks
-==================
+Neural Network
+==============
Base
--------
+----
+
.. doxygenfile:: paddle/cuda/include/hl_gpu.h
-.. doxygenfile:: paddle/cuda/include/hl_cnn.h
.. doxygenfile:: paddle/cuda/include/hl_functions.h
.. doxygenfile:: paddle/cuda/include/hl_avx_functions.h
-.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_functions.cuh
-
-Activation Functions
------------------------
.. doxygenfile:: paddle/cuda/include/hl_activation_functions.h
+
+CNN Related APIs
+----------------
+.. doxygenfile:: paddle/cuda/include/hl_cnn.h
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
+
RNN Related APIs
------------------
+----------------
.. doxygenfile:: paddle/cuda/include/hl_recurrent_apply.cuh
.. doxygenfile:: paddle/cuda/include/hl_sequence.h
LSTM Model
-``````````````
+``````````
+
.. doxygenfile:: paddle/cuda/include/hl_lstm.h
.. doxygenfile:: paddle/cuda/include/hl_cpu_lstm.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_lstm.cuh
.. doxygenfile:: paddle/cuda/include/hl_lstm_ops.cuh
GRU Model
-````````````````
+`````````
+
.. doxygenfile:: paddle/cuda/include/hl_gru_ops.cuh
.. doxygenfile:: paddle/cuda/include/hl_cpu_gru.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_gru.cuh
-
-
diff --git a/doc/source/cuda/rnn/index.rst b/doc/source/cuda/rnn/index.rst
deleted file mode 100644
index 4913e47ba1..0000000000
--- a/doc/source/cuda/rnn/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-RNN
-====================
-
-.. toctree::
- :maxdepth: 3
-
- rnn.rst
diff --git a/doc/source/cuda/utils.rst b/doc/source/cuda/utils.rst
new file mode 100644
index 0000000000..850e8bd1c6
--- /dev/null
+++ b/doc/source/cuda/utils.rst
@@ -0,0 +1,37 @@
+Utils
+=====
+
+Dynamic Link Libs
+-----------------
+.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
+
+GPU Resources
+-------------
+
+hl_cuda.ph
+``````````
+.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
+
+hl_cuda.h
+`````````
+.. doxygenfile:: paddle/cuda/include/hl_cuda.h
+
+HPPL Base
+---------
+.. doxygenfile:: paddle/cuda/include/hl_base.h
+
+CUBLAS Wrapper
+--------------
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
+
+Timer
+-----
+.. doxygenfile:: paddle/cuda/include/hl_time.h
+
+Thread Resource
+---------------
+.. doxygenfile:: paddle/cuda/include/hl_thread.ph
+
+Device Function
+---------------
+.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh
diff --git a/doc/source/cuda/utils/index.rst b/doc/source/cuda/utils/index.rst
deleted file mode 100644
index 7a84cbe27d..0000000000
--- a/doc/source/cuda/utils/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Utils
-====================
-
-.. toctree::
- :maxdepth: 3
-
- utils.rst
diff --git a/doc/source/cuda/utils/utils.rst b/doc/source/cuda/utils/utils.rst
deleted file mode 100644
index 1ea3e5404a..0000000000
--- a/doc/source/cuda/utils/utils.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-Utilities
-===========
-
-HPPL Base
-------------
-
-hl_base.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_base.h
-
-Timer
------------
-
-hl_time.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_time.h
-
-Thread Resource
------------
-
-hl_thread.ph
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_thread.ph
diff --git a/doc/source/gserver/activations/index.rst b/doc/source/gserver/activations.rst
similarity index 83%
rename from doc/source/gserver/activations/index.rst
rename to doc/source/gserver/activations.rst
index ccdae41128..55b9d3be38 100644
--- a/doc/source/gserver/activations/index.rst
+++ b/doc/source/gserver/activations.rst
@@ -1,5 +1,5 @@
Activations
-=============
+===========
.. doxygenclass:: paddle::ActivationFunction
:members:
diff --git a/doc/source/gserver/dataprovider/index.rst b/doc/source/gserver/dataprovider/index.rst
deleted file mode 100644
index 4f6077f122..0000000000
--- a/doc/source/gserver/dataprovider/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Data Providers Documents
-==========================
-
-.. toctree::
- :maxdepth: 3
-
- dataproviders.rst
diff --git a/doc/source/gserver/dataprovider/dataproviders.rst b/doc/source/gserver/dataproviders.rst
similarity index 87%
rename from doc/source/gserver/dataprovider/dataproviders.rst
rename to doc/source/gserver/dataproviders.rst
index e8aa4bc356..c30d9d6a36 100644
--- a/doc/source/gserver/dataprovider/dataproviders.rst
+++ b/doc/source/gserver/dataproviders.rst
@@ -1,23 +1,27 @@
+==============
Data Providers
-================
+==============
-Base DataProvider
-------------------
+DataProviders
+=============
+
+Base
+----
.. doxygenclass:: paddle::DataProvider
:members:
DataProviderGroup
--------------------
+-----------------
.. doxygenclass:: paddle::DataProviderGroup
:members:
MultiDataProvider
--------------------
+-----------------
.. doxygenclass:: paddle::MultiDataProvider
:members:
PyDataProvider
-===================
+==============
IFieldScanner
-------------
@@ -45,7 +49,7 @@ SparseValueScanner
:members:
SequenceScanner
-------------------
+---------------
.. doxygenclass:: paddle::SequenceScanner
:members:
@@ -69,8 +73,8 @@ IPyDataProvider
.. doxygenclass:: paddle::PyDataProvider2
:members:
-Proto Data Provider
-===================
+ProtoDataProvider
+=================
ProtoDataProvider
-----------------
@@ -78,6 +82,6 @@ ProtoDataProvider
:members:
ProtoSequenceDataProvider
-----------------
+-------------------------
.. doxygenclass:: paddle::ProtoSequenceDataProvider
:members:
diff --git a/doc/source/gserver/evaluators/evaluators.rst b/doc/source/gserver/evaluators.rst
similarity index 96%
rename from doc/source/gserver/evaluators/evaluators.rst
rename to doc/source/gserver/evaluators.rst
index 0c5cc85e7d..f5361f76cd 100644
--- a/doc/source/gserver/evaluators/evaluators.rst
+++ b/doc/source/gserver/evaluators.rst
@@ -1,14 +1,15 @@
-Base Evaluator
-==============
+==========
+Evaluators
+==========
+
+Base
+====
-Evaluator
----------
.. doxygenclass:: paddle::Evaluator
:members:
-
-Utils
-=====
+Sum
+===
SumEvaluator
------------
diff --git a/doc/source/gserver/evaluators/index.rst b/doc/source/gserver/evaluators/index.rst
deleted file mode 100644
index 298de3e1a3..0000000000
--- a/doc/source/gserver/evaluators/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Evaluators
-==========
-
-.. toctree::
- :maxdepth: 3
-
- evaluators.rst
diff --git a/doc/source/gserver/gradientmachines/gradientmachines.rst b/doc/source/gserver/gradientmachines.rst
similarity index 54%
rename from doc/source/gserver/gradientmachines/gradientmachines.rst
rename to doc/source/gserver/gradientmachines.rst
index 3607664c85..04c8e91d03 100644
--- a/doc/source/gserver/gradientmachines/gradientmachines.rst
+++ b/doc/source/gserver/gradientmachines.rst
@@ -1,18 +1,18 @@
Gradient Machines
-================
+=================
GradientMachine
----------------------
+---------------
.. doxygenclass:: paddle::GradientMachine
:members:
-GradientMachineModel
---------------------
+GradientMachineMode
+-------------------
.. doxygenclass:: paddle::IGradientMachineMode
:members:
MultiGradientMachine
----------------------
+--------------------
.. doxygenclass:: paddle::MultiGradientMachine
:members:
@@ -21,20 +21,7 @@ TrainerThread
.. doxygenclass:: paddle::TrainerThread
:members:
-Recurrent Gradient Machines
----------------------------
+RecurrentGradientMachine
+------------------------
.. doxygenclass:: paddle::RecurrentGradientMachine
:members:
-
-Networks
-========
-
-NeuralNetwork
--------------
-.. doxygenclass:: paddle::NeuralNetwork
- :members:
-
-ParallelNeuralNetwork
----------------------
-.. doxygenclass:: paddle::ParallelNeuralNetwork
- :members:
diff --git a/doc/source/gserver/gradientmachines/index.rst b/doc/source/gserver/gradientmachines/index.rst
deleted file mode 100644
index 997c29a102..0000000000
--- a/doc/source/gserver/gradientmachines/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Gradient Machines Documents
-=============================
-
-.. toctree::
- :maxdepth: 3
-
- gradientmachines.rst
diff --git a/doc/source/gserver/index.rst b/doc/source/gserver/index.rst
new file mode 100644
index 0000000000..223b00b9a9
--- /dev/null
+++ b/doc/source/gserver/index.rst
@@ -0,0 +1,12 @@
+GServer
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ activations.rst
+ dataproviders.rst
+ evaluators.rst
+ gradientmachines.rst
+ layers.rst
+ networks.rst
diff --git a/doc/source/gserver/layers/layer.rst b/doc/source/gserver/layers.rst
similarity index 95%
rename from doc/source/gserver/layers/layer.rst
rename to doc/source/gserver/layers.rst
index 4b8e149505..191b2bdff2 100644
--- a/doc/source/gserver/layers/layer.rst
+++ b/doc/source/gserver/layers.rst
@@ -1,6 +1,10 @@
-Base
+======
+Layers
======
+Base
+====
+
Layer
-----
.. doxygenclass:: paddle::Layer
@@ -17,7 +21,7 @@ Operator
:members:
Data Layer
-===========
+==========
.. doxygenclass:: paddle::DataLayer
:members:
@@ -58,6 +62,11 @@ CudnnConvLayer
.. doxygenclass:: paddle::CudnnConvLayer
:members:
+ExpandConvBaseLayer
+-------------------
+.. doxygenclass:: paddle::ExpandConvBaseLayer
+ :members:
+
ExpandConvLayer
---------------
.. doxygenclass:: paddle::ExpandConvLayer
@@ -86,6 +95,16 @@ CudnnPoolLayer
.. doxygenclass:: paddle::CudnnPoolLayer
:members:
+SpatialPyramidPoolLayer
+-----------------------
+.. doxygenclass:: paddle::SpatialPyramidPoolLayer
+ :members:
+
+MaxOutLayer
+-----------
+.. doxygenclass:: paddle::MaxOutLayer
+ :members:
+
Norm Layers
===========
@@ -402,6 +421,11 @@ TransLayer
Sampling Layers
===============
+BilinearInterpLayer
+-------------------
+.. doxygenclass:: paddle::BilinearInterpLayer
+ :members:
+
MultinomialSampler
------------------
.. doxygenclass:: paddle::MultinomialSampler
diff --git a/doc/source/gserver/layers/index.rst b/doc/source/gserver/layers/index.rst
deleted file mode 100644
index 559c5436b1..0000000000
--- a/doc/source/gserver/layers/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Layers Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- layer.rst
diff --git a/doc/source/gserver/networks.rst b/doc/source/gserver/networks.rst
new file mode 100644
index 0000000000..73fb60d549
--- /dev/null
+++ b/doc/source/gserver/networks.rst
@@ -0,0 +1,12 @@
+Networks
+========
+
+NeuralNetwork
+-------------
+.. doxygenclass:: paddle::NeuralNetwork
+ :members:
+
+ParallelNeuralNetwork
+---------------------
+.. doxygenclass:: paddle::ParallelNeuralNetwork
+ :members:
diff --git a/doc/source/index.md b/doc/source/index.md
deleted file mode 100644
index 55fcdeb3df..0000000000
--- a/doc/source/index.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# Source Code Documents
-
-## cuda
-
-- [CUDA](cuda/cuda/index.rst)
-- [Matrix](cuda/matrix/index.rst)
-- [RNN](cuda/rnn/index.rst)
-- [Utils](cuda/utils/index.rst)
-
-## gserver
-
-- [Activations](gserver/activations/index.rst)
-- [Data Providers](gserver/dataprovider/index.rst)
-- [Evaluators](gserver/evaluators/index.rst)
-- [Gradient Machines](gserver/gradientmachines/index.rst)
-- [Layers](gserver/layers/index.rst)
-
-## math
-
-- [Matrix](math/matrix/index.rst)
-- [Utils](math/utils/index.rst)
-
-## parameter
-
-- [Parameter](parameter/parameter/index.rst)
-- [Update](parameter/update/index.rst)
-- [Optimizer](parameter/optimizer/index.rst)
-
-## pserver
-
-- [Client](pserver/client/index.rst)
-- [Network](pserver/network/index.rst)
-- [Server](pserver/server/index.rst)
-
-## trainer
-
-- [Trainer](trainer/trainer.rst)
-
-## api
-
-- [API](api/api.rst)
-
-## utils
-
-- [CustomStackTrace](utils/customStackTrace.rst)
-- [Enumeration wrapper](utils/enum.rst)
-- [Lock](utils/lock.rst)
-- [Queue](utils/queue.rst)
-- [Thread](utils/thread.rst)
diff --git a/doc/source/index.rst b/doc/source/index.rst
new file mode 100644
index 0000000000..36323c888e
--- /dev/null
+++ b/doc/source/index.rst
@@ -0,0 +1,14 @@
+Source Code Documents
+=====================
+
+.. toctree::
+ :maxdepth: 1
+
+ gserver/index.rst
+ trainer.rst
+ parameter/index.rst
+ pserver/index.rst
+ api.rst
+ cuda/index.rst
+ math/index.rst
+ utils/index.rst
diff --git a/doc/source/math/functions.rst b/doc/source/math/functions.rst
new file mode 100644
index 0000000000..aef12e0f00
--- /dev/null
+++ b/doc/source/math/functions.rst
@@ -0,0 +1,10 @@
+Functions
+=========
+
+MathFunctions
+-------------
+.. doxygenfile:: paddle/math/MathFunctions.h
+
+SIMDFunctions
+-------------
+.. doxygenfile:: paddle/math/SIMDFunctions.h
diff --git a/doc/source/math/index.rst b/doc/source/math/index.rst
new file mode 100644
index 0000000000..2ec16f2b44
--- /dev/null
+++ b/doc/source/math/index.rst
@@ -0,0 +1,10 @@
+Math
+====
+
+.. toctree::
+ :maxdepth: 2
+
+ vector.rst
+ matrix.rst
+ functions.rst
+ utils.rst
diff --git a/doc/source/math/matrix.rst b/doc/source/math/matrix.rst
new file mode 100644
index 0000000000..9bb20f618d
--- /dev/null
+++ b/doc/source/math/matrix.rst
@@ -0,0 +1,76 @@
+Matrix
+======
+
+Base
+----
+
+BaseMatrix Template
+```````````````````
+.. doxygenclass:: paddle::BaseMatrixT
+ :members:
+
+Matrix
+``````
+.. doxygenclass:: paddle::Matrix
+ :members:
+
+MatrixOffset
+````````````
+.. doxygenclass:: paddle::MatrixOffset
+ :members:
+
+CpuMatrix
+---------
+
+CpuMatrix
+`````````
+.. doxygenclass:: paddle::CpuMatrix
+ :members:
+
+SharedCpuMatrix
+```````````````
+.. doxygenclass:: paddle::SharedCpuMatrix
+ :members:
+
+GpuMatrix
+---------
+.. doxygenclass:: paddle::GpuMatrix
+ :members:
+
+CpuSparseMatrix
+---------------
+
+CpuSparseMatrix
+```````````````
+.. doxygenclass:: paddle::CpuSparseMatrix
+ :members:
+
+SparseRowCpuMatrix
+``````````````````
+.. doxygenclass:: paddle::SparseRowCpuMatrix
+ :members:
+
+SparseAutoGrowRowCpuMatrix
+``````````````````````````
+.. doxygenclass:: paddle::SparseAutoGrowRowCpuMatrix
+ :members:
+
+SparsePrefetchRowCpuMatrix
+``````````````````````````
+.. doxygenclass:: paddle::SparsePrefetchRowCpuMatrix
+ :members:
+
+SparseRowIdsCpuMatrix
+`````````````````````
+.. doxygenclass:: paddle::SparseRowIdsCpuMatrix
+ :members:
+
+CacheRowCpuMatrix
+`````````````````
+.. doxygenclass:: paddle::CacheRowCpuMatrix
+ :members:
+
+GpuSparseMatrix
+---------------
+.. doxygenclass:: paddle::GpuSparseMatrix
+ :members:
diff --git a/doc/source/math/matrix/index.rst b/doc/source/math/matrix/index.rst
deleted file mode 100644
index 68410f2a27..0000000000
--- a/doc/source/math/matrix/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Matrix Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- matrix.rst
diff --git a/doc/source/math/matrix/matrix.rst b/doc/source/math/matrix/matrix.rst
deleted file mode 100644
index b12e3934f4..0000000000
--- a/doc/source/math/matrix/matrix.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-Matrix
-=======
-
-Base
---------
-.. doxygenfile:: paddle/math/BaseMatrix.h
-
-Sparse Matrix
-----------------
-.. doxygenfile:: paddle/math/Matrix.h
-.. doxygenfile:: paddle/math/Vector.h
-.. doxygenfile:: paddle/math/MathUtils.h
-.. doxygenfile:: paddle/math/SparseMatrix.h
-.. doxygenfile:: paddle/math/SparseRowMatrix.h
-.. doxygenfile:: paddle/math/CpuSparseMatrix.h
-
-Others
-----------
-.. doxygenfile:: paddle/math/MathFunctions.h
-.. doxygenfile:: paddle/math/SIMDFunctions.h
diff --git a/doc/source/math/utils/utils.rst b/doc/source/math/utils.rst
similarity index 62%
rename from doc/source/math/utils/utils.rst
rename to doc/source/math/utils.rst
index 3df721a47b..55d9961a39 100644
--- a/doc/source/math/utils/utils.rst
+++ b/doc/source/math/utils.rst
@@ -1,9 +1,18 @@
-Utils
-=======
+Memory Manager
+==============
Memory Handle
---------------
+-------------
.. doxygenfile:: paddle/math/MemoryHandle.h
+
+Allocator
+---------
.. doxygenfile:: paddle/math/Allocator.h
+
+PoolAllocator
+`````````````
.. doxygenfile:: paddle/math/PoolAllocator.h
+
+Storage
+-------
.. doxygenfile:: paddle/math/Storage.h
diff --git a/doc/source/math/utils/index.rst b/doc/source/math/utils/index.rst
deleted file mode 100644
index e5fe335da2..0000000000
--- a/doc/source/math/utils/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Utils Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- utils.rst
diff --git a/doc/source/math/vector.rst b/doc/source/math/vector.rst
new file mode 100644
index 0000000000..07f7062aba
--- /dev/null
+++ b/doc/source/math/vector.rst
@@ -0,0 +1,37 @@
+Vector
+======
+
+BaseVector
+``````````
+.. doxygenclass:: paddle::BaseVector
+ :members:
+
+Vector Template
+```````````````
+.. doxygenclass:: paddle::VectorT
+ :members:
+
+CpuVector Template
+``````````````````
+.. doxygenclass:: paddle::CpuVectorT
+ :members:
+
+GpuVector Template
+``````````````````
+.. doxygenclass:: paddle::GpuVectorT
+ :members:
+
+ParallelCpuVector Template
+``````````````````````````
+.. doxygenclass:: paddle::ParallelCpuVectorT
+ :members:
+
+ParallelGpuVector Template
+``````````````````````````
+.. doxygenclass:: paddle::ParallelGpuVectorT
+ :members:
+
+CpuGpuVector Template
+`````````````````````
+.. doxygenclass:: paddle::CpuGpuVectorT
+ :members:
diff --git a/doc/source/parameter/index.rst b/doc/source/parameter/index.rst
new file mode 100644
index 0000000000..3bf6948dc3
--- /dev/null
+++ b/doc/source/parameter/index.rst
@@ -0,0 +1,9 @@
+Parameter
+=========
+
+.. toctree::
+ :maxdepth: 2
+
+ parameter.rst
+ optimizer.rst
+ updater.rst
diff --git a/doc/source/parameter/optimizer.rst b/doc/source/parameter/optimizer.rst
new file mode 100644
index 0000000000..b5b8b850b3
--- /dev/null
+++ b/doc/source/parameter/optimizer.rst
@@ -0,0 +1,22 @@
+Optimizer
+=========
+
+ParameterOptimizer
+------------------
+.. doxygenfile:: paddle/parameter/ParameterOptimizer.h
+
+Regularizer
+-----------
+.. doxygenfile:: paddle/parameter/Regularizer.h
+
+FirstOrderOptimizer
+-------------------
+.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h
+
+AverageOptimizer
+----------------
+.. doxygenfile:: paddle/parameter/AverageOptimizer.h
+
+OptimizerWithRegularizer
+------------------------
+.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h
diff --git a/doc/source/parameter/optimizer/index.rst b/doc/source/parameter/optimizer/index.rst
deleted file mode 100644
index 3338af5608..0000000000
--- a/doc/source/parameter/optimizer/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- optimizer.rst
diff --git a/doc/source/parameter/optimizer/optimizer.rst b/doc/source/parameter/optimizer/optimizer.rst
deleted file mode 100644
index 3d9e49217e..0000000000
--- a/doc/source/parameter/optimizer/optimizer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Optimizer
-============
-
-.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h
-.. doxygenfile:: paddle/parameter/AverageOptimizer.h
-.. doxygenfile:: paddle/parameter/ParameterOptimizer.h
-.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h
diff --git a/doc/source/parameter/parameter/parameter.rst b/doc/source/parameter/parameter.rst
similarity index 66%
rename from doc/source/parameter/parameter/parameter.rst
rename to doc/source/parameter/parameter.rst
index 2b7afdb409..2daa62d4e6 100644
--- a/doc/source/parameter/parameter/parameter.rst
+++ b/doc/source/parameter/parameter.rst
@@ -1,16 +1,12 @@
Parameter
-=============
-
-Weight
---------
-.. doxygenfile:: paddle/parameter/Weight.h
-
-Regularizer
-------------
-.. doxygenfile:: paddle/parameter/Regularizer.h
+=========
Parameter
--------------
+---------
.. doxygenfile:: paddle/parameter/Argument.h
.. doxygenfile:: paddle/parameter/Parameter.h
.. doxygenfile:: paddle/parameter/ParallelParameter.h
+
+Weight
+------
+.. doxygenfile:: paddle/parameter/Weight.h
diff --git a/doc/source/parameter/parameter/index.rst b/doc/source/parameter/parameter/index.rst
deleted file mode 100644
index e7ed70ec4c..0000000000
--- a/doc/source/parameter/parameter/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- parameter.rst
diff --git a/doc/source/parameter/update/index.rst b/doc/source/parameter/update/index.rst
deleted file mode 100644
index 1bbd733193..0000000000
--- a/doc/source/parameter/update/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- update.rst
diff --git a/doc/source/parameter/update/update.rst b/doc/source/parameter/updater.rst
similarity index 75%
rename from doc/source/parameter/update/update.rst
rename to doc/source/parameter/updater.rst
index c417602f03..dfa22e8e7d 100644
--- a/doc/source/parameter/update/update.rst
+++ b/doc/source/parameter/updater.rst
@@ -1,7 +1,14 @@
-Update
-==========
+Updater
+=======
+Base
+----
.. doxygenfile:: paddle/parameter/ParameterUpdaterBase.h
+
+Hook
+----
.. doxygenfile:: paddle/parameter/ParameterUpdaterHook.h
-.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h
+Functions
+---------
+.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h
diff --git a/doc/source/pserver/client.rst b/doc/source/pserver/client.rst
new file mode 100644
index 0000000000..e5bba0706a
--- /dev/null
+++ b/doc/source/pserver/client.rst
@@ -0,0 +1,12 @@
+Client
+======
+
+BaseClient
+----------
+.. doxygenclass:: paddle::BaseClient
+ :members:
+
+ParameterClient2
+----------------
+.. doxygenclass:: paddle::ParameterClient2
+ :members:
diff --git a/doc/source/pserver/client/client.rst b/doc/source/pserver/client/client.rst
deleted file mode 100644
index fc7ed90d3d..0000000000
--- a/doc/source/pserver/client/client.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Client
-=========
-
-.. doxygenclass:: paddle::BaseClient
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-.. doxygenclass:: paddle::ParameterClient2
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/pserver/client/index.rst b/doc/source/pserver/client/index.rst
deleted file mode 100644
index dc924c9ca8..0000000000
--- a/doc/source/pserver/client/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Client Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- client.rst
diff --git a/doc/source/pserver/index.rst b/doc/source/pserver/index.rst
new file mode 100644
index 0000000000..0031e9476b
--- /dev/null
+++ b/doc/source/pserver/index.rst
@@ -0,0 +1,10 @@
+PServer
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ client.rst
+ network.rst
+ server.rst
+ utils.rst
diff --git a/doc/source/pserver/network.rst b/doc/source/pserver/network.rst
new file mode 100644
index 0000000000..7004c9d91f
--- /dev/null
+++ b/doc/source/pserver/network.rst
@@ -0,0 +1,27 @@
+Network
+=======
+
+SocketServer
+------------
+.. doxygenclass:: paddle::SocketServer
+ :members:
+
+SocketWorker
+------------
+.. doxygenclass:: paddle::SocketWorker
+ :members:
+
+SocketClient
+------------
+.. doxygenclass:: paddle::SocketClient
+ :members:
+
+SocketChannel
+-------------
+.. doxygenclass:: paddle::SocketChannel
+ :members:
+
+MessageReader
+-------------
+.. doxygenclass:: paddle::MsgReader
+ :members:
diff --git a/doc/source/pserver/network/index.rst b/doc/source/pserver/network/index.rst
deleted file mode 100644
index 2fdf95e17d..0000000000
--- a/doc/source/pserver/network/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Network Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- network.rst
diff --git a/doc/source/pserver/network/network.rst b/doc/source/pserver/network/network.rst
deleted file mode 100644
index e000ff8dbb..0000000000
--- a/doc/source/pserver/network/network.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-Network
-==========
-
-Socket Server
-----------------
-.. doxygenclass:: paddle::SocketServer
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Worker
-----------------
-.. doxygenclass:: paddle::SocketWorker
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Client
-----------------
-.. doxygenclass:: paddle::SocketClient
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Channel
----------------
-.. doxygenclass:: paddle::SocketChannel
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Message Reader
----------------
-.. doxygenclass:: paddle::MsgReader
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/pserver/server.rst b/doc/source/pserver/server.rst
new file mode 100644
index 0000000000..35301acf8f
--- /dev/null
+++ b/doc/source/pserver/server.rst
@@ -0,0 +1,12 @@
+Server
+======
+
+ProtoServer
+-----------
+.. doxygenclass:: paddle::ProtoServer
+ :members:
+
+ParameterServer2
+----------------
+.. doxygenclass:: paddle::ParameterServer2
+ :members:
diff --git a/doc/source/pserver/server/index.rst b/doc/source/pserver/server/index.rst
deleted file mode 100644
index 09e3530bfe..0000000000
--- a/doc/source/pserver/server/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Server Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- server.rst
diff --git a/doc/source/pserver/server/server.rst b/doc/source/pserver/server/server.rst
deleted file mode 100644
index f3110fdd73..0000000000
--- a/doc/source/pserver/server/server.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Server
-==========
-
-.. doxygenclass:: paddle::ProtoServer
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-.. doxygenclass:: paddle::ParameterServer2
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/trainer/trainer.rst b/doc/source/trainer.rst
similarity index 94%
rename from doc/source/trainer/trainer.rst
rename to doc/source/trainer.rst
index 12c24597e7..85f1feb4fc 100644
--- a/doc/source/trainer/trainer.rst
+++ b/doc/source/trainer.rst
@@ -14,7 +14,7 @@ RemoteParameterUpdater
:members:
ConcurrentRemoteParameterUpdater
----------------------------------
+--------------------------------
.. doxygenclass:: paddle::ConcurrentRemoteParameterUpdater
:members:
diff --git a/doc/source/utils/customStackTrace.rst b/doc/source/utils/customStackTrace.rst
index a4e6f05a40..cdc8930739 100644
--- a/doc/source/utils/customStackTrace.rst
+++ b/doc/source/utils/customStackTrace.rst
@@ -1,9 +1,4 @@
CustomStackTrace
================
-
-
-class CustomStackTrace
-----------------------
-
.. doxygenclass:: paddle::CustomStackTrace
:members:
diff --git a/doc/source/utils/enum.rst b/doc/source/utils/enum.rst
index 17166d35f7..e0da75afe1 100644
--- a/doc/source/utils/enum.rst
+++ b/doc/source/utils/enum.rst
@@ -1,9 +1,3 @@
-enumeration_wrapper
+Enumeration wrapper
===================
-
-
-namespace paddle::enumeration_wrapper
--------------------------------------
-
.. doxygennamespace:: paddle::enumeration_wrapper
-
diff --git a/doc/source/utils/index.rst b/doc/source/utils/index.rst
new file mode 100644
index 0000000000..7ddc47d172
--- /dev/null
+++ b/doc/source/utils/index.rst
@@ -0,0 +1,11 @@
+Utils
+=====
+
+.. toctree::
+ :maxdepth: 2
+
+ lock.rst
+ queue.rst
+ thread.rst
+ customStackTrace.rst
+ enum.rst
diff --git a/doc/source/utils/lock.rst b/doc/source/utils/lock.rst
index 0b027e403f..f011acb943 100644
--- a/doc/source/utils/lock.rst
+++ b/doc/source/utils/lock.rst
@@ -1,37 +1,32 @@
-Thread
-======
+Lock
+====
-
-class Thread
-------------
-
-.. doxygenclass:: paddle::Thread
+RWLock
+------
+.. doxygenclass:: paddle::RWLock
:members:
-
-class ThreadWorker
-------------------
-
-.. doxygenclass:: paddle::ThreadWorker
+ReadLockGuard
+-------------
+.. doxygenclass:: paddle::ReadLockGuard
:members:
-
-class SyncThreadPool
---------------------
-
-.. doxygenclass:: paddle::SyncThreadPool
+SpinLock
+--------
+.. doxygenclass:: paddle::SpinLock
:members:
-
-
-class MultiThreadWorker
------------------------
-.. doxygenclass:: paddle::MultiThreadWorker
+Semaphore
+---------
+.. doxygenclass:: paddle::Semaphore
:members:
-
-class AsyncThreadPool
----------------------
+ThreadBarrier
+-------------
+.. doxygenclass:: paddle::ThreadBarrier
+ :members:
-.. doxygenclass:: paddle::AsyncThreadPool
+LockedCondition
+---------------
+.. doxygenclass:: paddle::LockedCondition
:members:
diff --git a/doc/source/utils/queue.rst b/doc/source/utils/queue.rst
index 72a464ca67..98192648e2 100644
--- a/doc/source/utils/queue.rst
+++ b/doc/source/utils/queue.rst
@@ -1,16 +1,12 @@
Queue
=====
-
-class Queue
-------------
-
+Queue
+-----
.. doxygenclass:: paddle::Queue
:members:
-
-class BlockingQueue
--------------------
-
+BlockingQueue
+-------------
.. doxygenclass:: paddle::BlockingQueue
:members:
diff --git a/doc/source/utils/thread.rst b/doc/source/utils/thread.rst
index 2eb67dde6a..23d379a989 100644
--- a/doc/source/utils/thread.rst
+++ b/doc/source/utils/thread.rst
@@ -1,40 +1,27 @@
-Lock
-====
+Thread
+======
-
-class RWLock
-------------
-
-.. doxygenclass:: paddle::RWLock
+Thread
+------
+.. doxygenclass:: paddle::Thread
:members:
-class ReadLockGuard
--------------------
-
-.. doxygenclass:: paddle::ReadLockGuard
+ThreadWorker
+------------
+.. doxygenclass:: paddle::ThreadWorker
:members:
-class SpinLock
+SyncThreadPool
--------------
-
-.. doxygenclass:: paddle::SpinLock
+.. doxygenclass:: paddle::SyncThreadPool
:members:
-
-class Semaphore
----------------
-
-.. doxygenclass:: paddle::Semaphore
- :members:
-
-class ThreadBarrier
--------------------
-
-.. doxygenclass:: paddle::ThreadBarrier
+
+MultiThreadWorker
+-----------------
+.. doxygenclass:: paddle::MultiThreadWorker
:members:
-class LockedCondition
----------------------
-
-.. doxygenclass:: paddle::LockedCondition
+AsyncThreadPool
+---------------
+.. doxygenclass:: paddle::AsyncThreadPool
:members:
-
diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc_cn/build_and_install/cmake/cblas_settings.csv
index d804c0a662..a6356baf16 100644
--- a/doc_cn/build_and_install/cmake/cblas_settings.csv
+++ b/doc_cn/build_and_install/cmake/cblas_settings.csv
@@ -1,4 +1,5 @@
-MKL_ROOT,"Path to MKL. ${MKL_ROOT}/include must contain mkl.h, and ${MKL_ROOT}/lib must contain the mkl_core, mkl_sequential and mkl_intel_lp64 libraries."
-ATLAS_ROOT,"Path to the ATLAS library. ${ATLAS_ROOT}/include must contain cblas.h, and ${ATLAS_ROOT}/lib must contain the cblas and atlas libraries."
-OPENBLAS_ROOT,"${OPENBLAS_ROOT}/include must contain cblas.h, and ${OPENBLAS_ROOT}/lib must contain the openblas library."
-REFERENCE_CBLAS_ROOT,"${REFERENCE_CBLAS_ROOT}/include must contain cblas.h, and ${REFERENCE_CBLAS_ROOT}/lib must contain the cblas library."
\ No newline at end of file
+Option,Description,Notes
+MKL_ROOT,Path to MKL,"${MKL_ROOT}/include must contain mkl.h; ${MKL_ROOT}/lib must contain the mkl_core, mkl_sequential and mkl_intel_lp64 libraries."
+ATLAS_ROOT,Path to ATLAS,"${ATLAS_ROOT}/include must contain cblas.h; ${ATLAS_ROOT}/lib must contain the cblas and atlas libraries."
+OPENBLAS_ROOT,Path to OpenBLAS,"${OPENBLAS_ROOT}/include must contain cblas.h; ${OPENBLAS_ROOT}/lib must contain the openblas library."
+REFERENCE_CBLAS_ROOT,Path to REFERENCE BLAS,"${REFERENCE_CBLAS_ROOT}/include must contain cblas.h; ${REFERENCE_CBLAS_ROOT}/lib must contain the cblas library."
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc_cn/build_and_install/cmake/compile_options.csv
index 0b8015aaee..12b45eebb2 100644
--- a/doc_cn/build_and_install/cmake/compile_options.csv
+++ b/doc_cn/build_and_install/cmake/compile_options.csv
@@ -1,15 +1,14 @@
-Option,Description,Default
-WITH_GPU,"Whether to build with GPU support.",Depends on whether the CUDA toolchain is found
-WITH_DOUBLE,"Whether to use double-precision floating point.",No
-WITH_DSO,"Whether to load the CUDA libraries dynamically at runtime instead of linking them statically.",Yes
-WITH_AVX,"Whether to build PaddlePaddle binaries with the AVX instruction set.",Yes
-WITH_PYTHON,"Whether to embed a Python interpreter. Convenient for embedded use.",Yes
-WITH_STYLE_CHECK,"Whether to run code style checks during compilation.",Yes
-WITH_RDMA,"Whether to enable RDMA support.",No
-WITH_GLOG,"Whether to use GLOG. If not, a simplified logging implementation is used. Convenient for embedded use.",Depends on whether GLOG is found
-WITH_GFLAGS,"Whether to use GFLAGS. If not, a simplified command-line parser is used. Convenient for embedded use.",Depends on whether GFLAGS is found
-WITH_TIMER,"Whether to enable timing. Timing makes runs slightly slower and logs more verbose, but helps debugging and benchmarking.",No
-WITH_TESTING,"Whether to enable unit tests.",Depends on whether gtest is found
-WITH_DOC,"Whether to build the English documentation.",No
-WITH_DOC_CN,"Whether to build the Chinese documentation.",No
-WITH_SWIG_PY,"Whether to build the Python SWIG interface, which eases prediction and customized training.",Depends on whether swig is found
+Option,Description,Default
+WITH_GPU,"Whether to support GPU.",Depends on whether the CUDA toolchain is found
+WITH_DOUBLE,"Whether to use double-precision floating point.",No
+WITH_DSO,"Whether to load the CUDA libraries dynamically at runtime instead of linking them statically.",Yes
+WITH_AVX,"Whether to build PaddlePaddle binaries with the AVX instruction set.",Yes
+WITH_PYTHON,"Whether to embed a Python interpreter, which helps future porting to embedded devices.",Yes
+WITH_STYLE_CHECK,"Whether to run code style checks during compilation.",Yes
+WITH_RDMA,"Whether to enable RDMA.",No
+WITH_GLOG,"Whether to enable GLOG. If not, a simplified logging implementation is used, which also helps future porting to embedded devices.",Depends on whether GLOG is found
+WITH_GFLAGS,"Whether to use GFLAGS. If not, a simplified command-line parser is used, which also helps future porting to embedded devices.",Depends on whether GFLAGS is found
+WITH_TIMER,"Whether to enable timing. Timing makes runs slightly slower and logs more verbose, but helps debugging and benchmarking.",No
+WITH_TESTING,"Whether to enable unit tests.",Depends on whether GTEST is found
+WITH_DOC,"Whether to build the English and Chinese documentation.",No
+WITH_SWIG_PY,"Whether to build the Python SWIG interface, which can be used for prediction and customized training.",Depends on whether SWIG is found
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc_cn/build_and_install/cmake/compile_options.rst
index bb5b18a073..f345ead2bf 100644
--- a/doc_cn/build_and_install/cmake/compile_options.rst
+++ b/doc_cn/build_and_install/cmake/compile_options.rst
@@ -1,62 +1,43 @@
-Setting PaddlePaddle's Compile Options
-======================================
-
-PaddlePaddle's compile options can be set when invoking cmake. cmake is a cross-platform build tool; invoking
-cmake turns the cmake project files into makefiles for each platform. For detailed cmake usage, refer to
-`the official cmake documentation `_ .
-
-PaddlePaddle's compile options control whether CPU/GPU binaries are generated, which blas library is linked, and so on. The full
-list of compile options is as follows
-
-PaddlePaddle's compile options
-------------------------------
-
-Bool compile options
-++++++++++++++++++++
-These options can be set on the cmake command line with the -D flag. For example
-:code:`cmake -D WITH_GPU=OFF`
-
-.. csv-table:: PaddlePaddle's bool compile options
- :widths: 1, 7, 2
- :file: compile_options.csv
-
-BLAS-related compile options
-++++++++++++++++++++++++++++
-
-PaddlePaddle can use `MKL `_ ,
-`Atlas `_ ,
-`OpenBlas `_ or
-`reference Blas `_ , any one of these cblas implementations.
-The blas library to link is selected by specifying its path at compile time.
-
-When compiling, cmake first searches the system paths (/usr/lib\:/usr/local/lib) for these blas implementations, and
-also reads the related path variables to search. The path variables are\:
-
-
-.. csv-table:: PaddlePaddle's cblas compile options
- :widths: 1, 9
- :header: "Option", "Description"
- :file: cblas_settings.csv
-
-All of these variables can be specified with the -D flag, e.g. :code:`cmake -D MKL_ROOT=/opt/mkl/`. They can
-also be set as environment variables before invoking cmake. For example
-
-.. code-block:: bash
-
- export MKL_ROOT=/opt/mkl
- cmake
-
-Note that these variables only take effect on the first run of cmake. To reset them
-afterwards, it is recommended to clean ( :code:`rm -rf` ) the build directory and then specify them again.
-
-cuda/cudnn-related compile options
-++++++++++++++++++++++++++++++++++
-
-PaddlePaddle can be compiled and run with any cudnn version from cudnn v2 onwards, but note that the cudnn used for
-compiling and running should be the same version. The latest cudnn v5.1 is recommended.
-
-When configuring cmake, :code:`CUDNN_ROOT` sets the CUDNN installation path. The flag is again
--D, e.g. :code:`cmake -D CUDNN_ROOT=/opt/cudnnv5` .
-
-Note that these variables only take effect on the first run of cmake. To reset them
-afterwards, it is recommended to clean ( :code:`rm -rf` ) the build directory and then specify them again.
+PaddlePaddle's Compile Options
+==============================
+
+PaddlePaddle's compile options include whether to generate CPU/GPU binaries, which BLAS library to link, and so on. Users can set them when invoking cmake; for detailed cmake usage please refer to the `official documentation `_ .
+
+Bool compile options
+--------------------
+Users can set this type of option on the cmake command line with the ``-D`` flag, for example
+
+.. code-block:: bash
+
+ cmake .. -DWITH_GPU=OFF
+
+.. csv-table:: Bool compile options
+ :widths: 1, 7, 2
+ :file: compile_options.csv
+
+BLAS/CUDA/Cudnn compile options
+-------------------------------
+BLAS
++++++
+
+PaddlePaddle supports any one of the following BLAS libraries: `MKL `_ , `ATLAS `_ , `OpenBLAS `_ and `REFERENCE BLAS `_ .
+
+.. csv-table:: BLAS path-related compile options
+ :widths: 1, 2, 7
+ :file: cblas_settings.csv
+
+CUDA/Cudnn
++++++++++++
+
+PaddlePaddle can be compiled and run with any cudnn version from cudnn v2 onwards, but please keep the cudnn used for compiling and running the same version. We recommend the latest cudnn v5.1.
+
+Setting the compile options
++++++++++++++++++++++++++++
+
+PaddlePaddle locates the various BLAS/CUDA/Cudnn libraries through paths specified at compile time. When cmake runs, it first searches the system paths (/usr/lib\:/usr/local/lib) for these libraries, and also reads the related path variables to search. They can be set with the ``-D`` flag, for example
+
+.. code-block:: bash
+
+ cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
+
+Note: these options only take effect on the first run of cmake. To reset them afterwards, it is recommended to clean the whole build directory (``rm -rf``) and then specify them again.
\ No newline at end of file
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 6f51d55120..b539374cd4 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -112,7 +111,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx,
}
void Arguments::setSlotSubSequenceStartPositions(
- size_t idx, IVector *vec) throw(RangeError) {
+ size_t idx, IVector* vec) throw(RangeError) {
auto& a = m->getArg(idx);
auto& v = m->cast(vec->getSharedPtr());
a.subSequenceStartPositions = std::make_shared(v);
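Note: the whitespace-only hunks in these API files all follow one mechanical rule: once a declaration or call no longer fits on one line, every argument moves onto its own line. A minimal illustration of that style, assuming a typical 80-column clang-format setup with parameter bin-packing disabled (the actual format configuration is not part of this hunk):

    #include <cstddef>
    #include <vector>

    // Fits within the column limit: left on a single line.
    void shortCall(int a, int b);

    // Exceeds the column limit: each parameter gets its own line, aligned
    // under the first one, which is the shape seen throughout this diff.
    void veryLongFunctionNameForDemonstration(const std::vector<float>& data,
                                              std::size_t height,
                                              std::size_t width,
                                              bool useGpu);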
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
index 25d94f5a6a..bc40d871d1 100644
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/Trainer.h"
@@ -44,8 +43,7 @@ TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
return retv;
}
-TrainerConfig* TrainerConfig::createFromProtoString(
- const std::string& str) {
+TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
auto retv = new TrainerConfig();
paddle::TrainerConfig trainerConfigProto;
auto conf = std::make_shared(trainerConfigProto);
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index bef499c678..9a4846d809 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -27,7 +26,8 @@ GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
GradientMachine::~GradientMachine() { delete m; }
GradientMachine* GradientMachine::createFromPaddleModelPtr(
- const void* confPtr, GradientMatchineCreateMode mode,
+ const void* confPtr,
+ GradientMatchineCreateMode mode,
 const std::vector<int>& types) {
auto& conf = *(const paddle::ModelConfig*)(confPtr);
std::vector realTypes;
@@ -44,7 +44,8 @@ GradientMachine* GradientMachine::createFromPaddleModelPtr(
}
GradientMachine* GradientMachine::createByConfigProtoStr(
- const std::string& protoStr, GradientMatchineCreateMode mode,
+ const std::string& protoStr,
+ GradientMatchineCreateMode mode,
 const std::vector<int>& types) {
paddle::ModelConfig conf;
conf.ParseFromString(protoStr);
@@ -56,13 +57,15 @@ GradientMachine* GradientMachine::createByConfigProtoStr(
}
GradientMachine* GradientMachine::createByModelConfig(
- ModelConfig* conf, GradientMatchineCreateMode mode,
+ ModelConfig* conf,
+ GradientMatchineCreateMode mode,
 const std::vector<int>& types) {
auto confPtr = &conf->m->conf->getModelConfig();
return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
}
-void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs,
+void GradientMachine::forward(const Arguments& inArgs,
+ Arguments* outArgs,
PassType passType) {
auto& in =
 m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
@@ -99,7 +102,8 @@ void GradientMachine::backward(const UpdateCallback& callback) {
}
void GradientMachine::forwardBackward(const Arguments& inArgs,
- Arguments* outArgs, PassType passType,
+ Arguments* outArgs,
+ PassType passType,
const UpdateCallback& callback) {
auto& in =
 m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
@@ -129,7 +133,7 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
void GradientMachine::randParameters() { m->machine->randParameters(); }
Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
- throw(UnsupportError) {
+ throw(UnsupportError) {
 auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(m->machine);
if (nn) {
auto mat = nn->getLayerOutput(layerName);
@@ -140,8 +144,11 @@ Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
}
SequenceGenerator* GradientMachine::asSequenceGenerator(
- const std::vector<std::string>& dict, size_t begin_id, size_t end_id,
- size_t max_length, size_t beam_size) {
+ const std::vector<std::string>& dict,
+ size_t begin_id,
+ size_t end_id,
+ size_t max_length,
+ size_t beam_size) {
SequenceGenerator* r =
SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
r->setDict(dict);
diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h
index b990f650be..66a13bc603 100644
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "PaddleAPI.h"
@@ -23,7 +22,8 @@ limitations under the License. */
template <typename T2, typename T1>
void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
 dest->resize(src.size());
- std::transform(src.begin(), src.end(), dest->begin(), [](T1 t){
- return static_cast<T2>(t);
- });
+ std::transform(src.begin(),
+ src.end(),
+ dest->begin(),
+ [](T1 t) { return static_cast<T2>(t); });
}
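The helper above behaves identically before and after the reflow. A self-contained sketch of how it might be exercised; the main driver and the int-to-size_t conversion are illustrative, not part of the patch:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    template <typename T2, typename T1>
    void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
      dest->resize(src.size());
      std::transform(src.begin(),
                     src.end(),
                     dest->begin(),
                     [](T1 t) { return static_cast<T2>(t); });
    }

    int main() {
      std::vector<int> ids = {1, 2, 3};
      std::vector<size_t> casted;
      staticCastVector(&casted, ids);  // element-wise static_cast<size_t>(int)
      printf("%zu\n", casted[1]);      // prints 2
      return 0;
    }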
diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
index e5493a381a..f257ee65aa 100644
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
@@ -44,17 +43,21 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
return m;
}
-Matrix* Matrix::createDense(const std::vector<float>& data, size_t height,
- size_t width, bool useGpu) {
+Matrix* Matrix::createDense(const std::vector<float>& data,
+ size_t height,
+ size_t width,
+ bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::create(height, width, useGpu);
m->m->mat->copyFrom(data.data(), data.size());
return m;
}
-Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
- bool copy, bool useGpu)
- throw (UnsupportError) {
+Matrix* Matrix::createDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// Gpu mode only supports copy=True
if (!copy) {
@@ -66,7 +69,9 @@ Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
}
}
-Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+Matrix* Matrix::createCpuDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
bool copy) {
auto m = new Matrix();
if (copy) {
@@ -85,12 +90,20 @@ Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
return m;
}
-Matrix* Matrix::createSparse(size_t height, size_t width, size_t nnz,
- bool isNonVal, bool isTrans, bool useGpu) {
+Matrix* Matrix::createSparse(size_t height,
+ size_t width,
+ size_t nnz,
+ bool isNonVal,
+ bool isTrans,
+ bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::createSparseMatrix(
- height, width, nnz, isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
- isTrans, useGpu);
+ height,
+ width,
+ nnz,
+ isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+ isTrans,
+ useGpu);
return m;
}
@@ -221,7 +234,8 @@ FloatArray Matrix::getData() const {
}
void Matrix::sparseCopyFrom(
- const std::vector<int>& rows, const std::vector<int>& cols,
+ const std::vector<int>& rows,
+ const std::vector<int>& cols,
 const std::vector<float>& vals) throw(UnsupportError) {
auto cpuSparseMat =
 std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
@@ -240,7 +254,8 @@ void Matrix::sparseCopyFrom(
void* Matrix::getSharedPtr() const { return &m->mat; }
-void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
+void Matrix::toNumpyMatInplace(float** view_data,
+ int* dim1,
int* dim2) throw(UnsupportError) {
 auto cpuMat = std::dynamic_pointer_cast<paddle::CpuMatrix>(m->mat);
if (cpuMat) {
@@ -251,7 +266,8 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
throw UnsupportError();
}
}
-void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
+void Matrix::copyToNumpyMat(float** view_m_data,
+ int* dim1,
int* dim2) throw(UnsupportError) {
static_assert(sizeof(paddle::real) == sizeof(float),
"Currently PaddleAPI only support for single "
@@ -269,8 +285,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
 } else if (auto gpuMat = dynamic_cast<paddle::GpuMatrix*>(m->mat.get())) {
auto src = gpuMat->getData();
auto dest = *view_m_data;
- hl_memcpy_device2host(dest, src,
- sizeof(paddle::real) * (*dim1) * (*dim2));
+ hl_memcpy_device2host(
+ dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
} else {
LOG(WARNING) << "Unexpected Situation";
throw UnsupportError();
@@ -278,7 +294,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
}
}
-void Matrix::copyFromNumpyMat(float* data, int dim1,
+void Matrix::copyFromNumpyMat(float* data,
+ int dim1,
int dim2) throw(UnsupportError, RangeError) {
if (isSparse()) {
throw UnsupportError();
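A rough sketch of driving the factory reformatted above from C++. The FloatArray field names buf and length are assumed from the FloatArray constructor that appears later in this patch (Util.cpp); treat this as illustrative only:

    #include "PaddleAPI.h"

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<float> data = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
      // 2x3 dense matrix on the CPU; the values are copied in.
      Matrix* mat = Matrix::createDense(data, /*height=*/2, /*width=*/3,
                                        /*useGpu=*/false);
      FloatArray arr = mat->getData();  // flat view of the matrix contents
      printf("%zu elements, first = %f\n", arr.length, arr.buf[0]);
      return 0;
    }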
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 5688ece44d..c07facdb12 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -61,8 +60,8 @@ class RangeError {};
/// Not support Error, such as access GPU memory directly, etc.
class UnsupportError : public std::runtime_error {
public:
- UnsupportError() : std::runtime_error(" ") {};
- UnsupportError(const std::string& message) : std::runtime_error(message) {};
+ UnsupportError() : std::runtime_error(" "){};
+ UnsupportError(const std::string& message) : std::runtime_error(message){};
};
/// This type will map to python's list of float.
@@ -112,7 +111,8 @@ public:
/**
* Create A Matrix with height,width, which is filled by zero.
*/
- static Matrix* createZero(size_t height, size_t width,
+ static Matrix* createZero(size_t height,
+ size_t width,
bool useGpu = isUsingGpu());
/**
@@ -124,8 +124,11 @@ public:
*
* @note the default sparse type is SPARSE_CSR.
*/
- static Matrix* createSparse(size_t height, size_t width, size_t nnz,
- bool isNonVal = true, bool trans = false,
+ static Matrix* createSparse(size_t height,
+ size_t width,
+ size_t nnz,
+ bool isNonVal = true,
+ bool trans = false,
bool useGpu = isUsingGpu());
/**
@@ -134,13 +137,17 @@ public:
* @param data list of float should be passed in python.
* @note the value will be copy into a new matrix.
*/
- static Matrix* createDense(const std::vector<float>& data, size_t height,
- size_t width, bool useGpu = isUsingGpu());
-
- static Matrix* createDenseFromNumpy(float* data, int dim1, int dim2,
- bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static Matrix* createDense(const std::vector<float>& data,
+ size_t height,
+ size_t width,
+ bool useGpu = isUsingGpu());
+
+ static Matrix* createDenseFromNumpy(
+ float* data,
+ int dim1,
+ int dim2,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Dense Matrix from numpy matrix, dtype=float32
@@ -151,7 +158,9 @@ public:
* @param copy true if copy into a new matrix, false will create
* matrix inplace.
*/
- static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+ static Matrix* createCpuDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
bool copy = false);
/// Create Gpu Dense Matrix from numpy matrix, dtype=float32
@@ -171,11 +180,13 @@ public:
* numpy_mat = m.toNumpyMat()
* @endcode
*/
- void toNumpyMatInplace(float** view_data, int* dim1,
+ void toNumpyMatInplace(float** view_data,
+ int* dim1,
int* dim2) throw(UnsupportError);
/// Copy To numpy mat.
- void copyToNumpyMat(float** view_m_data, int* dim1,
+ void copyToNumpyMat(float** view_m_data,
+ int* dim1,
int* dim2) throw(UnsupportError);
/// Copy From Numpy Mat
@@ -248,15 +259,18 @@ public:
 static Vector* create(const std::vector<float>& data,
bool useGpu = isUsingGpu());
- static Vector* createVectorFromNumpy(float* data, int dim, bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static Vector* createVectorFromNumpy(
+ float* data,
+ int dim,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Vector from numpy array, which dtype=float32
*
* If copy is false, it will create vector inplace.
*/
- static Vector* createCpuVectorFromNumpy(float* data, int dim,
+ static Vector* createCpuVectorFromNumpy(float* data,
+ int dim,
bool copy = false);
/// Create Gpu Vector from numpy array, which dtype=float32
@@ -312,16 +326,19 @@ public:
 static IVector* create(const std::vector<int>& data,
bool useGpu = isUsingGpu());
- static IVector* createVectorFromNumpy(int* data, int dim, bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static IVector* createVectorFromNumpy(
+ int* data,
+ int dim,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu IVector from numpy array, which dtype=int32
*
* If copy is false, it will create vector inplace
*/
- static IVector* createCpuVectorFromNumpy(int* data, int dim,
+ static IVector* createCpuVectorFromNumpy(int* data,
+ int dim,
bool copy = false);
/**
* Create Gpu IVector from numpy array, which dtype=int32
@@ -605,7 +622,8 @@ class ParameterTraverseCallback {
public:
~ParameterTraverseCallback();
- void apply(const std::vector<Vector*>& vecs, const ParameterConfig& config,
+ void apply(const std::vector<Vector*>& vecs,
+ const ParameterConfig& config,
size_t sparseId);
private:
@@ -638,7 +656,8 @@ public:
void finishBatch();
- void update(const std::vector<Vector*>& vecs, const ParameterConfig& conf,
+ void update(const std::vector<Vector*>& vecs,
+ const ParameterConfig& conf,
size_t sparseId = NO_SPARSE_ID);
 std::vector<int> getParameterTypes() const;
@@ -678,7 +697,8 @@ public:
* model config by TrainerConfig
*/
static GradientMachine* createByModelConfig(
- ModelConfig* conf, GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
+ ModelConfig* conf,
+ GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
 const std::vector<int>& parameterTypes = defaultParamTypes);
/**
@@ -701,7 +721,8 @@ public:
/**
* Combine forward/backward
*/
- void forwardBackward(const Arguments& inArgs, Arguments* outArgs,
+ void forwardBackward(const Arguments& inArgs,
+ Arguments* outArgs,
PassType passType,
const UpdateCallback& callback = UpdateCallback());
@@ -722,14 +743,17 @@ public:
*/
SequenceGenerator* asSequenceGenerator(
 const std::vector<std::string>& dict = std::vector<std::string>(),
- size_t begin_id = 0UL, size_t end_id = 0UL, size_t max_length = 100UL,
+ size_t begin_id = 0UL,
+ size_t end_id = 0UL,
+ size_t max_length = 100UL,
size_t beam_size = -1UL);
private:
GradientMachinePrivate* m;
static GradientMachine* createFromPaddleModelPtr(
- const void* confPtr, GradientMatchineCreateMode mode,
+ const void* confPtr,
+ GradientMatchineCreateMode mode,
 const std::vector<int>& types);
// Not to use c++ 11 init-list, so we use static var as function default arg.
@@ -751,8 +775,8 @@ public:
/// Create A Trainer By TrainerConfig. using paddle command line.
static Trainer* createByCommandLine() throw(IOError);
- static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm)
- throw(IOError);
+ static Trainer* create(TrainerConfig* optConfig,
+ GradientMachine* gm) throw(IOError);
/// Start training
void startTrain();
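The throw(...) specifiers reflowed in this header document the API's error contract. A sketch of the GPU-only copy rule (the Matrix.cpp hunk earlier shows copy=false together with useGpu=true being rejected):

    #include "PaddleAPI.h"

    #include <iostream>

    int main() {
      float buf[4] = {0.f, 1.f, 2.f, 3.f};
      try {
        // GPU mode only supports copy=true, so this call is expected to throw.
        Matrix* m = Matrix::createDenseFromNumpy(buf, 2, 2, /*copy=*/false,
                                                 /*useGpu=*/true);
        (void)m;
      } catch (const UnsupportError& e) {
        std::cerr << "unsupported: " << e.what() << std::endl;
      }
      return 0;
    }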
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index 8b56adc97c..c5876bb1c7 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/parameter/Parameter.h"
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index b13761ab09..21d031e4bc 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/ParameterOptimizer.h"
@@ -32,17 +31,21 @@ struct ParameterTraverseCallbackPrivate {
const paddle::ParameterOptimizer::TraverseCallback& callback)
: callback(callback) {}
- void apply(const std::vector<Vector*>& vecs, const ParameterConfig& conf,
+ void apply(const std::vector<Vector*>& vecs,
+ const ParameterConfig& conf,
 size_t sparseId) {
 std::vector<paddle::VectorPtr> real_vecs;
real_vecs.resize(vecs.size());
- std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
- if (v) {
- return *(paddle::VectorPtr*)(v->getSharedPtr());
- } else {
- return paddle::VectorPtr();
- }
- });
+ std::transform(vecs.begin(),
+ vecs.end(),
+ real_vecs.begin(),
+ [](Vector* v) {
+ if (v) {
+ return *(paddle::VectorPtr*)(v->getSharedPtr());
+ } else {
+ return paddle::VectorPtr();
+ }
+ });
paddle::ParameterConfig& real_conf =
 *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
@@ -86,10 +89,12 @@ void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
 void ParameterOptimizer::update(const std::vector<Vector*>& vecs,
- const ParameterConfig& conf, size_t sparseId) {
- ParameterTraverseCallbackPrivate invoker([&](
- const paddle::VectorPtr _vecs[], const paddle::ParameterConfig& config,
- size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
+ const ParameterConfig& conf,
+ size_t sparseId) {
+ ParameterTraverseCallbackPrivate invoker(
+ [&](const paddle::VectorPtr _vecs[],
+ const paddle::ParameterConfig& config,
+ size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
invoker.apply(vecs, conf, sparseId);
}
@@ -116,8 +121,9 @@ void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
const ParameterConfig& config) const {
- auto& param_config = *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(
- config).getRawPtr();
+ auto& param_config =
+ *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(config)
+ .getRawPtr();
auto callback = m->optimizer->needSpecialTraversal(param_config);
if (callback) {
auto retCallback = new ParameterTraverseCallback();
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 9d353ccc8e..d51be78d45 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "paddle/parameter/Argument.h"
@@ -42,8 +41,10 @@ struct Path {
// position
static void findNBest(paddle::GradientMachine* gradMachine,
 std::vector<paddle::Argument>& inArgs,
- std::vector<Path>& finalPaths, size_t bos_id,
- size_t eos_id, size_t max_length) {
+ std::vector<Path>& finalPaths,
+ size_t bos_id,
+ size_t eos_id,
+ size_t max_length) {
 std::vector<Path> paths;
Path emptyPath;
paths.push_back(emptyPath);
@@ -166,7 +167,8 @@ public:
if (id < getSize()) {
Path& p = (*path_)[id];
std::ostringstream sout;
- std::transform(p.ids.begin(), p.ids.end(),
+ std::transform(p.ids.begin(),
+ p.ids.end(),
 std::ostream_iterator<std::string>(sout, split ? " " : ""),
[&](int id) { return (*dict_)[id]; });
return sout.str();
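The reflowed std::transform above joins generated word ids into a sentence string. The same idiom in isolation, with an invented dictionary:

    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> dict = {"<s>", "hello", "world", "</s>"};
      std::vector<int> ids = {0, 1, 2, 3};
      std::ostringstream sout;
      std::transform(ids.begin(),
                     ids.end(),
                     std::ostream_iterator<std::string>(sout, " "),
                     [&](int id) { return dict[id]; });
      std::cout << sout.str() << std::endl;  // "<s> hello world </s> "
      return 0;
    }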
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index b61f36f740..7a6aa69fb6 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -64,12 +64,11 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
: m(new TrainerPrivate()) {
- m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr);
+ m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
}
-Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm)
- throw(IOError)
-{
+Trainer* Trainer::create(TrainerConfig* config,
+ GradientMachine* gm) throw(IOError) {
auto retv = new Trainer(config, gm);
if (retv->m->getConfig().IsInitialized()) {
return retv;
@@ -134,15 +133,17 @@ void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) {
 auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
- this->m->getGradientMachine());
+ this->m->getGradientMachine());
CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
auto m = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&m);
}
-void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); }
+void Trainer::forwardOneBatch(size_t batchSize) {
+ m->forwardOneBatch(batchSize);
+}
-bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
+bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
CHECK(dataProvider_) << "data_provider is not specified";
paddle::DataBatch dataBatch;
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
@@ -156,7 +157,6 @@ bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
void TrainerPrivate::forwardOneDataBatch(
const std::vector<paddle::Argument>& inArgs) {
-
std::vector<paddle::Argument>& outArgs = forwardOutput_;
if (config_->getOptConfig().use_sparse_remote_updater()) {
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index a8932351a6..1bba1df2e1 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -37,13 +37,15 @@ FloatArray::FloatArray(const float* b, const size_t l)
IntArray::IntArray(const int* b, const size_t l, bool f)
: buf(b), length(l), needFree(f) {}
-IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l,
+IntWithFloatArray::IntWithFloatArray(const float* v,
+ const int* i,
+ size_t l,
bool f)
: valBuf(v), idxBuf(i), length(l), needFree(f) {}
-bool isUsingGpu() {return FLAGS_use_gpu;}
+bool isUsingGpu() { return FLAGS_use_gpu; }
-void setUseGpu(bool useGpu) {FLAGS_use_gpu = useGpu;}
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
bool isGpuVersion() {
#ifdef PADDLE_ONLY_CPU
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index d44cdefc35..cc1c098223 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Vector.h"
@@ -39,8 +38,10 @@ IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
return v;
}
-IVector* IVector::createVectorFromNumpy(int* data, int dim, bool copy,
- bool useGpu) throw (UnsupportError){
+IVector* IVector::createVectorFromNumpy(int* data,
+ int dim,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=true is supported
if (!copy) {
@@ -137,8 +138,8 @@ void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast<paddle::CpuIVector*>(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
} else if (auto gpuVec = dynamic_cast<paddle::GpuIVector*>(m->vec.get())) {
- hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
- sizeof(int) * (*dim1));
+ hl_memcpy_device2host(
+ *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
@@ -201,8 +202,10 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) {
}
}
-Vector* Vector::createVectorFromNumpy(float* data, int dim, bool copy,
- bool useGpu) throw (UnsupportError){
+Vector* Vector::createVectorFromNumpy(float* data,
+ int dim,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=True is supported
if (!copy) {
@@ -251,8 +254,8 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
} else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
- hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
- sizeof(float) * (*dim1));
+ hl_memcpy_device2host(
+ *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index cdb730bb3c..11dbfb54b2 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -81,5 +81,8 @@ else()
add_library(paddle_cuda ${CUDA_SOURCES})
endif()
-add_style_check_target(paddle_cuda ${CUDA_SOURCES})
-add_style_check_target(paddle_cuda ${CUDA_HEADERS})
+add_style_check_target(paddle_cuda
+ ${CUDA_SOURCES}
+ ${CUDA_HEADERS}
+ ${CUDA_DSO_SOURCES}
+ ${CUDA_CXX_WITH_GPU_SOURCES})
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index c8aabc7844..03e15b2223 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_ACTIVATION_FUNCTIONS_H_
#define HL_ACTIVATION_FUNCTIONS_H_
@@ -21,11 +20,8 @@ limitations under the License. */
/**
* Active functions: sigmoid, relu, tanh and linear.
*/
-#define HPPL_ACTIVE_FUNCTION {hppl::sigmoid, \
- hppl::relu, \
- hppl::tanh, \
- hppl::linear \
- }
+#define HPPL_ACTIVE_FUNCTION \
+ { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace hppl {
@@ -42,18 +38,18 @@ public:
#ifdef __NVCC__
namespace gpu {
-static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#else
namespace cpu {
-static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#ifdef __AVX__
namespace avx {
-static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#endif
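
For context on the hunk above: HPPL_ACTIVE_FUNCTION expands to a brace initializer, so a single list of functions seeds each of the gpu/cpu/avx dispatch tables, and the tables are indexed by hl_activation_mode_t from hl_base.h (sigmoid=0, relu=1, tanh=2, linear=3). A simplified self-contained sketch of the pattern (the typedefs are stand-ins for the header's definitions):

#include <cmath>

typedef float real;

template <class T>
struct Active {
  typedef T (*forward)(T);
};

namespace hppl {
real sigmoid(real a) { return 1 / (1 + std::exp(-a)); }
real relu(real a) { return a > 0 ? a : 0; }
real tanh(real a) { return std::tanh(a); }
real linear(real a) { return a; }
}  // namespace hppl

#define HPPL_ACTIVE_FUNCTION \
  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }

static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;

real activate(int mode, real x) {
  return forward[mode](x);  // mode follows the hl_activation_mode_t ordering
}
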
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/cuda/include/hl_aggregate.h
index db75809f5d..a6d9ff8483 100644
--- a/paddle/cuda/include/hl_aggregate.h
+++ b/paddle/cuda/include/hl_aggregate.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_H_
#define HL_AGGREGATE_H_
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/cuda/include/hl_avx_functions.h
index cf062dd969..ed339e312a 100644
--- a/paddle/cuda/include/hl_avx_functions.h
+++ b/paddle/cuda/include/hl_avx_functions.h
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AVX_FUNCTIONS_H_
#define HL_AVX_FUNCTIONS_H_
#include <immintrin.h>
namespace hppl {
- __m256 relu(const __m256 a);
- __m256 sigmoid(const __m256 a);
- __m256 tanh(const __m256 a);
- __m256 linear(const __m256 a);
-
- __m256 relu(const __m256 a, const __m256 b);
- __m256 sigmoid(const __m256 a, const __m256 b);
- __m256 tanh(const __m256 a, const __m256 b);
- __m256 linear(const __m256 a, const __m256 b);
+__m256 relu(const __m256 a);
+__m256 sigmoid(const __m256 a);
+__m256 tanh(const __m256 a);
+__m256 linear(const __m256 a);
+
+__m256 relu(const __m256 a, const __m256 b);
+__m256 sigmoid(const __m256 a, const __m256 b);
+__m256 tanh(const __m256 a, const __m256 b);
+__m256 linear(const __m256 a, const __m256 b);
} // namespace hppl
#endif // HL_AVX_FUNCTIONS_H_
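
The declarations above are the AVX counterparts of the scalar activations in hl_functions.h; their definitions are not part of this patch. A plausible one-liner for the forward relu, shown only as a sketch:

#include <immintrin.h>

__m256 relu_sketch(const __m256 a) {
  return _mm256_max_ps(a, _mm256_setzero_ps());  // elementwise max(a, 0)
}
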
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 9f80898a1f..a076952467 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
-
#ifndef HL_BASE_H_
#define HL_BASE_H_
@@ -33,36 +31,36 @@ limitations under the License. */
* HPPL_STREAM_DEFAULT is HPPL default stream.
*/
typedef enum {
- HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
- HPPL_STREAM_1 = 1,
- HPPL_STREAM_2 = 2,
- HPPL_STREAM_3 = 3,
- HPPL_STREAM_4 = 4,
- HPPL_THREAD_STREAM_1 = 5,
- HPPL_THREAD_STREAM_2 = 6,
- HPPL_THREAD_STREAM_3 = 7,
- HPPL_THREAD_STREAM_4 = 8,
- HPPL_STREAM_END
+ HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
+ HPPL_STREAM_1 = 1,
+ HPPL_STREAM_2 = 2,
+ HPPL_STREAM_3 = 3,
+ HPPL_STREAM_4 = 4,
+ HPPL_THREAD_STREAM_1 = 5,
+ HPPL_THREAD_STREAM_2 = 6,
+ HPPL_THREAD_STREAM_3 = 7,
+ HPPL_THREAD_STREAM_4 = 8,
+ HPPL_STREAM_END
} hl_stream_t;
/**
* @brief HPPL activation mode.
*/
typedef enum {
- HL_ACTIVATION_SIGMOID = 0,
- HL_ACTIVATION_RELU = 1,
- HL_ACTIVATION_TANH = 2,
- HL_ACTIVATION_LINEAR = 3,
- HL_ACTIVATION_END
+ HL_ACTIVATION_SIGMOID = 0,
+ HL_ACTIVATION_RELU = 1,
+ HL_ACTIVATION_TANH = 2,
+ HL_ACTIVATION_LINEAR = 3,
+ HL_ACTIVATION_END
} hl_activation_mode_t;
/**
* @brief Transpose type.
*/
typedef enum {
- HPPL_OP_N = 0, /* transpose */
- HPPL_OP_T = 1, /* non transpose */
- HPPL_OP_END
+ HPPL_OP_N = 0, /* non transpose */
+ HPPL_OP_T = 1, /* transpose */
+ HPPL_OP_END
} hl_trans_op_t;
/**
@@ -148,23 +146,21 @@ typedef struct {
* @brief Sparse matrix value type.
*/
typedef enum {
- HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
- HL_FLOAT_VALUE = 1,
- HL_VALUE_END
+ HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
+ HL_FLOAT_VALUE = 1,
+ HL_VALUE_END
} hl_matrix_value_t;
-
/**
* @brief HPPL matrix format.
*/
typedef enum {
- HL_SPARSE_CSR = 0,
- HL_SPARSE_CSC = 1,
- HL_SPARSE_END
+ HL_SPARSE_CSR = 0,
+ HL_SPARSE_CSC = 1,
+ HL_SPARSE_END
} hl_matrix_format_t;
-
-typedef struct _hl_matrix_s * hl_matrix_s;
+typedef struct _hl_matrix_s *hl_matrix_s;
/**
* @brief HPPL sparse matrix.
@@ -177,12 +173,12 @@ typedef struct _hl_matrix_s * hl_matrix_s;
* @param nnz nonzero values of sparse matrix.
*/
typedef struct {
- hl_matrix_s matrix;
- hl_matrix_format_t format;
- hl_matrix_value_t type;
- int rows;
- int cols;
- size_t nnz;
+ hl_matrix_s matrix;
+ hl_matrix_format_t format;
+ hl_matrix_value_t type;
+ int rows;
+ int cols;
+ size_t nnz;
} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
#ifndef PADDLE_TYPE_DOUBLE
@@ -195,7 +191,7 @@ typedef struct {
*
* HL_FLOAT_MIN: 1.17549435e-38F
*/
-#define HL_FLOAT_MAX 3.40282347e+38F
+#define HL_FLOAT_MAX 3.40282347e+38F
/**
* if real == double
*
@@ -203,20 +199,18 @@ typedef struct {
*
* HL_FLOAT_MIN: 2.2250738585072014e-308
*/
-#define HL_FLOAT_MIN 1.17549435e-38F
+#define HL_FLOAT_MIN 1.17549435e-38F
#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
#endif
-
/**
* The maximum input value for exp, used to avoid overflow problem.
*
* Currently only used for tanh function.
*/
-#define EXP_MAX_INPUT 40.0
-
+#define EXP_MAX_INPUT 40.0
/**
* @brief DIVUP(x, y) is similar to ceil(x / y).
@@ -224,7 +218,7 @@ typedef struct {
* the size of blockDim.
*/
#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
#endif
#ifdef __NVCC__
@@ -233,7 +227,7 @@ typedef struct {
#include "hl_cuda.h"
#include "cuda_runtime.h"
-extern __thread bool g_sync_flag;
+extern __thread bool g_sync_flag;
extern __thread cudaStream_t default_stream;
#define STREAM_DEFAULT default_stream
@@ -241,16 +235,15 @@ extern __thread cudaStream_t default_stream;
* @brief Check cuda kernel execution.
* @param msg error string
*/
-#define CHECK_SYNC(msg) \
- if (true == g_sync_flag) { \
- hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
- cudaError_t err \
- = (cudaError_t)hl_get_device_last_error(); \
- CHECK_EQ(cudaSuccess, err) << "[" << msg << "] " \
- << "CUDA error: " \
- << hl_get_device_error_string((size_t)err); \
+#define CHECK_SYNC(msg) \
+ if (true == g_sync_flag) { \
+ hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
+ cudaError_t err = (cudaError_t)hl_get_device_last_error(); \
+ CHECK_EQ(cudaSuccess, err) \
+ << "[" << msg << "] " \
+ << "CUDA error: " << hl_get_device_error_string((size_t)err); \
}
-#endif /* __NVCC__ */
+#endif /* __NVCC__ */
-#endif /* HL_BASE_H_ */
+#endif /* HL_BASE_H_ */
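
One note on the DIVUP hunk above: DIVUP(x, y) is ceiling division for positive integers, typically used to size CUDA grids so the final partial block is still launched. A tiny illustration (generic usage, not a specific Paddle call site):

#define DIVUP(x, y) (((x) + (y)-1) / (y))

int numBlocks(int n, int threadsPerBlock) {
  return DIVUP(n, threadsPerBlock);  // DIVUP(1000, 256) == 4, not 3
}
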
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/cuda/include/hl_batch_transpose.h
index 414c7996ac..f3630e9762 100644
--- a/paddle/cuda/include/hl_batch_transpose.h
+++ b/paddle/cuda/include/hl_batch_transpose.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_BATCH_TRANSPOSE_H_
#define HL_BATCH_TRANSPOSE_H_
@@ -31,10 +30,7 @@ limitations under the License. */
* order. Each batch has height * width data, which are
* arranged in height-first (or row-first) manner.
*/
-extern void batchTranspose(const real* input,
- real* output,
- int width,
- int height,
- int batchSize);
+extern void batchTranspose(
+ const real* input, real* output, int width, int height, int batchSize);
#endif // HL_BATCH_TRANSPOSE_H_
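
Reference semantics for batchTranspose above, written as a plain CPU loop under the row-first layout the comment describes (a sketch, not the CUDA kernel):

typedef float real;

void batchTransposeRef(const real* input, real* output,
                       int width, int height, int batchSize) {
  for (int b = 0; b < batchSize; ++b) {
    const real* in = input + b * width * height;
    real* out = output + b * width * height;
    for (int r = 0; r < height; ++r) {
      for (int c = 0; c < width; ++c) {
        out[c * height + r] = in[r * width + c];  // transpose one sample
      }
    }
  }
}
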
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 70b5be6fda..cffaac634f 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CNN_H_
#define HL_CNN_H_
@@ -37,15 +36,21 @@ limitations under the License. */
* @param[in] alpha
* @param[in] beta
*/
-extern void hl_shrink_col2feature(
- const real * dataCol, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataIm,
- real alpha = 1.0f, real beta = 0.0f);
+extern void hl_shrink_col2feature(const real* dataCol,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataIm,
+ real alpha = 1.0f,
+ real beta = 0.0f);
/**
* @brief Expand feature to column.
@@ -65,14 +70,19 @@ extern void hl_shrink_col2feature(
* @param[out] dataCol expand data.
*
*/
-extern void hl_expand_feature2col(
- const real* dataIm, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataCol);
+extern void hl_expand_feature2col(const real* dataIm,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataCol);
/**
* @brief Maximum pool forward.
@@ -94,15 +104,21 @@ extern void hl_expand_feature2col(
* @param[in] tgtStride stride between output data samples.
*
*/
-extern void hl_maxpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride);
+extern void hl_maxpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride);
/**
* @brief Maximum pool backward.
@@ -125,20 +141,28 @@ extern void hl_maxpool_forward(
* @param[in] paddingH padding height.
* @param[in] paddingW padding width.
* @param[out] targetGrad output grad.
- * @param[in] outStride stride between output data samples.
+ * @param[in] outStride stride between output data samples.
*
*/
-extern void hl_maxpool_backward(
- const int frameCnt, const real* inputData,
- const real* outData, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real scaleA, real scaleB,
- real* targetGrad, const int outStride);
+extern void hl_maxpool_backward(const int frameCnt,
+ const real* inputData,
+ const real* outData,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real scaleA,
+ real scaleB,
+ real* targetGrad,
+ const int outStride);
/**
* @brief Average pool forward.
@@ -160,15 +184,21 @@ extern void hl_maxpool_backward(
* @param[in] tgtStride stride between output data samples.
*
*/
-extern void hl_avgpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride);
+extern void hl_avgpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride);
/**
* @brief Average pool backward.
@@ -189,19 +219,26 @@ extern void hl_avgpool_forward(
* @param[in] scaleA scale.
* @param[in] scaleB scale.
* @param[out] backGrad output grad.
- * @param[in] outStride stride between output data samples.
+ * @param[in] outStride stride between output data samples.
*
*/
-extern void hl_avgpool_backward(
- const int frameCnt, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- int paddingH, int paddingW,
- real scaleA, real scaleB,
- real* backGrad, const int outStride);
+extern void hl_avgpool_backward(const int frameCnt,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ int paddingH,
+ int paddingW,
+ real scaleA,
+ real scaleB,
+ real* backGrad,
+ const int outStride);
/**
* @brief Cross-map-response normalize forward.
@@ -218,10 +255,16 @@ extern void hl_avgpool_backward(
* @param[in] beta scale.
*
*/
-extern void hl_CMRNorm_forward(
- size_t frameCnt, const real* in, real* scale, real* out,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta);
+extern void hl_CMRNorm_forward(size_t frameCnt,
+ const real* in,
+ real* scale,
+ real* out,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta);
/**
* @brief Cross-map-response normalize backward.
@@ -240,11 +283,18 @@ extern void hl_CMRNorm_forward(
* @param[in] beta scale.
*
*/
-extern void hl_CMRNorm_backward(
- size_t frameCnt, const real* inV, const real* scale,
- const real* outV, const real* outDiff, real *inDiff,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta);
+extern void hl_CMRNorm_backward(size_t frameCnt,
+ const real* inV,
+ const real* scale,
+ const real* outV,
+ const real* outDiff,
+ real* inDiff,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta);
/**
* @brief Bilinear interpolation forward.
@@ -278,24 +328,24 @@ extern void hl_bilinear_forward(const real* inData,
const real ratioH,
const real ratioW);
- /**
- * @brief Bilinear interpolation backward.
- *
- * @param[out] inGrad input gradient.
- * @param[in] inImgH input image height.
- * @param[in] inImgW input image width.
- * @param[in] inputH input batchSize.
- * @param[in] inputW input image data dim.
- * @param[in] outGrad output gradient.
- * @param[in] outImgH output image height.
- * @param[in] outImgW output image width.
- * @param[in] outputH output batchSize.
- * @param[in] outputW output image data dim.
- * @param[in] numChannels number of channels.
- * @param[in] ratioH inImgH / outImgH.
- * @param[in] ratioW inImgW / outImgW.
- *
- */
+/**
+* @brief Bilinear interpolation backward.
+*
+* @param[out] inGrad input gradient.
+* @param[in] inImgH input image height.
+* @param[in] inImgW input image width.
+* @param[in] inputH input batchSize.
+* @param[in] inputW input image data dim.
+* @param[in] outGrad output gradient.
+* @param[in] outImgH output image height.
+* @param[in] outImgW output image width.
+* @param[in] outputH output batchSize.
+* @param[in] outputW output image data dim.
+* @param[in] numChannels number of channels.
+* @param[in] ratioH inImgH / outImgH.
+* @param[in] ratioW inImgW / outImgW.
+*
+*/
extern void hl_bilinear_backward(real* inGrad,
const size_t inImgH,
const size_t inImgW,
@@ -321,9 +371,13 @@ extern void hl_bilinear_backward(real* inGrad,
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
-extern void hl_maxout_forward(
- const real* inData, real* outData, int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_forward(const real* inData,
+ real* outData,
+ int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t groups);
/**
* @brief MaxOut backward.
@@ -336,8 +390,12 @@ extern void hl_maxout_forward(
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
-extern void hl_maxout_backward(
- real* inGrad, const real* outGrad, const int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_backward(real* inGrad,
+ const real* outGrad,
+ const int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t groups);
#endif /* HL_CNN_H_ */
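
The pooledH/pooledW arguments threaded through the declarations above are derived from the input shape. Assuming the usual ceil-mode convention (an assumption, not quoted from this header):

int pooledSize(int inSize, int windowSize, int stride, int padding) {
  return (inSize + 2 * padding - windowSize + stride - 1) / stride + 1;
  // e.g. pooledSize(32, 3, 2, 1) == 17
}
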
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h
index d763658c93..2c7d665101 100644
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_H_
#define HL_CUDA_H_
-#include "hl_base.h"
#include
+#include "hl_base.h"
/**
* @brief HPPL event.
*/
-typedef struct _hl_event_st * hl_event_t;
-
+typedef struct _hl_event_st *hl_event_t;
/**
* @brief return cuda runtime api version.
@@ -42,7 +40,7 @@ extern void hl_start();
* if device is NULL, will start all GPU.
* @param[in] number number of devices.
*/
-extern void hl_specify_devices_start(int* device, int number);
+extern void hl_specify_devices_start(int *device, int number);
/**
* @brief Queries if a device may directly access a peer device's memory.
@@ -126,7 +124,7 @@ extern int hl_get_device();
*
* @return dest_d pointer to device memory.
*/
-extern void* hl_malloc_device(size_t size);
+extern void *hl_malloc_device(size_t size);
/**
* @brief Free device memory.
@@ -143,7 +141,7 @@ extern void hl_free_mem_device(void *dest_d);
*
* @return dest_h pointer to host memory.
*/
-extern void* hl_malloc_host(size_t size);
+extern void *hl_malloc_host(size_t size);
/**
* @brief Free host page-lock memory.
@@ -228,9 +226,9 @@ extern void hl_srand(unsigned int seed);
* @param[in] stream stream id.
*/
extern void hl_memcpy_async(void *dst,
- void *src,
- size_t size,
- hl_stream_t stream);
+ void *src,
+ size_t size,
+ hl_stream_t stream);
/**
* @brief Waits for stream tasks to complete.
@@ -261,8 +259,7 @@ extern void hl_destroy_event(hl_event_t event);
*
* @return time Time between start and end in ms.
*/
-extern float hl_event_elapsed_time(hl_event_t start,
- hl_event_t end);
+extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);
/**
* @brief Records an event.
@@ -300,7 +297,7 @@ extern void hl_set_device_flags_block();
/**
* @brief Returns the last error string from a cuda runtime call.
*/
-extern const char* hl_get_device_error_string();
+extern const char *hl_get_device_error_string();
/**
* @brief Returns the last error string from a cuda runtime call.
@@ -309,7 +306,7 @@ extern const char* hl_get_device_error_string();
*
* @see hl_get_device_last_error()
*/
-extern const char* hl_get_device_error_string(size_t err);
+extern const char *hl_get_device_error_string(size_t err);
/**
* @brief Returns the last error number.
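
A usage sketch tying the memory helpers above together. hl_free_mem_host is inferred by symmetry with hl_free_mem_device and should be treated as an assumption; the other calls appear in this header or in hl_base.h:

#include <cstddef>
#include "hl_cuda.h"

void roundTrip(size_t size) {
  void *host = hl_malloc_host(size);   // page-locked host memory
  void *dev = hl_malloc_device(size);
  hl_memcpy_async(dev, host, size, HPPL_STREAM_1);  // host -> device copy
  hl_stream_synchronize(HPPL_STREAM_1);             // wait for completion
  hl_free_mem_device(dev);
  hl_free_mem_host(host);  // assumed counterpart of hl_malloc_host
}
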
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h
index d757317eb4..db8c03c2c0 100644
--- a/paddle/cuda/include/hl_cuda_cublas.h
+++ b/paddle/cuda/include/hl_cuda_cublas.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_H_
#define HL_CUDA_CUBLAS_H_
@@ -29,12 +28,8 @@ limitations under the License. */
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc);
+extern void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc);
/*
* @brief Matrix transpose, while lda = dimN, ldc = dimM.
@@ -45,10 +40,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN);
+extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN);
/*
* @brief Matrix inverse
@@ -60,11 +52,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] ldc the first dimension of C_d
*
*/
-extern void hl_matrix_inverse(real *A_d,
- real *C_d,
- int dimN,
- int lda,
- int ldc);
+extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -84,12 +72,19 @@ extern void hl_matrix_inverse(real *A_d,
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -106,11 +101,16 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -132,11 +132,17 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta,
- int lda, int incb, int incc);
+extern void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta,
+ int lda,
+ int incb,
+ int incc);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -154,9 +160,13 @@ extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta);
+extern void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta);
#endif /* HL_CUDA_CUBLAS_H_ */
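
Calling-convention sketch for the second hl_matrix_mul overload above, computing C = alpha * A * B + beta * C for a dimM x dimK by dimK x dimN product (an illustration, assuming device pointers):

#include "hl_cuda_cublas.h"

void gemm(real *A_d, real *B_d, real *C_d, int M, int N, int K) {
  hl_matrix_mul(A_d, HPPL_OP_N, B_d, HPPL_OP_N, C_d,
                M, N, K, /* alpha */ 1.0f, /* beta */ 0.0f);
}
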
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index f256cb54df..3a2f916210 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_H_
#define HL_CUDA_CUDNN_H_
@@ -22,7 +21,7 @@ limitations under the License. */
* hppl pooling mode
*/
typedef enum {
- HL_POOLING_MAX = 0,
+ HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
@@ -324,17 +323,16 @@ extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdFilterAlgo backward filter algorithm.
*/
-extern void hl_convolution_backward_filter(
- hl_tensor_descriptor input,
- real* input_data,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_grad_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdFilterAlgo);
+extern void hl_convolution_backward_filter(hl_tensor_descriptor input,
+ real* input_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_grad_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdFilterAlgo);
/**
* @brief convolution backward data(calculate input image grad data).
@@ -350,17 +348,16 @@ extern void hl_convolution_backward_filter(
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdDataAlgo backward data algorithm.
*/
-extern void hl_convolution_backward_data(
- hl_tensor_descriptor input,
- real* input_data_grad,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdDataAlgo);
+extern void hl_convolution_backward_data(hl_tensor_descriptor input,
+ real* input_data_grad,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdDataAlgo);
/**
* @brief convolution backward bias(calculate bias grad data).
@@ -383,8 +380,8 @@ extern void hl_convolution_backward_bias(hl_tensor_descriptor bias,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_forward(real *input,
- real *output,
+extern void hl_softmax_forward(real* input,
+ real* output,
int height,
int width);
@@ -396,8 +393,8 @@ extern void hl_softmax_forward(real *input,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_backward(real *output_value,
- real *output_grad,
+extern void hl_softmax_backward(real* output_value,
+ real* output_grad,
int height,
int width);
@@ -426,18 +423,18 @@ extern void hl_softmax_backward(real *output_value,
*
*/
extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar);
+ real* savedMean,
+ real* savedVar);
/**
* @brief cudnn batch norm forward.
@@ -463,14 +460,14 @@ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedVar,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedVar,
double epsilon);
/**
@@ -483,7 +480,8 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
* @param[in] inGradDesc input tensor descriptor desc.
* @param[in] inGrad input data.
* @param[in] dBnParamDesc tensor descriptor desc.
- * bnScale, bnBias, running mean/var, save_mean/var.
+ * bnScale, bnBias, running mean/var,
+ * save_mean/var.
* @param[in] scale batch normalization scale parameter (in original
* paper scale is referred to as gamma).
* @param[in] scaleGrad batch normalization scale parameter (in original
@@ -497,17 +495,17 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar);
+ real* savedMean,
+ real* savedInvVar);
#endif // HL_CUDA_CUDNN_H_
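
On the factor argument of hl_batch_norm_forward_training above: cuDNN treats it as an exponential-average coefficient for the running statistics. A sketch of the rule (cuDNN's documented behavior restated, not code from this patch):

typedef float real;

real updatedRunningMean(real runningMean, real batchMean, double factor) {
  return (real)((1.0 - factor) * runningMean + factor * batchMean);
}
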
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index f36c724e2d..1eb9f9ca88 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_DSO_LOADER_H_
#define HL_DSO_LOADER_H_
diff --git a/paddle/cuda/include/hl_functions.h b/paddle/cuda/include/hl_functions.h
index 65f366461c..91ce9a0678 100644
--- a/paddle/cuda/include/hl_functions.h
+++ b/paddle/cuda/include/hl_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_FUNCTIONS_H_
#define HL_FUNCTIONS_H_
@@ -21,30 +20,30 @@ limitations under the License. */
/**
* sigmoid threshold minimum
*/
-#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MIN -40.0
/**
* sigmoid threshold maximum
*/
-#define SIGMOID_THRESHOLD_MAX 13.0
+#define SIGMOID_THRESHOLD_MAX 13.0
#ifndef __NVCC__
namespace hppl {
- /*
- * forward activation
- */
- real relu(const real a);
- real sigmoid(const real a);
- real tanh(const real a);
- real linear(const real a);
-
- /*
- * backward activation
- */
- real relu(const real a, const real b);
- real sigmoid(const real a, const real b);
- real tanh(const real a, const real b);
- real linear(const real a, const real b);
+/*
+ * forward activation
+ */
+real relu(const real a);
+real sigmoid(const real a);
+real tanh(const real a);
+real linear(const real a);
+
+/*
+ * backward activation
+ */
+real relu(const real a, const real b);
+real sigmoid(const real a, const real b);
+real tanh(const real a, const real b);
+real linear(const real a, const real b);
} // namespace hppl
#ifdef __AVX__
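
The two thresholds above bound sigmoid's input so exp cannot overflow; the activation saturates instead. A self-contained sketch of how they are typically applied (the actual hppl definitions live elsewhere in the tree):

#include <cmath>

typedef float real;
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0

real sigmoidRef(const real a) {
  double x = a;
  if (x < SIGMOID_THRESHOLD_MIN) x = SIGMOID_THRESHOLD_MIN;
  if (x > SIGMOID_THRESHOLD_MAX) x = SIGMOID_THRESHOLD_MAX;
  return (real)(1.0 / (1.0 + std::exp(-x)));
}
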
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index 05039663b6..3be0df3b93 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_GPU_H_
#define HL_GPU_H_
diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/cuda/include/hl_lstm.h
index 1f95e318a1..7e527a7902 100644
--- a/paddle/cuda/include/hl_lstm.h
+++ b/paddle/cuda/include/hl_lstm.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_H_
#define HL_LSTM_H_
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index 6195e30b99..96648661e3 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_MATRIX_H_
#define HL_MATRIX_H_
@@ -30,13 +29,8 @@ limitations under the License. */
* @param[in] beta scalar used for addition.
*
*/
-extern void hl_matrix_add(real* A_d,
- real* B_d,
- real* C_d,
- int dimM,
- int dimN,
- real alpha,
- real beta);
+extern void hl_matrix_add(
+ real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta);
/**
* @brief Matrix Softmax.
*
@@ -46,7 +40,7 @@ extern void hl_matrix_add(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
+extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix softmax derivative.
@@ -58,11 +52,8 @@ extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_softmax_derivative(real* grad_d,
- real* output_d,
- real* sftmaxSum_d,
- int dimM,
- int dimN);
+extern void hl_matrix_softmax_derivative(
+ real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN);
/**
* @brief Sequence softmax.
@@ -73,8 +64,8 @@ extern void hl_matrix_softmax_derivative(real* grad_d,
* @param[in] numSequence sequence number.
*
*/
-extern void hl_sequence_softmax_forward(real *A_d,
- real *C_d,
+extern void hl_sequence_softmax_forward(real* A_d,
+ real* C_d,
const int* index,
int numSequence);
@@ -88,11 +79,8 @@ extern void hl_sequence_softmax_forward(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_classification_error(real* A_d,
- int* B_d,
- real* C_d,
- int dimM,
- int dimN);
+extern void hl_matrix_classification_error(
+ real* A_d, int* B_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy.
@@ -104,11 +92,8 @@ extern void hl_matrix_classification_error(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_cross_entropy(real* A_d,
- real* C_d,
- int* label_d,
- int dimM,
- int dimN);
+extern void hl_matrix_cross_entropy(
+ real* A_d, real* C_d, int* label_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy back propagation.
@@ -120,11 +105,8 @@ extern void hl_matrix_cross_entropy(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_cross_entropy_bp(real* grad_d,
- real* output_d,
- int* label_d,
- int dimM,
- int dimN);
+extern void hl_matrix_cross_entropy_bp(
+ real* grad_d, real* output_d, int* label_d, int dimM, int dimN);
/**
* @brief Matrix multi-binary label cross entropy
@@ -135,11 +117,8 @@ extern void hl_matrix_cross_entropy_bp(real* grad_d,
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*/
-extern void hl_matrix_multi_binary_cross_entropy(real* output,
- real* entropy,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN);
+extern void hl_matrix_multi_binary_cross_entropy(
+ real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN);
/**
* @brief Matrix multi-binary label cross entropy backprop
@@ -150,11 +129,8 @@ extern void hl_matrix_multi_binary_cross_entropy(real* output,
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*/
-extern void hl_matrix_multi_binary_cross_entropy_bp(real* output,
- real* grad,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN);
+extern void hl_matrix_multi_binary_cross_entropy_bp(
+ real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN);
/**
* @brief Matrix zero memory.
@@ -176,12 +152,8 @@ extern void hl_matrix_zero_mem(real* data, int num);
* @param[in] partial_sum
*/
-extern void hl_param_relu_forward(real* output,
- real* input,
- real* w,
- int width,
- int height,
- int partial_sum);
+extern void hl_param_relu_forward(
+ real* output, real* input, real* w, int width, int height, int partial_sum);
/**
* @brief parameter relu backward w
*
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 46d86b2982..bb5124df44 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SEQUENCE_H_
#define HL_SEQUENCE_H_
@@ -32,7 +31,7 @@ limitations under the License. */
extern void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
- int *index,
+ int* index,
int numSequences,
int dim);
@@ -46,11 +45,8 @@ extern void hl_max_sequence_forward(real* input,
* @param[in] dim input dimension.
*
*/
-extern void hl_max_sequence_backward(real* outputGrad,
- int *index,
- real* inputGrad,
- int numSequences,
- int dim);
+extern void hl_max_sequence_backward(
+ real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
/**
* @brief Context projection forward.
@@ -63,7 +59,8 @@ extern void hl_max_sequence_backward(real* outputGrad,
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
- * @param[in] beginPad number of extra timesteps added at the beginning.
+ * @param[in] beginPad number of extra timesteps added at the
+ * beginning.
* @param[in] isPadding trainable padding.
*
*/
@@ -109,7 +106,8 @@ extern void hl_context_projection_backward_data(real* outputGrad,
* @param[in] totalPad number of extra timesteps.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
- * @param[in] beginPad number of extra timesteps added at the beginning.
+ * @param[in] beginPad number of extra timesteps added at the
+ * beginning.
*
*/
extern void hl_context_projection_backward_weight(real* outputGrad,
@@ -141,9 +139,9 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
* @param[in] seq2batch copy direction.
*
*/
-extern void hl_sequence2batch_copy(real *batch,
- real *sequence,
- const int *batchIndex,
+extern void hl_sequence2batch_copy(real* batch,
+ real* sequence,
+ const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
@@ -167,9 +165,9 @@ extern void hl_sequence2batch_copy(real *batch,
* @param[in] seq2batch copy direction.
*
*/
-extern void hl_sequence2batch_add(real *batch,
- real *sequence,
- int *batchIndex,
+extern void hl_sequence2batch_add(real* batch,
+ real* sequence,
+ int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
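
Reference semantics for hl_sequence2batch_copy above, assuming batchIndex maps each batch row to its source row in the sequence layout (a CPU sketch based on the parameter comments, not the kernel):

typedef float real;

void sequence2batchRef(real* batch, real* sequence, const int* batchIndex,
                       int seqWidth, int batchCount, bool seq2batch) {
  for (int i = 0; i < batchCount; ++i) {
    real* batchRow = batch + i * seqWidth;
    real* seqRow = sequence + batchIndex[i] * seqWidth;
    for (int j = 0; j < seqWidth; ++j) {
      if (seq2batch) {
        batchRow[j] = seqRow[j];  // gather sequence rows into batch order
      } else {
        seqRow[j] = batchRow[j];  // scatter batch rows back to sequence order
      }
    }
  }
}
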
diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h
index 9acdebdebf..c4e0be23e2 100644
--- a/paddle/cuda/include/hl_sparse.h
+++ b/paddle/cuda/include/hl_sparse.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SPARSE_H_
#define HL_SPARSE_H_
@@ -31,7 +30,7 @@ limitations under the License. */
*/
extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -60,10 +59,10 @@ extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d);
*
*/
extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- void * dest_d,
+ void *dest_d,
size_t size,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -94,11 +93,11 @@ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
*
*/
extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- real* value_d,
- int* rows_d,
- int* cols_d,
+ real *value_d,
+ int *rows_d,
+ int *cols_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -259,10 +258,14 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
*/
extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
@@ -311,11 +314,16 @@ extern void hl_matrix_dense_mul_csc(real *A_d,
* @note transb does not support HPPL_OP_T.
*
*/
-extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_sparse_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
hl_sparse_matrix_s C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -336,12 +344,16 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
* @note transa does not support HPPL_OP_T.
*
*/
-extern void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+extern void hl_matrix_dense_mul_csr(real *A_d,
+ hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief Memcpy csc_matrix to host.
@@ -412,7 +424,6 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
hl_sparse_matrix_s csr_matrix,
hl_stream_t stream);
-
/**
* @brief A_d[j] += B_d[i,j] for i in range(height)
*
@@ -423,19 +434,13 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
* @param[in] scale scale of B_d
*
*/
-extern void hl_sparse_matrix_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale);
+extern void hl_sparse_matrix_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
/**
* @brief implementation of csr sparse matrix in hl_sparse_matrix_column_sum
*/
-extern void hl_matrix_csr_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale);
+extern void hl_matrix_csr_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
/**
* @brief A_d[i,j] += B_d[j]
@@ -446,13 +451,13 @@ extern void hl_matrix_csr_column_sum(real* A_d,
*
*/
extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale);
/**
* @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias
*/
extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale);
/**
@@ -470,7 +475,7 @@ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
*
*/
extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
@@ -479,7 +484,7 @@ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
* @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense
*/
extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
@@ -493,7 +498,7 @@ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
* @return return rows pointer, which is gpu address
*
*/
-extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
/**
* @brief get cols pointer of GpuSparseMatrix
@@ -503,7 +508,7 @@ extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
* @return return cols pointer, which is gpu address
*
*/
-extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
/**
* @brief get value pointer of GpuSparseMatrix
@@ -513,7 +518,6 @@ extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
* @return return value pointer, which is gpu address
*
*/
-extern real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
-
+extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
#endif /* HL_SPARSE_H_ */
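
Reference for the column-sum declared above, A_d[j] += scale * B_d[i][j] summed over rows. For a CSR matrix only the stored values and their column ids are needed (a sketch under standard CSR conventions):

typedef float real;

void csrColumnSumRef(real* A, const real* vals, const int* cols,
                     int nnz, real scale) {
  for (int k = 0; k < nnz; ++k) {
    A[cols[k]] += scale * vals[k];  // every stored entry lands in its column
  }
}
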
diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/cuda/include/hl_table_apply.h
index 3c9428e925..b4ac83a66a 100644
--- a/paddle/cuda/include/hl_table_apply.h
+++ b/paddle/cuda/include/hl_table_apply.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TABLE_APPLY_H_
#define HL_TABLE_APPLY_H_
@@ -31,8 +30,10 @@ limitations under the License. */
* @param[in] dim width of table.
*
*/
-extern void hl_matrix_select_rows(real* output, int ldo,
- real* table, int ldt,
+extern void hl_matrix_select_rows(real* output,
+ int ldo,
+ real* table,
+ int ldt,
int* ids,
int numSamples,
int tableSize,
@@ -53,8 +54,10 @@ extern void hl_matrix_select_rows(real* output, int ldo,
* @param[in] dim width of table.
*
*/
-extern void hl_matrix_add_to_rows(real* table, int ldt,
- real* input, int ldi,
+extern void hl_matrix_add_to_rows(real* table,
+ int ldt,
+ real* input,
+ int ldi,
int* ids,
int numSamples,
int tableSize,
@@ -72,8 +75,7 @@ extern void hl_matrix_add_to_rows(real* table, int ldt,
*
*/
template <class T>
-extern void hl_vector_select_from(T* dst, int sized,
- const T* src, int sizes,
- const int* ids, int sizei);
+extern void hl_vector_select_from(
+ T* dst, int sized, const T* src, int sizes, const int* ids, int sizei);
-#endif /* HL_TABLE_APPLY_H_ */
+#endif /* HL_TABLE_APPLY_H_ */
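
hl_matrix_select_rows above is an embedding-style gather: row i of the output is row ids[i] of the table. A CPU reference with the same ldo/ldt leading-dimension convention (a sketch, not the kernel):

typedef float real;

void selectRowsRef(real* output, int ldo, const real* table, int ldt,
                   const int* ids, int numSamples, int dim) {
  for (int i = 0; i < numSamples; ++i) {
    for (int j = 0; j < dim; ++j) {
      output[i * ldo + j] = table[ids[i] * ldt + j];  // gather one row
    }
  }
}
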
diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h
index 4414b0b2d2..b0a88c66a1 100644
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TIME_H_
#define HL_TIME_H_
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index a38d4cf862..e8cfebbf6a 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TOP_K_H_
#define HL_TOP_K_H_
@@ -31,9 +30,11 @@ limitations under the License. */
* @param[in] numSamples height of input value.
*
*/
-extern void hl_matrix_top_k(real* topVal, int ldv,
- int * topIds,
- real* src, int lds,
+extern void hl_matrix_top_k(real* topVal,
+ int ldv,
+ int* topIds,
+ real* src,
+ int lds,
int dim,
int beamSize,
int numSamples);
@@ -50,8 +51,9 @@ extern void hl_matrix_top_k(real* topVal, int ldv,
*
* @note Only support HL_SPARSE_CSR format.
*/
-extern void hl_sparse_matrix_top_k(real* topVal, int ldv,
- int * topIds,
+extern void hl_sparse_matrix_top_k(real* topVal,
+ int ldv,
+ int* topIds,
hl_sparse_matrix_s src,
int beamSize,
int numSamples);
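
What hl_matrix_top_k computes for one row of src: the beamSize largest values and their indices. A standard-library reference for a single row (a sketch; the kernel batches this over numSamples rows):

#include <algorithm>
#include <numeric>
#include <vector>

typedef float real;

void topKRowRef(real* topVal, int* topIds, const real* row,
                int dim, int beamSize) {
  std::vector<int> idx(dim);
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + beamSize, idx.end(),
                    [&](int a, int b) { return row[a] > row[b]; });
  for (int k = 0; k < beamSize; ++k) {
    topIds[k] = idx[k];
    topVal[k] = row[idx[k]];
  }
}
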
diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/cuda/include/stub/hl_aggregate_stub.h
index 4c0c68f3c9..bb53fc581e 100644
--- a/paddle/cuda/include/stub/hl_aggregate_stub.h
+++ b/paddle/cuda/include/stub/hl_aggregate_stub.h
@@ -12,29 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_STUB_H_
#define HL_AGGREGATE_STUB_H_
#include "hl_aggregate.h"
-inline void hl_matrix_row_sum(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_row_max(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_row_min(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_sum(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_max(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_min(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {}
inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {}
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index c6f32ad337..2f73b9671e 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -12,84 +12,134 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CNN_STUB_H_
#define HL_CNN_STUB_H_
#include "hl_cnn.h"
-inline void hl_shrink_col2feature(
- const real * dataCol, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataIm,
- real alpha, real beta) {}
-
-inline void hl_expand_feature2col(
- const real* dataIm, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataCol) {}
-
-inline void hl_maxpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride) {}
-
-inline void hl_maxpool_backward(
- const int frameCnt, const real* inputData,
- const real* outData, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real scaleA, real scaleB,
- real* targetGrad, const int outStride) {}
-
-inline void hl_avgpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride) {}
-
-inline void hl_avgpool_backward(
- const int frameCnt, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- int paddingH, int paddingW,
- real scaleA, real scaleB,
- real* backGrad, const int outStride) {}
-
-inline void hl_CMRNorm_forward(
- size_t frameCnt, const real* in, real* scale, real* out,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta) {}
-
-inline void hl_CMRNorm_backward(
- size_t frameCnt, const real* inV, const real* scale,
- const real* outV, const real* outDiff, real *inDiff,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta) {}
+inline void hl_shrink_col2feature(const real* dataCol,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataIm,
+ real alpha,
+ real beta) {}
+
+inline void hl_expand_feature2col(const real* dataIm,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataCol) {}
+
+inline void hl_maxpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride) {}
+
+inline void hl_maxpool_backward(const int frameCnt,
+ const real* inputData,
+ const real* outData,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real scaleA,
+ real scaleB,
+ real* targetGrad,
+ const int outStride) {}
+
+inline void hl_avgpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride) {}
+
+inline void hl_avgpool_backward(const int frameCnt,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ int paddingH,
+ int paddingW,
+ real scaleA,
+ real scaleB,
+ real* backGrad,
+ const int outStride) {}
+
+inline void hl_CMRNorm_forward(size_t frameCnt,
+ const real* in,
+ real* scale,
+ real* out,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta) {}
+
+inline void hl_CMRNorm_backward(size_t frameCnt,
+ const real* inV,
+ const real* scale,
+ const real* outV,
+ const real* outDiff,
+ real* inDiff,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta) {}
inline void hl_bilinear_forward(const real* inData,
const size_t inImgH,
@@ -106,25 +156,33 @@ inline void hl_bilinear_forward(const real* inData,
const real ratioW) {}
inline void hl_bilinear_backward(real* inGrad,
- const size_t inImgH,
- const size_t inImgW,
- const size_t inputH,
- const size_t inputW,
- const real* outGrad,
- const size_t outImgH,
- const size_t outImgW,
- const size_t outputH,
- const size_t outputW,
- const size_t numChannels,
- const real ratioH,
- const real ratioW) {}
-
-inline void hl_maxout_forward(
- const real* inData, real* outData, int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t group) {}
-
-inline void hl_maxout_backward(
- real* inGrad, const real* outGrad, const int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t group) {}
+ const size_t inImgH,
+ const size_t inImgW,
+ const size_t inputH,
+ const size_t inputW,
+ const real* outGrad,
+ const size_t outImgH,
+ const size_t outImgW,
+ const size_t outputH,
+ const size_t outputW,
+ const size_t numChannels,
+ const real ratioH,
+ const real ratioW) {}
+
+inline void hl_maxout_forward(const real* inData,
+ real* outData,
+ int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t group) {}
+
+inline void hl_maxout_backward(real* inGrad,
+ const real* outGrad,
+ const int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t group) {}
#endif // HL_CNN_STUB_H_
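These hl_*_stub.h headers give every GPU routine an empty inline body so CPU-only builds of Paddle still compile and link; this commit only reflows their parameter lists. A minimal sketch of the pattern (file and function names here are illustrative, not from the tree):

// Sketch of the stub pattern (hypothetical names):
typedef float real;  // paddle selects float or double at build time

// hl_example.h would declare:   extern void hl_scale(real* data, int n);
// hl_example_stub.h, used for CPU-only builds, supplies a linkable no-op:
inline void hl_scale(real* data, int n) {}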
diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
index 903dcbe835..85f7c390c4 100644
--- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
@@ -12,41 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_STUB_H_
#define HL_CUDA_CUBLAS_STUB_H_
#include "hl_cuda_cublas.h"
-inline void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc) {}
-
-inline void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_inverse(real *A_d,
- real *C_d,
- int dimN,
- int lda,
- int ldc) {}
-
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
- real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc) {}
+inline void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+inline void hl_matrix_inverse(
+ real *A_d, real *C_d, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
+ real *C_d,
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
#endif // HL_CUDA_CUBLAS_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index b96804afd8..3beb0e5b51 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_STUB_H_
#define HL_CUDA_CUDNN_STUB_H_
#include "hl_cuda_cudnn.h"
-inline int hl_get_cudnn_lib_version() {
- return 0;
-}
+inline int hl_get_cudnn_lib_version() { return 0; }
inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {}
@@ -68,41 +65,41 @@ inline void hl_pooling_backward(hl_tensor_descriptor input,
hl_pooling_descriptor pooling) {}
inline void hl_create_filter_descriptor(hl_filter_descriptor* filter,
- int input_feature_maps,
- int output_feature_maps,
- int height,
- int width) {}
+ int input_feature_maps,
+ int output_feature_maps,
+ int height,
+ int width) {}
inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {}
inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
- hl_tensor_descriptor image,
- hl_filter_descriptor filter,
- int padding_height,
- int padding_width,
- int stride_height,
- int stride_width) {}
+ hl_tensor_descriptor image,
+ hl_filter_descriptor filter,
+ int padding_height,
+ int padding_width,
+ int stride_height,
+ int stride_width) {}
inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
- hl_tensor_descriptor image,
- hl_filter_descriptor filter,
- int padding_height,
- int padding_width,
- int stride_height,
- int stride_width) {}
+ hl_tensor_descriptor image,
+ hl_filter_descriptor filter,
+ int padding_height,
+ int padding_width,
+ int stride_height,
+ int stride_width) {}
inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
inline void hl_conv_workspace(hl_tensor_descriptor input,
- hl_tensor_descriptor output,
- hl_filter_descriptor filter,
- hl_convolution_descriptor conv,
- int* convFwdAlgo,
- size_t* fwdLimitBytes,
- int* convBwdDataAlgo,
- size_t* bwdDataLimitBytes,
- int* convBwdFilterAlgo,
- size_t* bwdFilterLimitBytes) {}
+ hl_tensor_descriptor output,
+ hl_filter_descriptor filter,
+ hl_convolution_descriptor conv,
+ int* convFwdAlgo,
+ size_t* fwdLimitBytes,
+ int* convBwdDataAlgo,
+ size_t* bwdDataLimitBytes,
+ int* convBwdFilterAlgo,
+ size_t* bwdFilterLimitBytes) {}
inline void hl_convolution_forward(hl_tensor_descriptor input,
real* input_data,
@@ -116,86 +113,84 @@ inline void hl_convolution_forward(hl_tensor_descriptor input,
int convFwdAlgo) {}
inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
- real* bias_data,
- hl_tensor_descriptor output,
- real* output_data) {}
-
-inline void hl_convolution_backward_filter(
- hl_tensor_descriptor input,
- real* input_data,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_grad_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdFilterAlgo) {}
-
-inline void hl_convolution_backward_data(
- hl_tensor_descriptor input,
- real* input_data_grad,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdDataAlgo) {}
+ real* bias_data,
+ hl_tensor_descriptor output,
+ real* output_data) {}
+
+inline void hl_convolution_backward_filter(hl_tensor_descriptor input,
+ real* input_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_grad_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdFilterAlgo) {}
+
+inline void hl_convolution_backward_data(hl_tensor_descriptor input,
+ real* input_data_grad,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdDataAlgo) {}
inline void hl_convolution_backward_bias(hl_tensor_descriptor bias,
- real* bias_grad_data,
- hl_tensor_descriptor output,
- real* output_grad_data) {}
+ real* bias_grad_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data) {}
-inline void hl_softmax_forward(real *input,
- real *output,
- int height,
- int width) {}
-
-inline void hl_softmax_backward(real *output_value,
- real *output_grad,
+inline void hl_softmax_forward(real* input,
+ real* output,
int height,
int width) {}
+inline void hl_softmax_backward(real* output_value,
+ real* output_grad,
+ int height,
+ int width) {}
+
inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar) {}
+ real* savedMean,
+ real* savedVar) {}
inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedVar,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedVar,
double epsilon) {}
inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar) {}
+ real* savedMean,
+ real* savedInvVar) {}
#endif // HL_CUDA_CUDNN_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h
index fa7904421d..24923a0d4a 100644
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_STUB_H_
#define HL_CUDA_STUB_H_
@@ -24,29 +23,25 @@ inline void hl_specify_devices_start(int *device, int number) {}
inline void hl_init(int device) {}
-inline int hl_get_cuda_lib_version(int device) {
- return 0;
-}
+inline int hl_get_cuda_lib_version(int device) { return 0; }
inline void hl_fini() {}
inline void hl_set_sync_flag(bool flag) {}
-inline bool hl_get_sync_flag() {
- return false;
-}
+inline bool hl_get_sync_flag() { return false; }
-inline int hl_get_device_count() { return 0; }
+inline int hl_get_device_count() { return 0; }
inline void hl_set_device(int device) {}
-inline int hl_get_device() { return 0; }
+inline int hl_get_device() { return 0; }
-inline void* hl_malloc_device(size_t size) { return NULL; }
+inline void *hl_malloc_device(size_t size) { return NULL; }
inline void hl_free_mem_device(void *dest_d) {}
-inline void* hl_malloc_host(size_t size) { return NULL; }
+inline void *hl_malloc_host(size_t size) { return NULL; }
inline void hl_free_mem_host(void *dest_h) {}
@@ -64,7 +59,9 @@ inline void hl_rand(real *dest_d, size_t num) {}
inline void hl_srand(unsigned int seed) {}
-inline void hl_memcpy_async(void *dst, void *src, size_t size,
+inline void hl_memcpy_async(void *dst,
+ void *src,
+ size_t size,
hl_stream_t stream) {}
inline void hl_stream_synchronize(hl_stream_t stream) {}
@@ -83,11 +80,11 @@ inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {}
inline void hl_event_synchronize(hl_event_t event) {}
-inline int hl_get_device_last_error() { return 0; }
+inline int hl_get_device_last_error() { return 0; }
-inline const char* hl_get_device_error_string() { return NULL; }
+inline const char *hl_get_device_error_string() { return NULL; }
-inline const char* hl_get_device_error_string(size_t err) { return NULL; }
+inline const char *hl_get_device_error_string(size_t err) { return NULL; }
inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/cuda/include/stub/hl_lstm_stub.h
index 2700bef02a..7ccda032d2 100644
--- a/paddle/cuda/include/stub/hl_lstm_stub.h
+++ b/paddle/cuda/include/stub/hl_lstm_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_STUB_H_
#define HL_LSTM_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 76cac2e577..1bd78d23fb 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_MATRIX_STUB_H_
#define HL_MATRIX_STUB_H_
@@ -26,48 +25,30 @@ inline void hl_matrix_add(real* A_d,
real alpha,
real beta) {}
-inline void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {}
+inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {}
-inline void hl_sequence_softmax_forward(real *A_d,
- real *C_d,
+inline void hl_sequence_softmax_forward(real* A_d,
+ real* C_d,
const int* index,
int numSequence) {}
-inline void hl_matrix_softmax_derivative(real* grad_d,
- real* output_d,
- real* sftmaxSum_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_classification_error(real* A_d,
- int* B_d,
- real* C_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_cross_entropy(real* A_d,
- real* C_d,
- int* label_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_cross_entropy_bp(real* grad_d,
- real* output_d,
- int* label_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy(real* output,
- real* entropy,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy_bp(real* output,
- real* grad,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN) {}
+inline void hl_matrix_softmax_derivative(
+ real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
+
+inline void hl_matrix_classification_error(
+ real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy(
+ real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy_bp(
+ real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy(
+ real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy_bp(
+ real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {}
inline void hl_matrix_zero_mem(real* data, int num) {}
@@ -101,7 +82,6 @@ inline void hl_cossim(real* output,
int input2_height,
real scale) {}
-
inline void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index aabd956c37..381f0a6f26 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SEQUENCE_STUB_H_
#define HL_SEQUENCE_STUB_H_
@@ -21,15 +20,12 @@ limitations under the License. */
inline void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
- int *index,
+ int* index,
int numSequences,
int dim) {}
-inline void hl_max_sequence_backward(real* outputGrad,
- int *index,
- real* inputGrad,
- int numSequences,
- int dim) {}
+inline void hl_max_sequence_backward(
+ real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
inline void hl_context_projection_forward(real* input,
const int* sequence,
@@ -60,16 +56,16 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
int contextStart,
int beginPad) {}
-inline void hl_sequence2batch_copy(real *batch,
- real *sequence,
- const int *batchIndex,
+inline void hl_sequence2batch_copy(real* batch,
+ real* sequence,
+ const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
-inline void hl_sequence2batch_add(real *batch,
- real *sequence,
- int *batchIndex,
+inline void hl_sequence2batch_add(real* batch,
+ real* sequence,
+ int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/cuda/include/stub/hl_sparse_stub.h
index 346a1900dd..d47bdd2c47 100644
--- a/paddle/cuda/include/stub/hl_sparse_stub.h
+++ b/paddle/cuda/include/stub/hl_sparse_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SPARSE_STUB_H_
#define HL_SPARSE_STUB_H_
@@ -20,7 +19,7 @@ limitations under the License. */
inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
@@ -28,20 +27,20 @@ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {}
inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- void * dest_d,
+ void *dest_d,
size_t size,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- real* value_d,
- int* rows_d,
- int* cols_d,
+ real *value_d,
+ int *rows_d,
+ int *cols_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
@@ -87,10 +86,14 @@ inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
inline void hl_matrix_dense_mul_csc(real *A_d,
hl_trans_op_t transa,
@@ -103,18 +106,27 @@ inline void hl_matrix_dense_mul_csc(real *A_d,
real alpha,
real beta) {}
-inline void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+inline void hl_sparse_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
hl_sparse_matrix_s C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
-inline void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+inline void hl_matrix_dense_mul_csr(real *A_d,
+ hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
inline void hl_memcpy_from_csc_matrix(real *csc_val,
size_t val_size,
@@ -134,49 +146,39 @@ inline void hl_memcpy_from_csr_matrix(real *csr_val,
hl_sparse_matrix_s csr_matrix,
hl_stream_t stream) {}
-inline void hl_sparse_matrix_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale) {}
+inline void hl_sparse_matrix_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
-inline void hl_matrix_csr_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale) {}
+inline void hl_matrix_csr_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale) {}
inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale) {}
inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {}
inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {}
-inline int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
- return NULL;
-}
+inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; }
-inline int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
- return NULL;
-}
+inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; }
-inline real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
+inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
return NULL;
}
diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h
index 2922d4dc29..2412ed5abc 100644
--- a/paddle/cuda/src/avx_mathfun.h
+++ b/paddle/cuda/src/avx_mathfun.h
@@ -32,32 +32,35 @@
#include <immintrin.h>
/* yes I know, the top of this file is quite ugly */
-# define ALIGN32_BEG
-# define ALIGN32_END __attribute__((aligned(32)))
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
/* __m128 is ugly to write */
-typedef __m256 v8sf; // vector of 8 float (avx)
-typedef __m256i v8si; // vector of 8 int (avx)
-typedef __m128i v4si; // vector of 8 int (avx)
+typedef __m256 v8sf; // vector of 8 float (avx)
+typedef __m256i v8si; // vector of 8 int (avx)
+typedef __m128i v4si;   // vector of 4 int (sse)
-#define _PI32AVX_CONST(Name, Val) \
- static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }
+#define _PI32AVX_CONST(Name, Val) \
+ static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \
+ Val, Val, Val, Val}
_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);
-
/* declare some AVX constants -- why can't I figure a better way to do that? */
-#define _PS256_CONST(Name, Val) \
- static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PI32_CONST256(Name, Val) \
- static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PS256_CONST_TYPE(Name, Type, Val) \
- static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-
-_PS256_CONST(1 , 1.0f);
+#define _PS256_CONST(Name, Val) \
+ static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PI32_CONST256(Name, Val) \
+ static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PS256_CONST_TYPE(Name, Type, Val) \
+ static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+
+_PS256_CONST(1, 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
@@ -76,14 +79,14 @@ _PI32_CONST256(0x7f, 0x7f);
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
-_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
-_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
-_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
-_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
-_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
-_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
-_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);
@@ -94,50 +97,51 @@ typedef union imm_xmm_union {
v4si xmm[2];
} imm_xmm_union;
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
- imm_xmm_union u __attribute__((aligned(32))); \
- u.imm = imm_; \
- xmm0_ = u.xmm[0]; \
- xmm1_ = u.xmm[1]; \
-}
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
+ { \
imm_xmm_union u __attribute__((aligned(32))); \
- u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
+ u.imm = imm_; \
+ xmm0_ = u.xmm[0]; \
+ xmm1_ = u.xmm[1]; \
}
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
+ { \
+ imm_xmm_union u __attribute__((aligned(32))); \
+ u.xmm[0] = xmm0_; \
+ u.xmm[1] = xmm1_; \
+ imm_ = u.imm; \
+ }
-#define AVX2_BITOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, int a) \
-{ \
- /* use SSE2 instruction to perform the bitop AVX2 */ \
- v4si x1, x2; \
- v8si ret; \
- COPY_IMM_TO_XMM(x, x1, x2); \
- x1 = _mm_##fn(x1,a); \
- x2 = _mm_##fn(x2,a); \
- COPY_XMM_TO_IMM(x1, x2, ret); \
- return(ret); \
-}
+#define AVX2_BITOP_USING_SSE2(fn) \
+ static inline v8si avx2_mm256_##fn(v8si x, int a) { \
+ /* use SSE2 instruction to perform the bitop AVX2 */ \
+ v4si x1, x2; \
+ v8si ret; \
+ COPY_IMM_TO_XMM(x, x1, x2); \
+ x1 = _mm_##fn(x1, a); \
+ x2 = _mm_##fn(x2, a); \
+ COPY_XMM_TO_IMM(x1, x2, ret); \
+ return (ret); \
+ }
//#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
-#define AVX2_INTOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, v8si y) \
-{ \
- /* use SSE2 instructions to perform the AVX2 integer operation */ \
- v4si x1, x2; \
- v4si y1, y2; \
- v8si ret; \
- COPY_IMM_TO_XMM(x, x1, x2); \
- COPY_IMM_TO_XMM(y, y1, y2); \
- x1 = _mm_##fn(x1,y1); \
- x2 = _mm_##fn(x2,y2); \
- COPY_XMM_TO_IMM(x1, x2, ret); \
- return(ret); \
-}
+#define AVX2_INTOP_USING_SSE2(fn) \
+ static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \
+ /* use SSE2 instructions to perform the AVX2 integer operation */ \
+ v4si x1, x2; \
+ v4si y1, y2; \
+ v8si ret; \
+ COPY_IMM_TO_XMM(x, x1, x2); \
+ COPY_IMM_TO_XMM(y, y1, y2); \
+ x1 = _mm_##fn(x1, y1); \
+ x2 = _mm_##fn(x2, y2); \
+ COPY_XMM_TO_IMM(x1, x2, ret); \
+ return (ret); \
+ }
//#warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
@@ -157,84 +161,83 @@ AVX2_INTOP_USING_SSE2(add_epi32)
#define avx2_mm256_add_epi32 _mm256_add_epi32
#endif /* __AVX2__ */
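The AVX2_BITOP_USING_SSE2 / AVX2_INTOP_USING_SSE2 macros above emulate 256-bit integer ops on AVX-only CPUs: the imm_xmm_union spills a v8si to memory, the op runs on each 128-bit half with SSE2, and the halves are reassembled. The same split/compute/recombine can also be written with cast/extract intrinsics instead of the union; a standalone sketch:

#include <immintrin.h>

// Shift eight 32-bit lanes left by `a` using only SSE2 shifts, on a CPU
// with AVX but no AVX2 -- the same split/compute/recombine idea as the
// macros, written with cast/extract intrinsics instead of a union.
static inline __m256i shl_epi32_via_sse2(__m256i x, int a) {
  __m128i lo = _mm256_castsi256_si128(x);       // lanes 0..3
  __m128i hi = _mm256_extractf128_si256(x, 1);  // lanes 4..7
  lo = _mm_slli_epi32(lo, a);                   // SSE2 op on each half
  hi = _mm_slli_epi32(hi, a);
  __m256i r = _mm256_castsi128_si256(lo);       // upper half undefined...
  return _mm256_insertf128_si256(r, hi, 1);     // ...until repacked here
}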
-
-/* natural logarithm computed for 8 simultaneous float
+/* natural logarithm computed for 8 simultaneous floats;
return NaN for x <= 0
*/
v8sf log256_ps(v8sf x) {
v8si imm0;
- v8sf one = *(v8sf*)_ps256_1;
+ v8sf one = *(v8sf *)_ps256_1;
- //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
+ // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
- x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */
+ x = _mm256_max_ps(
+ x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
// can be done with AVX2
imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
/* keep only the fractional part */
- x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
- x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);
+ x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
+ x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
// this is again another AVX2 instruction
- imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+ imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
v8sf e = _mm256_cvtepi32_ps(imm0);
e = _mm256_add_ps(e, one);
- /* part2:
+ /* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
- //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
- v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+ // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
+ v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
v8sf tmp = _mm256_and_ps(x, mask);
x = _mm256_sub_ps(x, one);
e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
x = _mm256_add_ps(x, tmp);
- v8sf z = _mm256_mul_ps(x,x);
+ v8sf z = _mm256_mul_ps(x, x);
- v8sf y = *(v8sf*)_ps256_cephes_log_p0;
+ v8sf y = *(v8sf *)_ps256_cephes_log_p0;
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
y = _mm256_mul_ps(y, x);
y = _mm256_mul_ps(y, z);
-
- tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
- y = _mm256_add_ps(y, tmp);
+ tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+ y = _mm256_add_ps(y, tmp);
- tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+ tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
y = _mm256_sub_ps(y, tmp);
- tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
+ tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
x = _mm256_add_ps(x, y);
x = _mm256_add_ps(x, tmp);
- x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
+ x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
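For reference, the scalar algorithm that log256_ps vectorizes (a paraphrase of the cephes routine, not code from the tree): split x into mantissa m in [0.5, 1) and exponent e, evaluate a degree-8 polynomial in m - 1, then add e * ln(2) back in two parts (q2 + q1) for precision:

#include <cmath>

// Scalar outline of the cephes log above (illustrative; the SIMD code
// evaluates the same polynomial with Horner's scheme on 8 lanes).
float log_cephes(float x) {
  if (x <= 0.f) return NAN;  // the SIMD version returns NaN via a mask
  int e;
  float m = std::frexp(x, &e);              // x = m * 2^e, m in [0.5, 1)
  if (m < 0.70710678f) { m += m; e -= 1; }  // SQRTHF branch keeps m near 1
  m -= 1.0f;
  // degree-8 minimax polynomial (coefficients _ps256_cephes_log_p0..p8)
  float p = 7.0376836292e-2f;
  const float c[] = {-1.1514610310e-1f, 1.1676998740e-1f, -1.2420140846e-1f,
                     1.4249322787e-1f,  -1.6668057665e-1f, 2.0000714765e-1f,
                     -2.4999993993e-1f, 3.3333331174e-1f};
  for (float k : c) p = p * m + k;
  float y = p * m * (m * m) - 0.5f * (m * m) + m;      // poly + -z/2 term
  return y + e * 0.693359375f + e * -2.12194440e-4f;   // e*ln2 as q2 + q1
}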
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
+_PS256_CONST(exp_hi, 88.3762626647949f);
+_PS256_CONST(exp_lo, -88.3762626647949f);
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
@@ -250,45 +253,45 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf exp256_ps(v8sf x) {
v8sf tmp = _mm256_setzero_ps(), fx;
v8si imm0;
- v8sf one = *(v8sf*)_ps256_1;
+ v8sf one = *(v8sf *)_ps256_1;
- x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
- x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);
+ x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
+ x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
- fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
- fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);
+ fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
+ fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
/* how to perform a floorf with SSE: just below */
- //imm0 = _mm256_cvttps_epi32(fx);
- //tmp = _mm256_cvtepi32_ps(imm0);
-
+ // imm0 = _mm256_cvttps_epi32(fx);
+ // tmp = _mm256_cvtepi32_ps(imm0);
+
tmp = _mm256_floor_ps(fx);
/* if greater, subtract 1 */
- //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
- v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+ // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
+ v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
- tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
- v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
+ tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
+ v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
- z = _mm256_mul_ps(x,x);
-
- v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
+ z = _mm256_mul_ps(x, x);
+
+ v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
@@ -296,7 +299,7 @@ v8sf exp256_ps(v8sf x) {
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
// another two AVX2 instructions
- imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+ imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_slli_epi32(imm0, 23);
v8sf pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
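exp256_ps follows the range reduction spelled out in its comments: write x = g + n*log(2) with n = round(x / ln 2), evaluate a degree-5 polynomial for exp(g) on the small remainder, then multiply by 2^n, built by shifting n into the float exponent field. A scalar sketch of the same steps (p5 is quoted in the hunk above; p0..p4 are the standard cephes values, stated here from memory):

#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar outline of exp256_ps's range reduction (illustrative).
float exp_cephes(float x) {
  x = std::fmin(x, 88.3762626647949f);   // exp_hi / exp_lo clamps
  x = std::fmax(x, -88.3762626647949f);
  float n = std::floor(x * 1.44269504088896341f + 0.5f);  // round(x/ln2)
  // remove n*ln2 in two parts (C1 + C2) so the remainder stays accurate
  float g = x - n * 0.693359375f - n * -2.12194440e-4f;
  // degree-5 polynomial for exp(g), coefficients cephes_exp_p0..p5
  float y = 1.9875691500e-4f;
  const float c[] = {1.3981999507e-3f, 8.3334519073e-3f, 4.1665795894e-2f,
                     1.6666665459e-1f, 5.0000001201e-1f};
  for (float k : c) y = y * g + k;
  y = y * (g * g) + g + 1.0f;
  // build 2^n by placing n + 127 into the exponent bits
  int32_t bits = (static_cast<int32_t>(n) + 0x7f) << 23;
  float pow2n;
  std::memcpy(&pow2n, &bits, sizeof(bits));
  return y * pow2n;
}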
@@ -307,13 +310,12 @@ _PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
-_PS256_CONST(sincof_p1, 8.3321608736E-3);
+_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
-_PS256_CONST(coscof_p0, 2.443315711809948E-005);
+_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
-_PS256_CONST(coscof_p2, 4.166664568298827E-002);
-_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-
+_PS256_CONST(coscof_p2, 4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 8 sines at once using AVX intrinsics
@@ -327,7 +329,7 @@ _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
surprising but correct result.
*/
-v8sf sin256_ps(v8sf x) { // any x
+v8sf sin256_ps(v8sf x) { // any x
v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
v8si imm0, imm2;
@@ -338,78 +340,78 @@ v8sf sin256_ps(v8sf x) { // any x
sign_bit = x;
/* take the absolute value */
- x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+ x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
/* extract the sign bit (upper one) */
- sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
-
+ sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+
/* scale by 4/Pi */
- y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+ y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
- /*
- Here we start a series of integer operations, which are in the
- realm of AVX2.
- If we don't have AVX, let's perform them using SSE2 directives
- */
+/*
+ Here we start a series of integer operations, which are in the
+ realm of AVX2.
+ If we don't have AVX, let's perform them using SSE2 directives
+*/
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
// another two AVX2 instruction
- imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
- imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+ imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+ imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
y = _mm256_cvtepi32_ps(imm2);
/* get the swap sign flag */
- imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
+ imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_slli_epi32(imm0, 29);
- /* get the polynom selection mask
+ /* get the polynomial selection mask
there is one polynomial for 0 <= x <= Pi/4
and another one for Pi/4 < x <= Pi/2 */
diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/cuda/src/hl_avx_functions.cc
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
#include <immintrin.h>
#include "hl_functions.h"
namespace hppl {
- extern __m256 exp(__m256 a);
+extern __m256 exp(__m256 a);
- __m256 relu(const __m256 a) {
- __m256 tmp = _mm256_set1_ps(0.0f);
- return _mm256_max_ps(a, tmp);
- }
+__m256 relu(const __m256 a) {
+ __m256 tmp = _mm256_set1_ps(0.0f);
+ return _mm256_max_ps(a, tmp);
+}
- __m256 sigmoid(const __m256 a) {
- __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
- __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
- __m256 tmp = _mm256_max_ps(a, min);
- tmp = _mm256_min_ps(tmp, max);
- tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
- tmp = exp(tmp);
- tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
- tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
- return tmp;
- }
+__m256 sigmoid(const __m256 a) {
+ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+ __m256 tmp = _mm256_max_ps(a, min);
+ tmp = _mm256_min_ps(tmp, max);
+ tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
+ tmp = exp(tmp);
+ tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
+ tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
+ return tmp;
+}
- __m256 tanh(const __m256 a) {
- __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
- __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
- tmp = _mm256_min_ps(tmp, max);
- tmp = exp(tmp);
- return _mm256_sub_ps(
- _mm256_div_ps(_mm256_set1_ps(2.0f),
- _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), _mm256_set1_ps(1.0f));
- }
+__m256 tanh(const __m256 a) {
+ __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
+ __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+ tmp = _mm256_min_ps(tmp, max);
+ tmp = exp(tmp);
+ return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
+ _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
+ _mm256_set1_ps(1.0f));
+}
- __m256 linear(const __m256 a) {
- return a;
- }
+__m256 linear(const __m256 a) { return a; }
- __m256 relu(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(a,
+__m256 relu(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(
+ a,
_mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
- _mm256_set1_ps(1.0f)));
- }
+ _mm256_set1_ps(1.0f)));
+}
- __m256 sigmoid(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(_mm256_mul_ps(a, b),
- _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
- }
+__m256 sigmoid(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(_mm256_mul_ps(a, b),
+ _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
+}
- __m256 tanh(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(a,
- _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
- }
+__m256 tanh(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(
+ a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
+}
- __m256 linear(const __m256 a, const __m256 b) {
- return a;
- }
+__m256 linear(const __m256 a, const __m256 b) { return a; }
} // namespace hppl
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index b8352c2d53..af00f352e5 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -12,46 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include <math.h>
#include "hl_functions.h"
namespace hppl {
- real relu(const real a) {
- return a > 0.0f ? a : 0.0f;
- }
-
- real sigmoid(const real a) {
- const real min = SIGMOID_THRESHOLD_MIN;
- const real max = SIGMOID_THRESHOLD_MAX;
- real tmp = (a < min) ? min : ((a > max) ? max : a);
- return 1.0 / (1.0 + exp(-tmp));
- }
-
- real tanh(const real a) {
- real tmp = -2.0 * a;
- tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
- return (2.0 / (1.0 + exp(tmp))) - 1.0;
- }
-
- real linear(const real a) {
- return a;
- }
-
- real relu(const real a, const real b) {
- return a * (b > 0.0f ? 1.0f : 0.0f);
- }
-
- real sigmoid(const real a, const real b) {
- return a * b * (1 - b);
- }
-
- real tanh(const real a, const real b) {
- return a * (1.0f - b * b);
- }
-
- real linear(const real a, const real b) {
- return a;
- }
+real relu(const real a) { return a > 0.0f ? a : 0.0f; }
+
+real sigmoid(const real a) {
+ const real min = SIGMOID_THRESHOLD_MIN;
+ const real max = SIGMOID_THRESHOLD_MAX;
+ real tmp = (a < min) ? min : ((a > max) ? max : a);
+ return 1.0 / (1.0 + exp(-tmp));
+}
+
+real tanh(const real a) {
+ real tmp = -2.0 * a;
+ tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+ return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+
+real linear(const real a) { return a; }
+
+real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); }
+
+real sigmoid(const real a, const real b) { return a * b * (1 - b); }
+
+real tanh(const real a, const real b) { return a * (1.0f - b * b); }
+
+real linear(const real a, const real b) { return a; }
} // namespace hppl
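Reading the pairs above: the one-argument overloads are the forward activations, and the two-argument overloads are their backward forms, computing grad_out * f'(x) with the derivative expressed through the saved forward output b (sigmoid(a, b) = a * b * (1 - b), tanh(a, b) = a * (1 - b^2)). A usage sketch under that reading, assuming hl_functions.h declares the hppl overloads as the include above suggests:

#include "hl_functions.h"  // hppl::tanh overloads and the `real` typedef

// Backward pass through one tanh unit: reuse the saved forward output
// so the derivative never recomputes tanh.
real tanh_backward(real grad_out, real saved_out) {
  // grad_out * (1 - saved_out^2) via the two-argument overload
  return hppl::tanh(grad_out, saved_out);
}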
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index f16376ec93..e8ba232d44 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include <sys/time.h>
#include <mutex>
#include "hl_cuda.h"
@@ -24,7 +23,7 @@ limitations under the License. */
namespace dynload {
std::once_flag cublas_dso_flag;
-void* cublas_dso_handle = nullptr;
+void *cublas_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -34,31 +33,30 @@ void* cublas_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- cublasStatus_t operator()(Args... args) { \
- typedef cublasStatus_t (*cublasFunc)(Args...); \
- std::call_once(cublas_dso_flag, GetCublasDsoHandle, \
- &cublas_dso_handle); \
- void* p_##__name = dlsym(cublas_dso_handle, #__name); \
- return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
- } \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ cublasStatus_t operator()(Args... args) { \
+ typedef cublasStatus_t (*cublasFunc)(Args...); \
+ std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
+ void *p_##__name = dlsym(cublas_dso_handle, #__name); \
+ return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
+ } \
} __name; // struct DynLoad__##__name
#else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- cublasStatus_t operator()(Args... args) { \
- return __name(args...); \
- } \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ cublasStatus_t operator()(Args... args) { \
+ return __name(args...); \
+ } \
} __name; // struct DynLoad__##__name
#endif
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
- DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
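The DYNAMIC_LOAD_CUBLAS_WRAP machinery reflowed here generates, per cuBLAS symbol, a functor that lazily resolves the function on first call. Stripped of the macro, the generated struct looks roughly like this; the globals come from the top of this file, and the GetCublasDsoHandle signature is assumed from its usage:

#include <cublas_v2.h>
#include <dlfcn.h>
#include <mutex>

// Declared earlier in this file:
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
void GetCublasDsoHandle(void **handle);  // loader; signature assumed

// What DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemm) expands to, approximately
// (the macro also declares an instance named cublasSgemm):
struct DynLoad__cublasSgemm {
  template <typename... Args>
  cublasStatus_t operator()(Args... args) {
    typedef cublasStatus_t (*cublasFunc)(Args...);
    // load libcublas exactly once; afterwards the cached handle is reused
    std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle);
    void *p = dlsym(cublas_dso_handle, "cublasSgemm");
    return reinterpret_cast<cublasFunc>(p)(args...);
  }
};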
// include all needed cublas functions in HPPL
+// clang-format off
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSgemv) \
__macro(cublasDgemv) \
@@ -88,41 +86,41 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
} /* namespace dynload */
-
+// clang-format on
#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM dynload::cublasSgeam
-#define CUBLAS_GEMV dynload::cublasSgemv
-#define CUBLAS_GEMM dynload::cublasSgemm
-#define CUBLAS_GETRF dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI dynload::cublasSgetriBatched
+#define CUBLAS_GEAM dynload::cublasSgeam
+#define CUBLAS_GEMV dynload::cublasSgemv
+#define CUBLAS_GEMM dynload::cublasSgemm
+#define CUBLAS_GETRF dynload::cublasSgetrfBatched
+#define CUBLAS_GETRI dynload::cublasSgetriBatched
#else
-#define CUBLAS_GEAM dynload::cublasDgeam
-#define CUBLAS_GEMV dynload::cublasDgemv
-#define CUBLAS_GEMM dynload::cublasDgemm
-#define CUBLAS_GETRF dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI dynload::cublasDgetriBatched
+#define CUBLAS_GEAM dynload::cublasDgeam
+#define CUBLAS_GEMV dynload::cublasDgemv
+#define CUBLAS_GEMM dynload::cublasDgemm
+#define CUBLAS_GETRF dynload::cublasDgetrfBatched
+#define CUBLAS_GETRI dynload::cublasDgetriBatched
#endif
-const char* hl_cublas_get_error_string(cublasStatus_t status) {
- switch(status) {
- case CUBLAS_STATUS_NOT_INITIALIZED:
- return "[cublas status]: not initialized";
- case CUBLAS_STATUS_ALLOC_FAILED:
- return "[cublas status]: allocate failed";
- case CUBLAS_STATUS_INVALID_VALUE:
- return "[cublas status]: invalid value";
- case CUBLAS_STATUS_ARCH_MISMATCH:
- return "[cublas status]: arch mismatch";
- case CUBLAS_STATUS_MAPPING_ERROR:
- return "[cublas status]: mapping error";
- case CUBLAS_STATUS_EXECUTION_FAILED:
- return "[cublas status]: execution failed";
- case CUBLAS_STATUS_INTERNAL_ERROR:
- return "[cublas status]: internal error";
- case CUBLAS_STATUS_SUCCESS:
- return "[cublas status]: success";
- default:
- return "[cublas status]: unknown error";
+const char *hl_cublas_get_error_string(cublasStatus_t status) {
+ switch (status) {
+ case CUBLAS_STATUS_NOT_INITIALIZED:
+ return "[cublas status]: not initialized";
+ case CUBLAS_STATUS_ALLOC_FAILED:
+ return "[cublas status]: allocate failed";
+ case CUBLAS_STATUS_INVALID_VALUE:
+ return "[cublas status]: invalid value";
+ case CUBLAS_STATUS_ARCH_MISMATCH:
+ return "[cublas status]: arch mismatch";
+ case CUBLAS_STATUS_MAPPING_ERROR:
+ return "[cublas status]: mapping error";
+ case CUBLAS_STATUS_EXECUTION_FAILED:
+ return "[cublas status]: execution failed";
+ case CUBLAS_STATUS_INTERNAL_ERROR:
+ return "[cublas status]: internal error";
+ case CUBLAS_STATUS_SUCCESS:
+ return "[cublas status]: success";
+ default:
+ return "[cublas status]: unknown error";
}
}
@@ -131,27 +129,21 @@ const char* hl_cublas_get_error_string(cublasStatus_t status) {
* support the << operator for more detailed error info.
*/
cublasStatus_t g_cublasStat;
-#define CHECK_CUBLAS(cublas_func) \
- g_cublasStat = cublas_func; \
- CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
- << "Cublas Error: " \
- << hl_cublas_get_error_string(g_cublasStat) \
- << " "
+#define CHECK_CUBLAS(cublas_func) \
+ g_cublasStat = cublas_func; \
+ CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
+ << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
- << "[cublas init] Cublas create handle faild!";
+ << "[cublas init] Cublas create handle faild!";
CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
- << "[cublas init] Cublas set stream faild!";
+ << "[cublas init] Cublas set stream faild!";
}
-void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc) {
+void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
real alpha = 1.0;
real beta = 0.0;
@@ -159,11 +151,18 @@ void hl_matrix_transpose(real *A_d,
CHECK_NOTNULL(C_d);
CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
- CUBLAS_OP_T, CUBLAS_OP_N,
- dimM, dimN,
- &alpha, A_d, lda,
- &beta, nullptr, dimM,
- C_d, ldc));
+ CUBLAS_OP_T,
+ CUBLAS_OP_N,
+ dimM,
+ dimN,
+ &alpha,
+ A_d,
+ lda,
+ &beta,
+ nullptr,
+ dimM,
+ C_d,
+ ldc));
CHECK_SYNC("hl_matrix_transpose failed");
}
@@ -181,21 +180,20 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
real **inout_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(inout_d, inout_h, sizeof(real *));
- int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
+ int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
int *info_d = (int *)t_resource.gpu_mem;
/* Note: cublasSgetrfBatched is used to calculate a number of
small-sized matrices. There may be a better way to reconstruct
the API for better performance.
*/
- CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
- dimN, inout_d, lda, pivot_d,
- info_d, 1));
+ CHECK_CUBLAS(
+ CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
- int info_h;
+ int info_h;
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
- LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
+ LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
}
/* Step 2: Compute the inverse of the matrix given its LU decomposition */
@@ -204,27 +202,40 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
hl_memcpy(out_d, out_h, sizeof(real *));
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
- dimN, (const real **)inout_d, lda, pivot_d,
- out_d, ldc, info_d, 1));
+ dimN,
+ (const real **)inout_d,
+ lda,
+ pivot_d,
+ out_d,
+ ldc,
+ info_d,
+ 1));
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
- LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
+ LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
}
hl_free_mem_device(inout_d);
hl_free_mem_device(pivot_d);
hl_free_mem_device(out_d);
-
+
CHECK_SYNC("hl_matrix_inverse failed");
}
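The note inside hl_matrix_inverse is worth spelling out: cuBLAS exposes inversion only through its batched LU interface, so inverting a single N x N matrix means building a device-side array holding one pointer, factorizing with cublasSgetrfBatched, then back-substituting with cublasSgetriBatched at batch size 1. A minimal standalone sketch of that call sequence (single precision, error handling trimmed to asserts):

#include <cassert>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Invert one n x n matrix A_d (device, column-major) into C_d via the
// batched LU API with batchSize = 1, mirroring hl_matrix_inverse above.
void invert_via_batched_lu(cublasHandle_t handle, float *A_d, float *C_d,
                           int n) {
  float **Aarr_d, **Carr_d;  // device-side arrays of device pointers
  int *pivot_d, *info_d;
  cudaMalloc(&Aarr_d, sizeof(float *));
  cudaMalloc(&Carr_d, sizeof(float *));
  cudaMalloc(&pivot_d, n * sizeof(int));
  cudaMalloc(&info_d, sizeof(int));
  cudaMemcpy(Aarr_d, &A_d, sizeof(float *), cudaMemcpyHostToDevice);
  cudaMemcpy(Carr_d, &C_d, sizeof(float *), cudaMemcpyHostToDevice);

  // Step 1: in-place LU factorization (A_d is overwritten).
  cublasStatus_t st =
      cublasSgetrfBatched(handle, n, Aarr_d, n, pivot_d, info_d, 1);
  assert(st == CUBLAS_STATUS_SUCCESS);
  // Step 2: back-substitute the LU factors; the inverse lands in C_d.
  st = cublasSgetriBatched(handle, n, (const float **)Aarr_d, n, pivot_d,
                           Carr_d, n, info_d, 1);
  assert(st == CUBLAS_STATUS_SUCCESS);

  int info_h;  // nonzero info means a zero pivot, i.e. a singular matrix
  cudaMemcpy(&info_h, info_d, sizeof(int), cudaMemcpyDeviceToHost);
  assert(info_h == 0);

  cudaFree(Aarr_d);
  cudaFree(Carr_d);
  cudaFree(pivot_d);
  cudaFree(info_d);
}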
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc) {
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
@@ -232,8 +243,8 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
int m = (transa == HPPL_OP_N) ? dimM : dimK;
int n = (transa == HPPL_OP_N) ? dimK : dimM;
- hl_matrix_mul_vector(A_d, transa, B_d, C_d, m, n,
- alpha, beta, lda, ldb, ldc);
+ hl_matrix_mul_vector(
+ A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
return;
}
@@ -241,8 +252,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
int m = (transb == HPPL_OP_N) ? dimK : dimN;
int n = (transb == HPPL_OP_N) ? dimN : dimK;
hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
- hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n,
- alpha, beta, ldb, 1, 1);
+ hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
return;
}
@@ -251,26 +261,47 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else {
LOG(FATAL) << "parameter transa error!";
}
@@ -278,24 +309,46 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
CHECK_SYNC("hl_matrix_mul failed");
}
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {
int lda = (HPPL_OP_N == transa) ? dimK : dimM;
int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
int ldc = dimN;
- hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN,
- dimK, alpha, beta, lda, ldb, ldc);
+ hl_matrix_mul(A_d,
+ transa,
+ B_d,
+ transb,
+ C_d,
+ dimM,
+ dimN,
+ dimK,
+ alpha,
+ beta,
+ lda,
+ ldb,
+ ldc);
}
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta,
- int lda, int incb, int incc) {
+void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta,
+ int lda,
+ int incb,
+ int incc) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
@@ -304,21 +357,29 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
if (HPPL_OP_N == trans) {
stat = CUBLAS_GEMV(t_resource.handle,
CUBLAS_OP_T,
- dimN, dimM,
+ dimN,
+ dimM,
&alpha,
- A_d, lda,
- B_d, incb,
+ A_d,
+ lda,
+ B_d,
+ incb,
&beta,
- C_d, incc);
+ C_d,
+ incc);
} else if (HPPL_OP_T == trans) {
stat = CUBLAS_GEMV(t_resource.handle,
CUBLAS_OP_N,
- dimN, dimM,
+ dimN,
+ dimM,
&alpha,
- A_d, lda,
- B_d, incb,
+ A_d,
+ lda,
+ B_d,
+ incb,
&beta,
- C_d, incc);
+ C_d,
+ incc);
} else {
LOG(FATAL) << "parameter transa error!";
}
@@ -327,10 +388,14 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
CHECK_SYNC("hl_matrix_mul_vector");
}
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta) {
- hl_matrix_mul_vector(A_d, trans, B_d, C_d, dimM, dimN,
- alpha, beta, dimN, 1, 1);
+void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta) {
+ hl_matrix_mul_vector(
+ A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
}
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 92b28e4345..9d4ff08a78 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include <sys/time.h>
#include <mutex>
#include "hl_cuda_cudnn.h"
@@ -22,9 +21,10 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
- "Specify cuDNN max workspace limit, in units MB, "
- "4096MB=4GB by default.");
+P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+ 4096,
+ "Specify cuDNN max workspace limit, in units MB, "
+ "4096MB=4GB by default.");
namespace dynload {
@@ -41,16 +41,15 @@ void* cudnn_dso_handle = nullptr;
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudnn_func = decltype(__name(args...))(*)(Args...); \
- std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \
- &cudnn_dso_handle); \
- void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
- return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
- } \
+#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ using cudnn_func = decltype(__name(args...)) (*)(Args...); \
+ std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
+ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
+ return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
+ } \
} __name; /* struct DynLoad__##__name */
#else
@@ -69,6 +68,7 @@ void* cudnn_dso_handle = nullptr;
* include all needed cudnn functions in HPPL
* different cudnn version has different interfaces
**/
+// clang-format off
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor) \
__macro(cudnnSetTensor4dDescriptorEx) \
@@ -141,58 +141,53 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#undef CUDNN_DNN_ROUTINE_EACH
-
+// clang-format on
} /* namespace dynload */
/**
* Check built-in cudnn functions using glog and it **does not**
* support the << operator for more detailed error info.
*/
-#define CHECK_CUDNN(cudnnFunc) \
- do { \
- cudnnStatus_t cudnnStat = cudnnFunc; \
- CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \
- << "Cudnn Error: " \
- << dynload::cudnnGetErrorString(cudnnStat); \
+#define CHECK_CUDNN(cudnnFunc) \
+ do { \
+ cudnnStatus_t cudnnStat = cudnnFunc; \
+ CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \
+ << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
} while (0)
bool g_is_libcudnn_init = false;
int g_cudnn_lib_version = 0;
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc)
-{
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
}
-void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream)
-{
- size_t cudnn_dso_ver = dynload::cudnnGetVersion();
- size_t cudnn_dso_major = cudnn_dso_ver / 1000;
- size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
-
- // Compare cudnn header version with that of cudnn.so.
- CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
- (cudnn_cuh_major == cudnn_dso_major))
- << "[cudnn init] libcudnn v" << cudnn_dso_major <<
- " with header v" << cudnn_cuh_major << " unmatched!\n"
- << "PaddlePaddle Requirement: "
- << "(header v[2-3] with libcudnn v[2-3]) Or "
- << "(header v4 with libcudnn v4) Or "
- << "(header v5 with libcudnn v5).";
-
- CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
- << "cudnn v5 requires cuda version >= 7.5";
-
- CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
- CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
-
- g_is_libcudnn_init = true;
- g_cudnn_lib_version = cudnn_dso_ver;
+void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
+ size_t cudnn_dso_ver = dynload::cudnnGetVersion();
+ size_t cudnn_dso_major = cudnn_dso_ver / 1000;
+ size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
+
+ // Compare cudnn header version with that of cudnn.so.
+ CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
+ (cudnn_cuh_major == cudnn_dso_major))
+ << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
+ << cudnn_cuh_major << " unmatched!\n"
+ << "PaddlePaddle Requirement: "
+ << "(header v[2-3] with libcudnn v[2-3]) Or "
+ << "(header v4 with libcudnn v4) Or "
+ << "(header v5 with libcudnn v5).";
+
+ CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
+ << "cudnn v5 requires cuda version >= 7.5";
+
+ CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
+ CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
+
+ g_is_libcudnn_init = true;
+ g_cudnn_lib_version = cudnn_dso_ver;
}
-int hl_get_cudnn_lib_version() {
- return g_cudnn_lib_version;
-}
+int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
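hl_cudnn_init, reformatted above, derives the major version by integer-dividing cudnnGetVersion() by 1000 (cuDNN encodes 5.0.5 as 5005) and insists that the header and runtime library majors match, except that v2 and v3 are treated as interchangeable. A small self-contained check of that logic, using made-up version numbers:

```cpp
#include <cassert>
#include <cstddef>

// Mirrors the compatibility rule in hl_cudnn_init: majors must match,
// with the v2/v3 pair grandfathered in as mutually acceptable.
bool cudnn_versions_match(size_t dso_ver, size_t header_ver) {
  size_t dso_major = dso_ver / 1000;        // e.g. 5005 -> 5
  size_t header_major = header_ver / 1000;  // e.g. 5000 -> 5
  return (header_major < 4 && dso_major < 4) || (header_major == dso_major);
}

int main() {
  assert(cudnn_versions_match(5005, 5000));   // v5 lib, v5 header
  assert(cudnn_versions_match(2000, 3007));   // v2/v3 mix is allowed
  assert(!cudnn_versions_match(5005, 4007));  // v5 lib, v4 header: rejected
  return 0;
}
```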
void hl_conv_workspace(hl_tensor_descriptor input,
hl_tensor_descriptor output,
@@ -206,94 +201,91 @@ void hl_conv_workspace(hl_tensor_descriptor input,
size_t* bwdFilterLimitBytes) {
#if CUDNN_VERSION >= 4000
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
-
- // Specify workspace limit directly
- size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
-
- // cudnn convolution forward configuration
- cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
- t_resource.cudnn_handle,
- fwd_src_desc,
- fwd_filter_desc,
- fwd_conv_desc,
- fwd_dest_desc,
- CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
- t_resource.cudnn_handle,
- fwd_src_desc,
- fwd_filter_desc,
- fwd_conv_desc,
- fwd_dest_desc,
- static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
- fwdLimitBytes));
-
- // cudnn convolution backward data configuration
- cudnnFilterDescriptor_t bwd_data_filter_desc =
- GET_FILTER_DESCRIPTOR(filter);
- cudnnTensorDescriptor_t bwd_data_diff_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bwd_data_grad_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnConvolutionDescriptor_t bwd_data_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
- t_resource.cudnn_handle,
- bwd_data_filter_desc,
- bwd_data_diff_desc,
- bwd_data_conv_desc,
- bwd_data_grad_desc,
- CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
- t_resource.cudnn_handle,
- bwd_data_filter_desc,
- bwd_data_diff_desc,
- bwd_data_conv_desc,
- bwd_data_grad_desc,
- static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
- bwdDataLimitBytes));
-
- // cudnn convolution backward filter configuration
- cudnnTensorDescriptor_t bwd_filter_src_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t bwd_filter_diff_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnFilterDescriptor_t bwd_filter_grad_desc =
- GET_FILTER_DESCRIPTOR(filter);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
- t_resource.cudnn_handle,
- bwd_filter_src_desc,
- bwd_filter_diff_desc,
- bwd_filter_conv_desc,
- bwd_filter_grad_desc,
- CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
- t_resource.cudnn_handle, bwd_filter_src_desc,
- bwd_filter_diff_desc, bwd_filter_conv_desc,
- bwd_filter_grad_desc,
- static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
- bwdFilterLimitBytes));
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+
+ // Specify workspace limit directly
+ size_t memoryLimitBytes =
+ (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+
+ // cudnn convolution forward configuration
+ cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+ t_resource.cudnn_handle,
+ fwd_src_desc,
+ fwd_filter_desc,
+ fwd_conv_desc,
+ fwd_dest_desc,
+ CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
+ t_resource.cudnn_handle,
+ fwd_src_desc,
+ fwd_filter_desc,
+ fwd_conv_desc,
+ fwd_dest_desc,
+ static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
+ fwdLimitBytes));
+
+ // cudnn convolution backward data configuration
+ cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+ GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+ t_resource.cudnn_handle,
+ bwd_data_filter_desc,
+ bwd_data_diff_desc,
+ bwd_data_conv_desc,
+ bwd_data_grad_desc,
+ CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+ t_resource.cudnn_handle,
+ bwd_data_filter_desc,
+ bwd_data_diff_desc,
+ bwd_data_conv_desc,
+ bwd_data_grad_desc,
+ static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
+ bwdDataLimitBytes));
+
+ // cudnn convolution backward filter configuration
+ cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+ GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+ t_resource.cudnn_handle,
+ bwd_filter_src_desc,
+ bwd_filter_diff_desc,
+ bwd_filter_conv_desc,
+ bwd_filter_grad_desc,
+ CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+ t_resource.cudnn_handle,
+ bwd_filter_src_desc,
+ bwd_filter_diff_desc,
+ bwd_filter_conv_desc,
+ bwd_filter_grad_desc,
+ static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
+ bwdFilterLimitBytes));
#endif
}
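The workspace limit above comes from the cudnn_conv_workspace_limit_in_mb flag, converted to bytes with a 64-bit shift so that values like 4096 MB do not overflow a 32-bit int; cuDNN then returns the fastest algorithm whose workspace fits under that byte budget (the *_SPECIFY_WORKSPACE_LIMIT preference). Just the conversion arithmetic, as a sketch:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // As in hl_conv_workspace: megabytes to bytes via (1LL << 20).
  // The 1LL literal forces 64-bit math; 4096 * 2^20 would overflow int32.
  int64_t limit_mb = 4096;
  int64_t limit_bytes = (1LL << 20) * limit_mb;
  printf("%lld\n", (long long)limit_bytes);  // 4294967296, i.e. 4 GiB
  return 0;
}
```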
@@ -302,78 +294,75 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
int batch_size,
int feature_maps,
int height,
- int width)
-{
- CHECK_NOTNULL(image_desc);
+ int width) {
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc =
- (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
- CHECK_NOTNULL(hl_desc);
+ cudnn_tensor_descriptor hl_desc =
+ (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+ CHECK_NOTNULL(hl_desc);
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- hl_desc->desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- batch_size,
- feature_maps,
- height,
- width));
-
- hl_desc->format = CUDNN_TENSOR_NCHW;
- hl_desc->data_type = data_type;
- hl_desc->batch_size = batch_size;
- hl_desc->feature_maps = feature_maps;
- hl_desc->height = height;
- hl_desc->width = width;
-
- *image_desc = (hl_tensor_descriptor)hl_desc;
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ batch_size,
+ feature_maps,
+ height,
+ width));
+
+ hl_desc->format = CUDNN_TENSOR_NCHW;
+ hl_desc->data_type = data_type;
+ hl_desc->batch_size = batch_size;
+ hl_desc->feature_maps = feature_maps;
+ hl_desc->height = height;
+ hl_desc->width = width;
+
+ *image_desc = (hl_tensor_descriptor)hl_desc;
}
void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
- CHECK_NOTNULL(image_desc);
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc =
- (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
- CHECK_NOTNULL(hl_desc);
+ cudnn_tensor_descriptor hl_desc =
+ (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+ CHECK_NOTNULL(hl_desc);
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
- hl_desc->data_type = data_type;
+ hl_desc->data_type = data_type;
- *image_desc = (hl_tensor_descriptor)hl_desc;
+ *image_desc = (hl_tensor_descriptor)hl_desc;
}
void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int batch_size,
int feature_maps,
int height,
- int width)
-{
- const int stride_w = 1;
- const int stride_h = width * stride_w;
- const int stride_c = height * stride_h;
- const int stride_n = feature_maps * stride_c;
- return hl_tensor_reshape(image_desc,
- batch_size,
- feature_maps,
- height,
- width,
- stride_n,
- stride_c,
- stride_h,
- stride_w);
+ int width) {
+ const int stride_w = 1;
+ const int stride_h = width * stride_w;
+ const int stride_c = height * stride_h;
+ const int stride_n = feature_maps * stride_c;
+ return hl_tensor_reshape(image_desc,
+ batch_size,
+ feature_maps,
+ height,
+ width,
+ stride_n,
+ stride_c,
+ stride_h,
+ stride_w);
}
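The four-argument hl_tensor_reshape above computes packed NCHW strides before delegating to the eight-argument overload: w varies fastest, rows advance by width, channels by height*width, and images by feature_maps*height*width. A quick check of that arithmetic:

```cpp
#include <cassert>

int main() {
  // Packed NCHW strides, exactly as derived in hl_tensor_reshape.
  int feature_maps = 3, height = 4, width = 5;
  const int stride_w = 1;
  const int stride_h = width * stride_w;         // 5
  const int stride_c = height * stride_h;        // 20
  const int stride_n = feature_maps * stride_c;  // 60 elements per image
  // Element (n, c, h, w) then lives at n*60 + c*20 + h*5 + w.
  assert(stride_n == feature_maps * height * width);
  return 0;
}
```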
void hl_tensor_reshape(hl_tensor_descriptor image_desc,
@@ -384,45 +373,42 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int nStride,
int cStride,
int hStride,
- int wStride)
-{
- CHECK_NOTNULL(image_desc);
-
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
- CHECK_NOTNULL(hl_desc->desc);
-
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
- hl_desc->data_type,
- batch_size,
- feature_maps,
- height,
- width,
- nStride,
- cStride,
- hStride,
- wStride));
-
- hl_desc->batch_size = batch_size;
- hl_desc->feature_maps = feature_maps;
- hl_desc->height = height;
- hl_desc->width = width;
+ int wStride) {
+ CHECK_NOTNULL(image_desc);
+
+ cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+ CHECK_NOTNULL(hl_desc->desc);
+
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
+ hl_desc->data_type,
+ batch_size,
+ feature_maps,
+ height,
+ width,
+ nStride,
+ cStride,
+ hStride,
+ wStride));
+
+ hl_desc->batch_size = batch_size;
+ hl_desc->feature_maps = feature_maps;
+ hl_desc->height = height;
+ hl_desc->width = width;
}
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc)
-{
- CHECK_NOTNULL(image_desc);
+void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
- CHECK_NOTNULL(hl_desc->desc);
+ cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+ CHECK_NOTNULL(hl_desc->desc);
- CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
+ CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
- hl_desc->desc = NULL;
+ hl_desc->desc = NULL;
- free(image_desc);
+ free(image_desc);
}
-
void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
hl_pooling_mode_t mode,
int height,
@@ -430,99 +416,93 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
int height_padding,
int width_padding,
int stride_height,
- int stride_width)
-{
- cudnnPoolingMode_t cudnn_mode;
- switch (mode)
- {
- case HL_POOLING_MAX:
- cudnn_mode = CUDNN_POOLING_MAX;
- break;
- case HL_POOLING_AVERAGE:
- cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
- break;
- case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
- cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
- break;
- default:
- LOG(FATAL) << "parameter mode error";
- }
-
- CHECK_NOTNULL(pooling_desc);
-
- cudnn_pooling_descriptor hl_pooling_desc =
- (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
- CHECK_NOTNULL(hl_pooling_desc);
-
- CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
-
- CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(
- hl_pooling_desc->desc,
- cudnn_mode,
+ int stride_width) {
+ cudnnPoolingMode_t cudnn_mode;
+ switch (mode) {
+ case HL_POOLING_MAX:
+ cudnn_mode = CUDNN_POOLING_MAX;
+ break;
+ case HL_POOLING_AVERAGE:
+ cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+ break;
+ case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
+ cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+ break;
+ default:
+ LOG(FATAL) << "parameter mode error";
+ }
+
+ CHECK_NOTNULL(pooling_desc);
+
+ cudnn_pooling_descriptor hl_pooling_desc =
+ (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
+ CHECK_NOTNULL(hl_pooling_desc);
+
+ CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
+
+ CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
+ cudnn_mode,
#if CUDNN_VERSION >= 5000
- CUDNN_PROPAGATE_NAN,
+ CUDNN_PROPAGATE_NAN,
#endif
- height,
- width,
- height_padding,
- width_padding,
- stride_height,
- stride_width));
-
- hl_pooling_desc->mode = cudnn_mode;
- hl_pooling_desc->window_height = height;
- hl_pooling_desc->window_width = width;
- hl_pooling_desc->stride_height = stride_height;
- hl_pooling_desc->stride_width = stride_width;
-
- *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
+ height,
+ width,
+ height_padding,
+ width_padding,
+ stride_height,
+ stride_width));
+
+ hl_pooling_desc->mode = cudnn_mode;
+ hl_pooling_desc->window_height = height;
+ hl_pooling_desc->window_width = width;
+ hl_pooling_desc->stride_height = stride_height;
+ hl_pooling_desc->stride_width = stride_width;
+
+ *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
}
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc)
-{
- CHECK_NOTNULL(pooling_desc);
+void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
+ CHECK_NOTNULL(pooling_desc);
- cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
- CHECK_NOTNULL(hl_pooling->desc);
+ cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
- CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
+ CHECK_NOTNULL(hl_pooling->desc);
+ CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
- hl_pooling->desc = NULL;
+ hl_pooling->desc = NULL;
- free(pooling_desc);
+ free(pooling_desc);
}
void hl_pooling_forward(hl_tensor_descriptor input,
real* input_image,
hl_tensor_descriptor output,
real* output_image,
- hl_pooling_descriptor pooling)
-{
- cudnnPoolingDescriptor_t pooling_desc;
- cudnnTensorDescriptor_t input_desc;
- cudnnTensorDescriptor_t output_desc;
-
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(pooling);
- CHECK_NOTNULL(input_image);
- CHECK_NOTNULL(output_image);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- input_desc = ((cudnn_tensor_descriptor)input)->desc;
- output_desc = ((cudnn_tensor_descriptor)output)->desc;
- pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
- CHECK_CUDNN(dynload::cudnnPoolingForward(
- t_resource.cudnn_handle,
- pooling_desc,
- &alpha,
- input_desc,
- input_image,
- &beta,
- output_desc,
- output_image));
- CHECK_SYNC("hl_pooling_forward failed");
+ hl_pooling_descriptor pooling) {
+ cudnnPoolingDescriptor_t pooling_desc;
+ cudnnTensorDescriptor_t input_desc;
+ cudnnTensorDescriptor_t output_desc;
+
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(pooling);
+ CHECK_NOTNULL(input_image);
+ CHECK_NOTNULL(output_image);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ input_desc = ((cudnn_tensor_descriptor)input)->desc;
+ output_desc = ((cudnn_tensor_descriptor)output)->desc;
+ pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+ CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
+ pooling_desc,
+ &alpha,
+ input_desc,
+ input_image,
+ &beta,
+ output_desc,
+ output_image));
+ CHECK_SYNC("hl_pooling_forward failed");
}
void hl_pooling_backward(hl_tensor_descriptor input,
@@ -531,94 +511,87 @@ void hl_pooling_backward(hl_tensor_descriptor input,
hl_tensor_descriptor output,
real* output_image,
real* output_image_grad,
- hl_pooling_descriptor pooling)
-{
- cudnnPoolingDescriptor_t pooling_desc;
- cudnnTensorDescriptor_t input_desc;
- cudnnTensorDescriptor_t output_desc;
-
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(pooling);
- CHECK_NOTNULL(input_image);
- CHECK_NOTNULL(input_image_grad);
- CHECK_NOTNULL(output_image);
- CHECK_NOTNULL(output_image_grad);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- input_desc = ((cudnn_tensor_descriptor)input)->desc;
- output_desc = ((cudnn_tensor_descriptor)output)->desc;
- pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
- CHECK_CUDNN(dynload::cudnnPoolingBackward(
- t_resource.cudnn_handle,
- pooling_desc,
- &alpha,
- output_desc,
- output_image,
- output_desc,
- output_image_grad,
- input_desc,
- input_image,
- &beta,
- input_desc,
- input_image_grad));
+ hl_pooling_descriptor pooling) {
+ cudnnPoolingDescriptor_t pooling_desc;
+ cudnnTensorDescriptor_t input_desc;
+ cudnnTensorDescriptor_t output_desc;
+
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(pooling);
+ CHECK_NOTNULL(input_image);
+ CHECK_NOTNULL(input_image_grad);
+ CHECK_NOTNULL(output_image);
+ CHECK_NOTNULL(output_image_grad);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ input_desc = ((cudnn_tensor_descriptor)input)->desc;
+ output_desc = ((cudnn_tensor_descriptor)output)->desc;
+ pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+ CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
+ pooling_desc,
+ &alpha,
+ output_desc,
+ output_image,
+ output_desc,
+ output_image_grad,
+ input_desc,
+ input_image,
+ &beta,
+ input_desc,
+ input_image_grad));
CHECK_SYNC("hl_pooling_backward failed");
}
-
void hl_create_filter_descriptor(hl_filter_descriptor* filter,
int input_feature_maps,
int output_feature_maps,
int height,
- int width)
-{
- CHECK_NOTNULL(filter);
+ int width) {
+ CHECK_NOTNULL(filter);
- cudnn_filter_descriptor hl_filter =
- (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
- CHECK_NOTNULL(hl_filter);
+ cudnn_filter_descriptor hl_filter =
+ (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
+ CHECK_NOTNULL(hl_filter);
- CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
+ CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(
- hl_filter->desc,
- data_type,
+ CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
+ data_type,
#if CUDNN_VERSION >= 5000
- CUDNN_TENSOR_NCHW,
+ CUDNN_TENSOR_NCHW,
#endif
- output_feature_maps,
- input_feature_maps,
- height,
- width));
-
- hl_filter->data_type = data_type;
- hl_filter->output_feature_maps = output_feature_maps;
- hl_filter->input_feature_maps = input_feature_maps;
- hl_filter->filter_height = height;
- hl_filter->filter_width = width;
-
- *filter = (hl_filter_descriptor)hl_filter;
+ output_feature_maps,
+ input_feature_maps,
+ height,
+ width));
+
+ hl_filter->data_type = data_type;
+ hl_filter->output_feature_maps = output_feature_maps;
+ hl_filter->input_feature_maps = input_feature_maps;
+ hl_filter->filter_height = height;
+ hl_filter->filter_width = width;
+
+ *filter = (hl_filter_descriptor)hl_filter;
}
+void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
+ CHECK_NOTNULL(filter);
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter)
-{
- CHECK_NOTNULL(filter);
+ cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
+ CHECK_NOTNULL(hl_filter->desc);
- cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
- CHECK_NOTNULL(hl_filter->desc);
+ CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
- CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
+ hl_filter->desc = NULL;
- hl_filter->desc = NULL;
-
- free(filter);
+ free(filter);
}
void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
@@ -627,38 +600,36 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
- int stride_width)
-{
- CHECK_NOTNULL(conv);
-
- cudnn_convolution_descriptor hl_conv =
- (cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor));
- CHECK_NOTNULL(hl_conv);
-
- CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
-
- cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
- CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
- hl_conv->desc,
- padding_height,
- padding_width,
- stride_height,
- stride_width,
- 1,
- 1,
- mode));
-
- hl_conv->input_image = image;
- hl_conv->filter = filter;
- hl_conv->padding_height = padding_height;
- hl_conv->padding_width = padding_width;
- hl_conv->stride_height = stride_height;
- hl_conv->stride_width = stride_width;
- hl_conv->upscalex = 1;
- hl_conv->upscaley = 1;
- hl_conv->mode = mode;
-
- *conv = (hl_convolution_descriptor)hl_conv;
+ int stride_width) {
+ CHECK_NOTNULL(conv);
+
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
+ sizeof(_cudnn_convolution_descriptor));
+
+ CHECK_NOTNULL(hl_conv);
+ CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
+
+ cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+ CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+ padding_height,
+ padding_width,
+ stride_height,
+ stride_width,
+ 1,
+ 1,
+ mode));
+
+ hl_conv->input_image = image;
+ hl_conv->filter = filter;
+ hl_conv->padding_height = padding_height;
+ hl_conv->padding_width = padding_width;
+ hl_conv->stride_height = stride_height;
+ hl_conv->stride_width = stride_width;
+ hl_conv->upscalex = 1;
+ hl_conv->upscaley = 1;
+ hl_conv->mode = mode;
+
+ *conv = (hl_convolution_descriptor)hl_conv;
}
void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
@@ -667,47 +638,44 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
- int stride_width)
-{
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(image);
- CHECK_NOTNULL(filter);
-
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
- CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
- conv_desc,
- padding_height,
- padding_width,
- stride_height,
- stride_width,
- 1,
- 1,
- mode));
-
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
- hl_conv->input_image = image;
- hl_conv->filter = filter;
- hl_conv->padding_height = padding_height;
- hl_conv->padding_width = padding_width;
- hl_conv->stride_height = stride_height;
- hl_conv->stride_width = stride_width;
- hl_conv->upscalex = 1;
- hl_conv->upscaley = 1;
- hl_conv->mode = mode;
+ int stride_width) {
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(image);
+ CHECK_NOTNULL(filter);
+
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+ CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+ padding_height,
+ padding_width,
+ stride_height,
+ stride_width,
+ 1,
+ 1,
+ mode));
+
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+ hl_conv->input_image = image;
+ hl_conv->filter = filter;
+ hl_conv->padding_height = padding_height;
+ hl_conv->padding_width = padding_width;
+ hl_conv->stride_height = stride_height;
+ hl_conv->stride_width = stride_width;
+ hl_conv->upscalex = 1;
+ hl_conv->upscaley = 1;
+ hl_conv->mode = mode;
}
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv)
-{
- CHECK_NOTNULL(conv);
+void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
+ CHECK_NOTNULL(conv);
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
- CHECK_NOTNULL(hl_conv->desc);
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+ CHECK_NOTNULL(hl_conv->desc);
- CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
- hl_conv->desc = NULL;
+ CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
+ hl_conv->desc = NULL;
- free(conv);
+ free(conv);
}
void hl_convolution_forward(hl_tensor_descriptor input,
@@ -720,87 +688,83 @@ void hl_convolution_forward(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convFwdAlgo) {
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(input_data);
- CHECK_NOTNULL(output_data);
- CHECK_NOTNULL(filter_data);
- cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- real alpha = 1.0f;
- real beta = 1.0f;
- CHECK_CUDNN(dynload::cudnnConvolutionForward(
- t_resource.cudnn_handle,
- &alpha,
- src_desc,
- input_data,
- filter_desc,
- filter_data,
- conv_desc,
- static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
- gpuWorkSpace,
- sizeInBytes,
- &beta,
- dest_desc,
- output_data));
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(input_data);
+ CHECK_NOTNULL(output_data);
+ CHECK_NOTNULL(filter_data);
+ cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ CHECK_CUDNN(dynload::cudnnConvolutionForward(
+ t_resource.cudnn_handle,
+ &alpha,
+ src_desc,
+ input_data,
+ filter_desc,
+ filter_data,
+ conv_desc,
+ static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
+ &beta,
+ dest_desc,
+ output_data));
CHECK_SYNC("hl_convolution_forward failed");
}
void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
real* bias_data,
hl_tensor_descriptor output,
- real* output_data)
-{
- CHECK_NOTNULL(bias);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(bias_data);
- CHECK_NOTNULL(output_data);
-
- cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
- real alpha = 1.0f;
- real beta = 1.0f;
-
- CHECK_CUDNN(dynload::cudnnAddTensor(
- t_resource.cudnn_handle,
+ real* output_data) {
+ CHECK_NOTNULL(bias);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(bias_data);
+ CHECK_NOTNULL(output_data);
+
+ cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+ real alpha = 1.0f;
+ real beta = 1.0f;
+
+ CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
#if CUDNN_VERSION < 4000
- CUDNN_ADD_SAME_C,
+ CUDNN_ADD_SAME_C,
#endif
- &alpha,
- bias_desc,
- bias_data,
- &beta,
- output_desc,
- output_data));
+ &alpha,
+ bias_desc,
+ bias_data,
+ &beta,
+ output_desc,
+ output_data));
CHECK_SYNC("hl_convolution_forward_add_bias failed");
}
void hl_convolution_backward_bias(hl_tensor_descriptor bias,
real* bias_grad_data,
hl_tensor_descriptor output,
- real* output_grad_data)
-{
- CHECK_NOTNULL(bias);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(bias_grad_data);
- CHECK_NOTNULL(output_grad_data);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(
- t_resource.cudnn_handle,
- &alpha,
- diff_desc,
- output_grad_data,
- &beta,
- bias_desc,
- bias_grad_data));
+ real* output_grad_data) {
+ CHECK_NOTNULL(bias);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(bias_grad_data);
+ CHECK_NOTNULL(output_grad_data);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
+ &alpha,
+ diff_desc,
+ output_grad_data,
+ &beta,
+ bias_desc,
+ bias_grad_data));
CHECK_SYNC("hl_convolution_backward_bias failed");
}
@@ -814,38 +778,37 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdFilterAlgo) {
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(input_data);
+ CHECK_NOTNULL(output_grad_data);
+ CHECK_NOTNULL(filter_grad_data);
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(input_data);
- CHECK_NOTNULL(output_grad_data);
- CHECK_NOTNULL(filter_grad_data);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
- t_resource.cudnn_handle,
- &alpha,
- src_desc,
- input_data,
- diff_desc,
- output_grad_data,
- conv_desc,
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
+ t_resource.cudnn_handle,
+ &alpha,
+ src_desc,
+ input_data,
+ diff_desc,
+ output_grad_data,
+ conv_desc,
#if CUDNN_VERSION >= 4000
- static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
- gpuWorkSpace,
- sizeInBytes,
+ static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
#endif
- &beta,
- grad_desc,
- filter_grad_data));
+ &beta,
+ grad_desc,
+ filter_grad_data));
CHECK_SYNC("hl_convolution_backward_filter failed");
}
@@ -859,121 +822,111 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdDataAlgo) {
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
- t_resource.cudnn_handle,
- &alpha,
- filter_desc,
- filter_data,
- diff_desc,
- output_grad_data,
- conv_desc,
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
+ t_resource.cudnn_handle,
+ &alpha,
+ filter_desc,
+ filter_data,
+ diff_desc,
+ output_grad_data,
+ conv_desc,
#if CUDNN_VERSION >= 4000
- static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
- gpuWorkSpace,
- sizeInBytes,
+ static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
#endif
- &beta,
- grad_desc,
- input_data_grad));
+ &beta,
+ grad_desc,
+ input_data_grad));
CHECK_SYNC("hl_convolution_backward_data failed");
}
-
-void hl_softmax_forward(real *input,
- real *output,
- int height,
- int width)
-{
+void hl_softmax_forward(real* input, real* output, int height, int width) {
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- t_resource.cudnn_desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- height,
- width,
- 1,
- 1));
-
- real alpha = 1.0f;
- real beta = 0.0f;
- CHECK_CUDNN(dynload::cudnnSoftmaxForward(
- t_resource.cudnn_handle,
- CUDNN_SOFTMAX_ACCURATE,
- CUDNN_SOFTMAX_MODE_CHANNEL,
- &alpha,
- t_resource.cudnn_desc,
- input,
- &beta,
- t_resource.cudnn_desc,
- output));
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ height,
+ width,
+ 1,
+ 1));
+
+ real alpha = 1.0f;
+ real beta = 0.0f;
+ CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ t_resource.cudnn_desc,
+ input,
+ &beta,
+ t_resource.cudnn_desc,
+ output));
CHECK_SYNC("hl_softmax_forward failed");
}
-void hl_softmax_backward(real *output_value,
- real *output_grad,
+void hl_softmax_backward(real* output_value,
+ real* output_grad,
int height,
- int width)
-{
+ int width) {
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- t_resource.cudnn_desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- height,
- width,
- 1,
- 1));
-
- real alpha = 1.0f;
- real beta = 0.0f;
- CHECK_CUDNN(dynload::cudnnSoftmaxBackward(
- t_resource.cudnn_handle,
- CUDNN_SOFTMAX_ACCURATE,
- CUDNN_SOFTMAX_MODE_CHANNEL,
- &alpha,
- t_resource.cudnn_desc,
- output_value,
- t_resource.cudnn_desc,
- output_grad,
- &beta,
- t_resource.cudnn_desc,
- output_grad));
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ height,
+ width,
+ 1,
+ 1));
+
+ real alpha = 1.0f;
+ real beta = 0.0f;
+ CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ t_resource.cudnn_desc,
+ output_value,
+ t_resource.cudnn_desc,
+ output_grad,
+ &beta,
+ t_resource.cudnn_desc,
+ output_grad));
CHECK_SYNC("hl_softmax_backward failed");
}
void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar) {
+ real* savedMean,
+ real* savedVar) {
#if CUDNN_VERSION >= 4007
if ((NULL != runningMean && NULL == runningInvVar) ||
(NULL == runningMean && NULL != runningInvVar)) {
LOG(FATAL) << "runningMean and runningInvVar can be NULL "
- << "but only at the same time.";
+ << "but only at the same time.";
}
if ((NULL != savedMean && NULL == savedVar) ||
(NULL == savedMean && NULL != savedVar)) {
@@ -987,10 +940,24 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardTraining(
- t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
- input, yDesc, output, bnDesc, scale, bias, factor,
- runningMean, runningInvVar, epsilon, savedMean, savedVar));
+ CHECK_CUDNN(
+ dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ yDesc,
+ output,
+ bnDesc,
+ scale,
+ bias,
+ factor,
+ runningMean,
+ runningInvVar,
+ epsilon,
+ savedMean,
+ savedVar));
CHECK_SYNC("hl_batch_norm_forward_training failed");
#else
@@ -1000,15 +967,15 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
}
void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
- hl_tensor_descriptor outputDesc,
- real *output,
- hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedInvVar,
- double epsilon) {
+ real* input,
+ hl_tensor_descriptor outputDesc,
+ real* output,
+ hl_tensor_descriptor bnParamDesc,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedInvVar,
+ double epsilon) {
#if CUDNN_VERSION >= 4007
cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
@@ -1016,10 +983,21 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardInference(
- t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
- input, yDesc, output, bnDesc, scale, bias,
- estimatedMean, estimatedInvVar, epsilon));
+ CHECK_CUDNN(
+ dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ yDesc,
+ output,
+ bnDesc,
+ scale,
+ bias,
+ estimatedMean,
+ estimatedInvVar,
+ epsilon));
CHECK_SYNC("hl_batch_norm_forward_inference failed");
#else
@@ -1029,18 +1007,18 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
}
void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar) {
+ real* savedMean,
+ real* savedInvVar) {
#if CUDNN_VERSION >= 4007
if ((NULL != savedMean && NULL == savedInvVar) ||
(NULL == savedMean && NULL != savedInvVar)) {
@@ -1055,12 +1033,25 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
- t_resource.cudnn_handle, mode, &alpha, &beta,
- &alpha, &beta,
- xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
- bnDesc, scale, scaleGrad, biasGrad, epsilon,
- savedMean, savedInvVar));
+ CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ dyDesc,
+ outGrad,
+ dxDesc,
+ inGrad,
+ bnDesc,
+ scale,
+ scaleGrad,
+ biasGrad,
+ epsilon,
+ savedMean,
+ savedInvVar));
CHECK_SYNC("hl_batch_norm_backward failed");
#else
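A guard that recurs throughout the batch-norm functions above rejects pointer pairs where exactly one side is NULL (runningMean/runningInvVar, savedMean/savedVar, savedMean/savedInvVar). The condition amounts to equality of nullness; a compact equivalent, sketched with a hypothetical helper name:

```cpp
#include <cstdio>

// Hypothetical helper matching the hl_batch_norm_* guards:
// a pointer pair is valid only if both are set or both are NULL.
static bool null_consistent(const void* a, const void* b) {
  return (a == nullptr) == (b == nullptr);
}

int main() {
  float mean = 0.0f, var = 1.0f;
  printf("%d\n", null_consistent(&mean, &var));       // 1: both set
  printf("%d\n", null_consistent(nullptr, nullptr));  // 1: both NULL
  printf("%d\n", null_consistent(&mean, nullptr));    // 0: would LOG(FATAL)
  return 0;
}
```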
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index aa1d184a3e..6b71a53848 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,23 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
-#include
+#include
#include
-#include
#include
-#include
+#include
+#include
#include
#include "hl_cuda.h"
#include "hl_cuda.ph"
-#include "hl_thread.ph"
#include "hl_dso_loader.h"
+#include "hl_thread.ph"
#include "paddle/utils/Logging.h"
namespace dynload {
std::once_flag curand_dso_flag;
-void* curand_dso_handle = nullptr;
+void *curand_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -38,34 +37,35 @@ void* curand_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- curandStatus_t operator()(Args... args) { \
- typedef curandStatus_t (*curandFunc)(Args...); \
- std::call_once(curand_dso_flag, GetCurandDsoHandle, \
- &curand_dso_handle); \
- void* p_##__name = dlsym(curand_dso_handle, #__name); \
- return reinterpret_cast<curandFunc>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ curandStatus_t operator()(Args... args) { \
+ typedef curandStatus_t (*curandFunc)(Args...); \
+ std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+ void *p_##__name = dlsym(curand_dso_handle, #__name); \
+ return reinterpret_cast<curandFunc>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- curandStatus_t operator()(Args... args) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ curandStatus_t operator()(Args... args) { \
+ return __name(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#endif
/* include all needed curand functions in HPPL */
+// clang-format off
#define CURAND_RAND_ROUTINE_EACH(__macro) \
__macro(curandCreateGenerator) \
__macro(curandSetStream) \
__macro(curandSetPseudoRandomGeneratorSeed)\
__macro(curandGenerateUniform) \
__macro(curandGenerateUniformDouble)
+// clang-format on
CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
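The // clang-format off and // clang-format on markers added around these macro tables tell the formatter to leave the enclosed lines verbatim; without them it would re-wrap the column-aligned trailing backslashes. The markers guard any region, for example:

```cpp
// clang-format off
#define CURAND_ROUTINE_EACH(__macro)  \
  __macro(curandCreateGenerator)      \
  __macro(curandSetStream)            \
  __macro(curandGenerateUniform)
// clang-format on
```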
@@ -73,7 +73,7 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef DYNAMIC_LOAD_CURAND_WRAP
std::once_flag cudart_dso_flag;
-void* cudart_dso_handle = nullptr;
+void *cudart_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -83,28 +83,28 @@ void* cudart_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudart_func = decltype(__name(args...))(*)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
- &cudart_dso_handle); \
- void* p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast<cudart_func>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ using cudart_func = decltype(__name(args...)) (*)(Args...); \
+ std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+ void *p_##__name = dlsym(cudart_dso_handle, #__name); \
+ return reinterpret_cast<cudart_func>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ return __name(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#endif
/* include all needed cuda functions in HPPL */
+// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaMalloc) \
__macro(cudaHostAlloc) \
@@ -137,56 +137,57 @@ void* cudart_dso_handle = nullptr;
__macro(cudaGetErrorString) \
__macro(cudaProfilerStart) \
__macro(cudaProfilerStop)
+// clang-format on
+
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#undef CUDA_ROUTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP
-} /* namespace dynload */
+} /* namespace dynload */
/**
* @brief global resource.
*/
-int g_system_device_num = 0; /* system device number */
-int device_num = 0; /* use device number */
-hl_device_prop *g_device; /* device info table */
-__thread thread_device_resources *t_device; /* device resources table */
+int g_system_device_num = 0; /* system device number */
+int device_num = 0; /* use device number */
+hl_device_prop *g_device; /* device info table */
+__thread thread_device_resources *t_device; /* device resources table */
int g_cuda_lib_version = 0;
/* number of global stream */
-#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
/* number of thread stream */
-#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
/* sizeof of device memory */
-#define HPPL_GPU_MEMORY_SIZE (256*4)
+#define HPPL_GPU_MEMORY_SIZE (256 * 4)
/**
* Check built-in cuda functions using glog; note that it **does not**
* support the << operator for more detailed error info.
*/
-#define CHECK_CUDA(cudaFunc) \
- do { \
- cudaError_t cudaStat = cudaFunc; \
- CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
- << dynload::cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc) \
+ do { \
+ cudaError_t cudaStat = cudaFunc; \
+ CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
+ << dynload::cudaGetErrorString(cudaStat); \
} while (0)
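CHECK_CUDA above keeps (as CHECK_CUDNN did earlier) the do { ... } while (0) wrapper through the reformat. The idiom turns a multi-statement macro into a single statement, so it composes safely with unbraced if/else. A minimal illustration:

```cpp
#include <cstdio>

// Without the do/while(0), the second puts() would escape the 'if' and
// the trailing semicolon would break an 'else' that followed.
#define LOG_TWICE(msg) \
  do {                 \
    puts(msg);         \
    puts(msg);         \
  } while (0)

int main() {
  bool verbose = true;
  if (verbose)
    LOG_TWICE("hello");  // expands to one statement; else-safe
  else
    puts("quiet");
  return 0;
}
```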
/**
* @brief thread resource.
*/
-__thread _hl_thread_resource t_resource = {
- {0}, /* stream */
- 0, /* handle */
- 0, /* gen */
- 0, /* cudnn_handle */
- 0, /* cudnn_desc */
- NULL, /* gen_mutex */
- NULL, /* gpu_mem */
- NULL, /* cpu_mem */
- 0, /* event */
- -1, /* device */
- 0, /* major */
- false}; /* is_init */
+__thread _hl_thread_resource t_resource = {{0}, /* stream */
+ 0, /* handle */
+ 0, /* gen */
+ 0, /* cudnn_handle */
+ 0, /* cudnn_desc */
+ NULL, /* gen_mutex */
+ NULL, /* gpu_mem */
+ NULL, /* cpu_mem */
+ 0, /* event */
+ -1, /* device */
+ 0, /* major */
+ false}; /* is_init */
__thread cudaStream_t default_stream = 0;
__thread bool g_sync_flag = true;
@@ -200,18 +201,17 @@ inline pid_t gettid() {
uint64_t tid;
pthread_threadid_np(NULL, &tid);
#else
- #ifndef __NR_gettid
- #define __NR_gettid 224
- #endif
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
pid_t tid = syscall(__NR_gettid);
#endif
- CHECK_NE(tid, -1);
- return tid;
+ CHECK_NE((int)tid, -1);
+ return tid;
}
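gettid() above has three branches: pthread_threadid_np on macOS, the raw Linux syscall elsewhere, and a fallback definition of __NR_gettid for old headers; the (int) cast added to CHECK_NE keeps the comparison's types consistent across those branches. A Linux-only sketch of the syscall path:

```cpp
#include <sys/syscall.h>
#include <unistd.h>

// glibc of this era exposed no gettid() wrapper, so the thread id is
// fetched with the raw syscall; 224 is the legacy x86 syscall number
// used as a fallback when the headers do not define __NR_gettid.
#ifndef __NR_gettid
#define __NR_gettid 224
#endif

static pid_t my_gettid() { return static_cast<pid_t>(syscall(__NR_gettid)); }
```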
void hl_init(int device) {
- CHECK(hl_start_flag)
- << "[Init failed] hl_start() did not succeed.";
+ CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
/* thread has been initialized */
if (true == t_resource.is_init) {
@@ -222,16 +222,16 @@ void hl_init(int device) {
/* create thread device resources */
char *tmp;
thread_device_resources device_res;
- tmp = (char *)malloc(g_system_device_num*sizeof(thread_device_resources*) +
- device_num*sizeof(_thread_device_resources));
+ tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
+ device_num * sizeof(_thread_device_resources));
CHECK_NOTNULL(tmp);
- t_device = (thread_device_resources*)tmp;
- device_res = (thread_device_resources)((char*)tmp +
- g_system_device_num*sizeof(thread_device_resources*));
- memset(t_device, 0, g_system_device_num*sizeof(thread_device_resources*));
+ t_device = (thread_device_resources *)tmp;
+ device_res = (thread_device_resources)(
+ (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
+ memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
- char *tmp_stream = (char *)
- malloc(device_num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+ char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
+ sizeof(cudaStream_t));
CHECK_NOTNULL(tmp_stream);
int num = 0;
@@ -241,8 +241,9 @@ void hl_init(int device) {
}
t_device[dev] = &device_res[num];
- t_device[dev]->stream = (cudaStream_t*)(tmp_stream +
- num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+ t_device[dev]->stream =
+ (cudaStream_t *)(tmp_stream +
+ num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
hl_create_thread_resources(dev, t_device[dev]);
num++;
@@ -268,14 +269,14 @@ void hl_fini() {
t_resource.stream[i] = 0;
}
- char* tmp = (char*)t_device;
- char* tmp_stream = NULL;
+ char *tmp = (char *)t_device;
+ char *tmp_stream = NULL;
for (int dev = 0; dev < g_system_device_num; dev++) {
if (!t_device[dev]) {
continue;
}
if (!tmp_stream) {
- tmp_stream = (char*)t_device[dev]->stream;
+ tmp_stream = (char *)t_device[dev]->stream;
}
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
@@ -292,9 +293,7 @@ void hl_fini() {
t_resource.is_init = false;
}
-int hl_get_device_count() {
- return device_num;
-}
+int hl_get_device_count() { return device_num; }
void hl_set_device(int device) {
if (device == t_resource.device) {
@@ -302,7 +301,7 @@ void hl_set_device(int device) {
}
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device: " << device << " is not specified in startup.";
+ << "Device: " << device << " is not specified in startup.";
CHECK_CUDA(dynload::cudaSetDevice(device));
@@ -314,11 +313,11 @@ void hl_set_device(int device) {
if (true == t_resource.is_init) {
for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
t_resource.stream[i] =
- t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
+ t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
}
t_resource.gpu_mem = t_device[device]->gpu_mem;
t_resource.cpu_mem = t_device[device]->cpu_mem;
- t_resource.event = t_device[device]->mem_event;
+ t_resource.event = t_device[device]->mem_event;
}
t_resource.handle = g_device[device]->device_resources->handle;
@@ -336,11 +335,11 @@ int hl_get_device() {
return device;
}
-void* hl_malloc_device(size_t size) {
+void *hl_malloc_device(size_t size) {
void *dest_d;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
- CHECK_CUDA(dynload::cudaMalloc((void**)&dest_d, size));
+ CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
return dest_d;
}
@@ -350,14 +349,15 @@ void hl_free_mem_device(void *dest_d) {
cudaError_t err = dynload::cudaFree(dest_d);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
- << hl_get_device_error_string();
+ << hl_get_device_error_string();
}
-void* hl_malloc_host(size_t size) {
+void *hl_malloc_host(size_t size) {
void *dest_h;
CHECK(size) << __func__ << ": the size for host memory is 0, please check.";
- CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault));
+ CHECK_CUDA(
+ dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
return dest_h;
}
@@ -366,8 +366,8 @@ void hl_free_mem_host(void *dest_h) {
CHECK_NOTNULL(dest_h);
cudaError_t err = dynload::cudaFreeHost(dest_h);
- CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err)
- << hl_get_device_error_string();
+ CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
+ << hl_get_device_error_string();
}
void hl_memcpy(void *dst, void *src, size_t size) {
@@ -389,8 +389,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
}
CHECK_NOTNULL(src_h);
CHECK_NOTNULL(dest_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size,
- cudaMemcpyHostToDevice));
+ CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
}
void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -399,8 +398,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_h);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size,
- cudaMemcpyDeviceToHost));
+ CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
}
void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -409,8 +407,8 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_d);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_d, size,
- cudaMemcpyDeviceToDevice));
+ CHECK_CUDA(
+ dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
}
void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -424,8 +422,8 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
CHECK_LT(stream, HPPL_STREAM_END);
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault,
- cu_stream));
+ CHECK_CUDA(
+ dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
}
void hl_start() {
@@ -436,8 +434,8 @@ void hl_start() {
bool hl_device_can_access_peer(int device, int peerDevice) {
int canAccessPeer;
- CHECK_CUDA(dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device,
- peerDevice));
+ CHECK_CUDA(
+ dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
if (canAccessPeer == 1) {
return true;
@@ -479,32 +477,32 @@ void hl_create_global_resources(hl_device_prop device_prop) {
/* create curand gen */
CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
- CURAND_RNG_PSEUDO_DEFAULT), CURAND_STATUS_SUCCESS)
- << "[Start failed] Curand init failed.";
+ CURAND_RNG_PSEUDO_DEFAULT),
+ CURAND_STATUS_SUCCESS)
+ << "[Start failed] Curand init failed.";
- CHECK_EQ(dynload::curandSetStream(device_res->gen,
- device_res->stream[0]), CURAND_STATUS_SUCCESS)
- << "[Start failed] Curand set stream failed!";
+ CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
+ CURAND_STATUS_SUCCESS)
+ << "[Start failed] Curand set stream failed!";
/* create cudnn handle */
hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
int seed = gettid();
- CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
- device_res->gen, seed+device), CURAND_STATUS_SUCCESS);
+ CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
+ seed + device),
+ CURAND_STATUS_SUCCESS);
- device_res->gen_mutex =
- (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
+ device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
pthread_mutex_init(device_res->gen_mutex, NULL);
CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
}
-int hl_get_cuda_version() {
- return g_cuda_lib_version;
-}
+int hl_get_cuda_version() { return g_cuda_lib_version; }
-void hl_create_thread_resources(int device, thread_device_resources device_res) {
+void hl_create_thread_resources(int device,
+ thread_device_resources device_res) {
CHECK_CUDA(dynload::cudaSetDevice(device));
/* create thread stream */
@@ -513,15 +511,15 @@ void hl_create_thread_resources(int device, thread_device_resources device_res)
}
/* allocate device memory */
- device_res->gpu_mem = (real*)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
+ device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
/* allocate host memory */
- device_res->cpu_mem = (real*)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
+ device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
}
-void hl_specify_devices_start(int* device, int number) {
+void hl_specify_devices_start(int *device, int number) {
if (hl_start_flag) return;
/* 1. get the number of devices */
@@ -533,20 +531,19 @@ void hl_specify_devices_start(int* device, int number) {
/* 2. check device & create device property table */
CHECK_LE(number, g_system_device_num)
- << "[Start failed] System does not have enough device. "
- << "Device number: " << g_system_device_num
- << "Input number: " << number;
+ << "[Start failed] System does not have enough device. "
+ << "Device number: " << g_system_device_num << "Input number: " << number;
char *tmp;
hl_device_prop device_prop;
- tmp = (char *)malloc(g_system_device_num*sizeof(hl_device_prop*) +
- number*sizeof(_hl_device_prop));
+ tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
+ number * sizeof(_hl_device_prop));
CHECK(tmp) << "[Start failed] System memory is not enough.";
- g_device = (hl_device_prop*)tmp;
- device_prop = (hl_device_prop)((char*)tmp +
- g_system_device_num*sizeof(hl_device_prop*));
- memset(g_device, 0, g_system_device_num*sizeof(hl_device_prop*));
+ g_device = (hl_device_prop *)tmp;
+ device_prop = (hl_device_prop)(
+ (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
+ memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
int num = 0;
for (int i = 0; i < number; i++) {
int dev;
@@ -557,13 +554,13 @@ void hl_specify_devices_start(int* device, int number) {
}
CHECK_LT(dev, g_system_device_num)
- << "[Start failed] The specified device number is "
- << "out of range. Max device number: " << g_system_device_num - 1
- << " Specified devcie number: "<< dev;
+ << "[Start failed] The specified device number is "
+ << "out of range. Max device number: " << g_system_device_num - 1
+ << " Specified devcie number: " << dev;
if (g_device[dev]) {
/* Warning */
- LOG(WARNING) <<"[Warning] Repeat specify device: " << dev;
+ LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
continue;
}
@@ -574,11 +571,11 @@ void hl_specify_devices_start(int* device, int number) {
device_num = num;
/* 3. create global device resources */
- char *tmp_res = (char *)malloc(device_num*sizeof(_global_device_resources));
+ char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
CHECK_NOTNULL(tmp_res);
- char *tmp_stream =
- (char *)malloc(device_num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+ char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
+ sizeof(cudaStream_t));
CHECK_NOTNULL(tmp_stream);
num = 0;
@@ -587,10 +584,11 @@ void hl_specify_devices_start(int* device, int number) {
continue;
}
- g_device[i]->device_resources = (global_device_resources)(tmp_res +
- num*sizeof(_global_device_resources));
- g_device[i]->device_resources->stream = (cudaStream_t*)(tmp_stream +
- num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+ g_device[i]->device_resources = (global_device_resources)(
+ tmp_res + num * sizeof(_global_device_resources));
+ g_device[i]->device_resources->stream =
+ (cudaStream_t *)(tmp_stream +
+ num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
hl_create_global_resources(g_device[i]);
num++;
@@ -600,9 +598,9 @@ void hl_specify_devices_start(int* device, int number) {
hl_start_flag = true;
/* set default device */
if (device == NULL) {
- hl_set_device(0);
+ hl_set_device(0);
} else {
- hl_set_device(device[0]);
+ hl_set_device(device[0]);
}
}
@@ -610,35 +608,31 @@ void hl_rand(real *dest_d, size_t num) {
pthread_mutex_lock(t_resource.gen_mutex);
CHECK_EQ(
#ifndef PADDLE_TYPE_DOUBLE
- dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
+ dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
#else
- dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
+ dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
#endif
- CURAND_STATUS_SUCCESS);
+ CURAND_STATUS_SUCCESS);
pthread_mutex_unlock(t_resource.gen_mutex);
CHECK_SYNC("hl_rand failed");
}
void hl_srand(unsigned int seed) {
pthread_mutex_lock(t_resource.gen_mutex);
- CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
- t_resource.gen, seed), CURAND_STATUS_SUCCESS);
+ CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
+ CURAND_STATUS_SUCCESS);
pthread_mutex_unlock(t_resource.gen_mutex);
}
-void hl_set_sync_flag(bool flag) {
- g_sync_flag = flag;
-}
+void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
-bool hl_get_sync_flag() {
- return g_sync_flag;
-}
+bool hl_get_sync_flag() { return g_sync_flag; }
void hl_stream_synchronize(hl_stream_t stream) {
cudaStream_t cu_stream;
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
@@ -647,8 +641,8 @@ void hl_stream_synchronize(hl_stream_t stream) {
void hl_create_event(hl_event_t *event) {
CHECK_NOTNULL(event);
- struct _hl_event_st* st_event =
- (struct _hl_event_st*)malloc(sizeof(struct _hl_event_st));
+ struct _hl_event_st *st_event =
+ (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
@@ -660,8 +654,8 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
CHECK_NOTNULL(start);
CHECK_NOTNULL(end);
- CHECK_CUDA(dynload::cudaEventElapsedTime(&time,
- start->cu_event, end->cu_event));
+ CHECK_CUDA(
+ dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
return time;
}
@@ -669,24 +663,22 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
cudaStream_t cu_stream;
CHECK_NOTNULL(event);
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaEventRecord(
- event->cu_event, cu_stream));
+ CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
}
void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
cudaStream_t cu_stream;
CHECK_NOTNULL(event);
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaStreamWaitEvent(
- cu_stream, event->cu_event, 0));
+ CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
}
void hl_destroy_event(hl_event_t event) {
@@ -705,15 +697,15 @@ void hl_event_synchronize(hl_event_t event) {
void hl_get_device_name(char *name, int len, int device) {
CHECK_NOTNULL(name);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device <<") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
- strncpy(name, g_device[device]->device_name , len);
+ strncpy(name, g_device[device]->device_name, len);
}
void hl_get_device_memory(size_t *mem_size, int device) {
CHECK_NOTNULL(mem_size);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device <<") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
*mem_size = g_device[device]->device_mem;
}
@@ -722,31 +714,26 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
CHECK_NOTNULL(major);
CHECK_NOTNULL(minor);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device << ") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
*major = g_device[device]->major;
*minor = g_device[device]->minor;
}
-int hl_get_device_last_error() {
- return (int)dynload::cudaGetLastError();
-}
+int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
-const char* hl_get_device_error_string() {
+const char *hl_get_device_error_string() {
cudaError_t err = dynload::cudaGetLastError();
return dynload::cudaGetErrorString(err);
}
-const char* hl_get_device_error_string(size_t err) {
+const char *hl_get_device_error_string(size_t err) {
return dynload::cudaGetErrorString((cudaError_t)err);
}
-void hl_device_synchronize() {
- CHECK_CUDA(dynload::cudaDeviceSynchronize());
-}
+void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
void hl_set_device_flags_block() {
- CHECK_CUDA(dynload::cudaSetDeviceFlags(
- cudaDeviceScheduleBlockingSync));
+ CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
}
bool hl_cuda_event_is_ready(hl_event_t event) {
@@ -759,11 +746,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
return true;
}
-void hl_profiler_start() {
- CHECK_CUDA(dynload::cudaProfilerStart());
-}
-
-void hl_profiler_end() {
- CHECK_CUDA(dynload::cudaProfilerStop());
-}
+void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index 27bbd03bc3..ff6b830b7a 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifdef PADDLE_USE_DSO
#include
@@ -29,26 +28,26 @@ limitations under the License. */
namespace dynload {
extern std::once_flag cudart_dso_flag;
-extern void* cudart_dso_handle;
+extern void *cudart_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cuda routine
* via operator overloading.
**/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- __type operator()(Args... args) { \
- typedef __type (*cudartFunc)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
- &cudart_dso_handle); \
- void* p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast<cudartFunc>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ __type operator()(Args... args) { \
+ typedef __type (*cudartFunc)(Args...); \
+ std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+ void *p_##__name = dlsym(cudart_dso_handle, #__name); \
+ return reinterpret_cast<cudartFunc>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
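
The macro above generates, for each listed routine, a functor that resolves the real symbol lazily through dlsym on first use. The following is a minimal standalone sketch of that same pattern, using libm's sinf purely as an illustrative stand-in (the names dso_flag, dso_handle, and dyn_sinf are not part of this patch):

    #include <dlfcn.h>
    #include <mutex>

    static std::once_flag dso_flag;
    static void *dso_handle = nullptr;

    // Mirrors DynLoad__<name>: open the library once, resolve the symbol,
    // then forward the call with the original signature.
    struct DynLoadSinf {
      template <typename... Args>
      float operator()(Args... args) {
        typedef float (*funcType)(Args...);
        std::call_once(dso_flag,
                       [] { dso_handle = dlopen("libm.so.6", RTLD_LAZY); });
        void *p = dlsym(dso_handle, "sinf");
        return reinterpret_cast<funcType>(p)(args...);
      }
    } dyn_sinf;

    // Usage: float y = dyn_sinf(0.5f);  // calls the dlsym-resolved sinf
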
/* include all needed cuda functions in HPPL */
+// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaLaunch, cudaError_t) \
__macro(cudaSetupArgument, cudaError_t) \
@@ -61,16 +60,17 @@ extern void* cudart_dso_handle;
__macro(__cudaInitModule, char) \
__macro(__cudaRegisterTexture, void) \
__macro(__cudaRegisterSurface, void)
+// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#if CUDART_VERSION >= 7000
- DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
+DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
#endif
#undef CUDA_ROUNTINE_EACH
-} /* namespace dynload */
+} /* namespace dynload */
#if CUDART_VERSION >= 7000
__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
@@ -78,131 +78,120 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
dim3 blockDim,
void **args,
size_t sharedMem,
- cudaStream_t stream)
-{
- return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
+ cudaStream_t stream) {
+ return dynload::cudaLaunchKernel(
+ func, gridDim, blockDim, args, sharedMem, stream);
}
#endif /* CUDART_VERSION >= 7000 */
-
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func)
-{
+__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
return dynload::cudaLaunch(func);
}
__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
size_t size,
- size_t offset)
-{
+ size_t offset) {
return dynload::cudaSetupArgument(arg, size, offset);
}
__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
dim3 blockDim,
size_t sharedMem,
- cudaStream_t stream)
-{
- return dynload::cudaConfigureCall(gridDim, blockDim,
- sharedMem, stream);
+ cudaStream_t stream) {
+ return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
}
extern "C" {
-void** CUDARTAPI __cudaRegisterFatBinary(
- void *fatCubin
-)
-{
+void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
return dynload::__cudaRegisterFatBinary(fatCubin);
-
}
-void CUDARTAPI __cudaUnregisterFatBinary(
- void **fatCubinHandle
-)
-{
+void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
}
-void CUDARTAPI __cudaRegisterFunction(
- void **fatCubinHandle,
- const char *hostFun,
- char *deviceFun,
- const char *deviceName,
- int thread_limit,
- uint3 *tid,
- uint3 *bid,
- dim3 *bDim,
- dim3 *gDim,
- int *wSize
-) {
- return dynload::__cudaRegisterFunction(
- fatCubinHandle, hostFun, deviceFun, deviceName,
- thread_limit, tid, bid, bDim, gDim, wSize);
+void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
+ const char *hostFun,
+ char *deviceFun,
+ const char *deviceName,
+ int thread_limit,
+ uint3 *tid,
+ uint3 *bid,
+ dim3 *bDim,
+ dim3 *gDim,
+ int *wSize) {
+ return dynload::__cudaRegisterFunction(fatCubinHandle,
+ hostFun,
+ deviceFun,
+ deviceName,
+ thread_limit,
+ tid,
+ bid,
+ bDim,
+ gDim,
+ wSize);
}
-void CUDARTAPI __cudaRegisterVar(
- void **fatCubinHandle,
- char *hostVar,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global
-) {
- return dynload::__cudaRegisterVar(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, ext, size, constant, global);
+void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
+ char *hostVar,
+ char *deviceAddress,
+ const char *deviceName,
+ int ext,
+ int size,
+ int constant,
+ int global) {
+ return dynload::__cudaRegisterVar(fatCubinHandle,
+ hostVar,
+ deviceAddress,
+ deviceName,
+ ext,
+ size,
+ constant,
+ global);
}
-
-
-extern void CUDARTAPI __cudaRegisterManagedVar(
- void **fatCubinHandle,
- void **hostVarPtrAddress,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global
-) {
- return dynload::__cudaRegisterManagedVar(
- fatCubinHandle, hostVarPtrAddress, deviceAddress,
- deviceName, ext, size, constant, global);
+extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
+ void **hostVarPtrAddress,
+ char *deviceAddress,
+ const char *deviceName,
+ int ext,
+ int size,
+ int constant,
+ int global) {
+ return dynload::__cudaRegisterManagedVar(fatCubinHandle,
+ hostVarPtrAddress,
+ deviceAddress,
+ deviceName,
+ ext,
+ size,
+ constant,
+ global);
}
-char CUDARTAPI __cudaInitModule(
- void **fatCubinHandle
-) {
+char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
return dynload::__cudaInitModule(fatCubinHandle);
}
-void CUDARTAPI __cudaRegisterTexture(
- void **fatCubinHandle,
- const struct textureReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int norm,
- int ext
-) {
+void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
+ const struct textureReference *hostVar,
+ const void **deviceAddress,
+ const char *deviceName,
+ int dim,
+ int norm,
+ int ext) {
return dynload::__cudaRegisterTexture(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, dim, norm, ext);
+ fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
}
-void CUDARTAPI __cudaRegisterSurface(
- void **fatCubinHandle,
- const struct surfaceReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int ext
-) {
+void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
+ const struct surfaceReference *hostVar,
+ const void **deviceAddress,
+ const char *deviceName,
+ int dim,
+ int ext) {
return dynload::__cudaRegisterSurface(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, dim, ext);
+ fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
}
} /* extern "C" */
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index b564b96903..1a3ce08619 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -12,27 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "hl_dso_loader.h"
-#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
-P_DEFINE_string(cudnn_dir, "",
+P_DEFINE_string(cudnn_dir,
+ "",
"Specify path for loading libcudnn.so. For instance, "
- "/usr/local/cudnn/lib64. If empty [default], dlopen will search "
- "cudnn from LD_LIBRARY_PATH");
+ "/usr/local/cudnn/lib. If empty [default], dlopen "
+ "will search cudnn from LD_LIBRARY_PATH");
-P_DEFINE_string(cuda_dir, "",
+P_DEFINE_string(cuda_dir,
+ "",
"Specify path for loading cuda library, such as libcublas, "
- "libcurand. For instance, /usr/local/cuda/lib64. "
- "(Note: libcudart can not be specified by cuda_dir, since some "
+ "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+ "libcudart can not be specified by cuda_dir, since some "
"build-in function in cudart already ran before main entry). "
- "If empty [default], dlopen will search cuda from LD_LIBRARY_PATH");
+ "If default, dlopen will search cuda from LD_LIBRARY_PATH");
-static inline std::string join(const std::string& part1, const std::string& part2) {
+static inline std::string join(const std::string& part1,
+ const std::string& part2) {
// directory separator
const char sep = '/';
-
if (!part2.empty() && part2.front() == sep) {
return part2;
}
@@ -46,100 +47,115 @@ static inline std::string join(const std::string& part1, const std::string& part
return ret;
}
-static inline void GetDsoHandleFromDefaultPath(
- std::string& dso_path, void** dso_handle, int dynload_flags) {
- VLOG(3) << "Try to find cuda library: " << dso_path
- << " from default system path.";
- // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+ void** dso_handle,
+ int dynload_flags) {
+ VLOG(3) << "Try to find cuda library: " << dso_path
+ << " from default system path.";
+ // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+ *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
+// bring System Integrity Projection (SIP), if dso_handle
+// is null, search from default package path in Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+ if (nullptr == *dso_handle) {
+ dso_path = join("/usr/local/cuda/lib/", dso_path);
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
- // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
- // bring System Integrity Projection (SIP), if dso_handle
- // is null, search from default package path in Mac OS.
- #if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) {
- dso_path = join("/usr/local/cuda/lib/", dso_path);
- *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
- if (nullptr == *dso_handle) {
- if (dso_path == "libcudnn.dylib") {
- LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"
- << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
- << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
- << "/usr/local/cuda/lib/libcudnn*";
- }
- }
- }
- #endif
+ if (dso_path == "libcudnn.dylib") {
+ LOG(FATAL)
+ << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
+ << "For instance, sudo tar -xzf "
+ "cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT
+ << "/usr/local \n sudo chmod a+r "
+ "/usr/local/cuda/include/cudnn.h " // NOLINT
+ << "/usr/local/cuda/lib/libcudnn*";
+ }
+ }
+ }
+#endif
}
-static inline void GetDsoHandleFromSearchPath(
- const std::string& search_root,
- const std::string& dso_name,
- void** dso_handle) {
- int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
- *dso_handle = nullptr;
-
- std::string dlPath = dso_name;
- if (search_root.empty()) {
- GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
- } else {
- // search xxx.so from custom path
- dlPath = join(search_root, dso_name);
- *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
- // if not found, search from default path
- if (nullptr == dso_handle) {
- LOG(WARNING) << "Failed to find cuda library: " << dlPath;
- dlPath = dso_name;
- GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
- }
+static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
+ const std::string& dso_name,
+ void** dso_handle) {
+ int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+ *dso_handle = nullptr;
+
+ std::string dlPath = dso_name;
+ if (search_root.empty()) {
+ GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+ } else {
+ // search xxx.so from custom path
+ dlPath = join(search_root, dso_name);
+ *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+ // if not found, search from default path
+ if (nullptr == *dso_handle) {
+ LOG(WARNING) << "Failed to find cuda library: " << dlPath;
+ dlPath = dso_name;
+ GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
}
+ }
- CHECK(nullptr != *dso_handle)
- << "Failed to find cuda library: " << dlPath << std::endl
- << "Please specify its path correctly using one of the following ideas: \n"
-
- << "Idea 1. set cuda and cudnn lib path at runtime. "
- << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
- << "For instance, issue command: paddle train --use_gpu=1 "
- << "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
-
- << "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
- << "DYLD_LIBRARY_PATH on Mac OS. \n"
- << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
-
- << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
- << "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
- << "always work well.";
+ CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath
+ << std::endl
+ << "Please specify its path correctly using "
+ "one of the following ways: \n" // NOLINT
+
+ << "Method 1. set cuda and cudnn lib path at "
+ "runtime. "
+ << "http://www.paddlepaddle.org/doc/ui/"
+ "cmd_argument/"
+ "argument_outline.html \n" // NOLINT
+ << "For instance, issue command: paddle train "
+ "--use_gpu=1 "
+ << "--cuda_dir=/usr/local/cuda/lib64 "
+ "--cudnn_dir=/usr/local/cudnn/lib "
+ "...\n" // NOLINT
+
+ << "Method 2. set environment variable "
+ "LD_LIBRARY_PATH on Linux or "
+ << "DYLD_LIBRARY_PATH on Mac OS. \n"
+ << "For instance, issue command: export "
+ "LD_LIBRARY_PATH=... \n"
+
+ << "Note: After Mac OS 10.11, using the "
+ "DYLD_LIBRARY_PATH is impossible "
+ << "unless System Integrity Protection (SIP) "
+ "is disabled. However, "
+ "method 1 " // NOLINT
+ << "always work well.";
}
void GetCublasDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
#endif
}
void GetCudnnDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
#endif
}
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
+ GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
+ GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
#endif
}
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
#endif
}
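
The control flow these helpers implement is: try the user-specified directory first, then fall back to the dynamic loader's default search (LD_LIBRARY_PATH and friends). A condensed sketch of that logic under POSIX dlopen, with the logging and fatal checks omitted (openWithFallback is an illustrative name, not the patch's API):

    #include <dlfcn.h>
    #include <string>

    void *openWithFallback(const std::string &root, const std::string &name) {
      int flags = RTLD_LAZY | RTLD_LOCAL;
      if (!root.empty()) {
        std::string path = root + "/" + name;  // custom directory first
        if (void *h = dlopen(path.c_str(), flags)) return h;
      }
      return dlopen(name.c_str(), flags);  // then default search paths
    }
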
diff --git a/paddle/cuda/src/hl_math.cc b/paddle/cuda/src/hl_math.cc
index 76d48c4a9b..f4bf888bab 100644
--- a/paddle/cuda/src/hl_math.cc
+++ b/paddle/cuda/src/hl_math.cc
@@ -12,24 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "avx_mathfun.h"
namespace hppl {
-__m256 exp(__m256 a) {
- return exp256_ps(a);
-}
+__m256 exp(__m256 a) { return exp256_ps(a); }
-__m256 log(__m256 a) {
- return log256_ps(a);
-}
+__m256 log(__m256 a) { return log256_ps(a); }
-__m256 sin(__m256 a) {
- return sin256_ps(a);
-}
+__m256 sin(__m256 a) { return sin256_ps(a); }
-__m256 cos(__m256 a) {
- return cos256_ps(a);
-}
+__m256 cos(__m256 a) { return cos256_ps(a); }
} // namespace hppl
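
These one-liners just give the AVX routines from avx_mathfun.h ordinary names inside namespace hppl. A usage sketch, assuming an AVX-capable build (e.g. -mavx) and that the hppl declarations above are visible:

    #include <immintrin.h>

    // Eight packed floats in, lane-wise exp out via exp256_ps.
    __m256 demoExp() {
      __m256 x = _mm256_set1_ps(1.0f);
      return hppl::exp(x);
    }
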
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc
index adc88d60dd..d52b2a1df0 100644
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include
@@ -21,8 +20,7 @@ limitations under the License. */
using std::chrono::high_resolution_clock;
int64_t getCurrentTimeStick() {
- high_resolution_clock::time_point tp = high_resolution_clock::now();
- high_resolution_clock::duration dtn = tp.time_since_epoch();
- return dtn.count();
+ high_resolution_clock::time_point tp = high_resolution_clock::now();
+ high_resolution_clock::duration dtn = tp.time_since_epoch();
+ return dtn.count();
}
-
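
Note that getCurrentTimeStick() returns raw high_resolution_clock ticks since the epoch, so differences are only meaningful against the same clock. A small sketch converting a tick delta back to microseconds (elapsedUs is illustrative, not part of the patch):

    #include <chrono>
    #include <cstdint>

    int64_t elapsedUs(int64_t startTick, int64_t endTick) {
      using clock = std::chrono::high_resolution_clock;
      clock::duration d(endTick - startTick);  // re-wrap raw ticks
      return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
    }
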
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 27eed75d4d..f1bb94216c 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -51,12 +51,14 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
* @brief Macro for registering a derived activation class
*/
#define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \
- }; \
+ } \
+ ; \
const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
#ACTIVATION_NAME; \
static InitFunction __reg_activation__##ACTIVATION_NAME([] { \
- gActivationRegistrar.registerClass< \
- ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(#ACTIVATION_NAME); \
+ gActivationRegistrar \
+ .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>( \
+ #ACTIVATION_NAME); \
});
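
END_DEFINE_ACTIVATION closes the class, defines its static name, and registers a factory at static-initialization time. A simplified sketch of that self-registration idiom (the registry map, InitFn, and the Activation types here are illustrative stand-ins, not Paddle's ClassRegistrar/InitFunction):

    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    struct Activation { virtual ~Activation() {} };

    // Name -> factory map standing in for ClassRegistrar.
    static std::map<std::string, std::function<Activation *()>> &registry() {
      static std::map<std::string, std::function<Activation *()>> r;
      return r;
    }

    // Runs its argument during static initialization, like InitFunction.
    struct InitFn {
      explicit InitFn(std::function<void()> f) { f(); }
    };

    struct TanhActivation : Activation {};
    static InitFn regTanh(
        [] { registry()["tanh"] = [] { return new TanhActivation(); }; });
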
/**
@@ -111,14 +113,22 @@ void backward(Argument& act) {
outputG->softmaxBackward(*outputV);
} else {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(sftMaxDot_, outputG->getHeight(),
+ Matrix::resizeOrCreate(sftMaxDot_,
+ outputG->getHeight(),
outputG->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
- Matrix::resizeOrCreate(sftMaxSum_, outputG->getHeight(), 1,
- /* trans */ false, useGpu(act.deviceId));
+ /* trans */ false,
+ useGpu(act.deviceId));
+ Matrix::resizeOrCreate(sftMaxSum_,
+ outputG->getHeight(),
+ 1,
+ /* trans */ false,
+ useGpu(act.deviceId));
if (!one_ || one_->getWidth() != outputG->getWidth()) {
- Matrix::resizeOrCreate(one_, 1, outputG->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(one_,
+ 1,
+ outputG->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
one_->one();
}
@@ -130,7 +140,6 @@ void backward(Argument& act) {
}
END_DEFINE_ACTIVATION(softmax)
-
/**
* @brief Sequence_softmax Activation
* @note Softmax on all frames of one sequence.
@@ -146,10 +155,16 @@ void forward(Argument& act) {
CHECK_EQ(act.value->getWidth(), 1UL);
if (!argument_.value) {
- argument_.value = Matrix::create(nullptr, /* height= */ 1, 1,
- /* trans= */ false, useGpu(act.deviceId));
- argument_.grad = Matrix::create(nullptr, /* height= */ 1, 1,
- /* trans= */ false, useGpu(act.deviceId));
+ argument_.value = Matrix::create(nullptr,
+ /* height= */ 1,
+ 1,
+ /* trans= */ false,
+ useGpu(act.deviceId));
+ argument_.grad = Matrix::create(nullptr,
+ /* height= */ 1,
+ 1,
+ /* trans= */ false,
+ useGpu(act.deviceId));
}
auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
@@ -267,8 +282,11 @@ END_DEFINE_ACTIVATION(softrelu)
BEGIN_DEFINE_ACTIVATION(abs)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->abs(*act.value);
@@ -286,8 +304,11 @@ END_DEFINE_ACTIVATION(abs)
BEGIN_DEFINE_ACTIVATION(square)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->square(*act.value);
@@ -317,8 +338,11 @@ END_DEFINE_ACTIVATION(exponential)
BEGIN_DEFINE_ACTIVATION(log)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->log(*act.value);
@@ -333,11 +357,9 @@ ActivationFunction* ActivationFunction::create(const std::string& type) {
std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
std::vector<std::string> types;
- gActivationRegistrar.forEachType([&](const std::string& type) {
- types.push_back(type);
- });
+ gActivationRegistrar.forEachType(
+ [&](const std::string& type) { types.push_back(type); });
return types;
}
-
} // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index c483372256..e9ed5c619a 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include <string>
#include <vector>
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 2cfb5a3a18..e6cc4a246a 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "DataProvider.h"
#include "paddle/utils/Util.h"
@@ -57,7 +56,7 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
}
}
-DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
bool useGpu,
int64_t batchSize) {
batchSize_ = batchSize;
@@ -155,7 +154,7 @@ void DoubleBuffer::startAsyncLoad() {
}
ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
-DataProvider::registrar_;
+ DataProvider::registrar_;
DataProvider* DataProvider::create(const DataConfig& config,
const ModelConfig& modelConfig,
@@ -182,7 +181,8 @@ int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
for (int i = 0; i < config_.constant_slots_size(); ++i) {
MemoryHandlePtr handle =
constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
- Matrix::resizeOrCreate(constantSlots[i], batchSize,
+ Matrix::resizeOrCreate(constantSlots[i],
+ batchSize,
1, // = width
false, // = trans
useGpu_); // = useGpu
@@ -216,7 +216,8 @@ void DataProvider::initAsyncLoader() {
}
SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
- bool useGpu, bool withInfo)
+ bool useGpu,
+ bool withInfo)
: DataProvider(config, useGpu) {
/* initialize the size of a sample, and the buffer */
sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
@@ -337,7 +338,8 @@ int64_t SimpleDataProviderBase::fillBuffer() {
sampleNumInBuf_ =
n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
hInputLabelBuf_->getData() + n,
- hInputInfoBuf_->getData() + n, bufferCapacity_ - n);
+ hInputInfoBuf_->getData() + n,
+ bufferCapacity_ - n);
/* for stachastic gradient training */
if (!skipShuffle_) {
@@ -357,11 +359,14 @@ SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
SimpleDataProvider::~SimpleDataProvider() {}
-int64_t SimpleDataProvider::fillBufferImp(real* data, int* label, int* info,
+int64_t SimpleDataProvider::fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size) {
(void)info;
int64_t n = std::min(labels_.size() - currentSampleIndex_, size);
- memcpy(data, &data_[currentSampleIndex_ * sampleDim_],
+ memcpy(data,
+ &data_[currentSampleIndex_ * sampleDim_],
n * sampleDim_ * sizeof(real));
memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
currentSampleIndex_ += n;
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 112e45de1c..8b7fb27f82 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -44,15 +43,15 @@ namespace paddle {
* @brief Macro for registering a data provider. The class type should contain
* a consturctor with parameter (DataConfig, bool).
*/
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\
- static InitFunction __reg_type_##__type_name([]() {\
- DataProvider::registrar_.registerClass(\
- #__type_name, \
- [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
- DataProvider* dp = new __class_name (conf, useGpu);\
- return dp;\
- });\
-})
+#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \
+ static InitFunction __reg_type_##__type_name([]() { \
+ DataProvider::registrar_.registerClass( \
+ #__type_name, \
+ [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
+ DataProvider* dp = new __class_name(conf, useGpu); \
+ return dp; \
+ }); \
+ })
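
Substituting the arguments makes the reformatted macro easier to read; REGISTER_DATA_PROVIDER(simple, SimpleDataProvider), for example, expands to roughly:

    static InitFunction __reg_type_simple([]() {
      DataProvider::registrar_.registerClass(
          "simple",
          [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider * {
            DataProvider *dp = new SimpleDataProvider(conf, useGpu);
            return dp;
          });
    });
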
/**
* @def REGISTER_DATA_PROVIDER_EX
@@ -61,8 +60,8 @@ namespace paddle {
*/
#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \
static InitFunction __reg_type_##__type_name([] { \
- DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
-})
+ DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
+ })
class DataBatch;
class BufferBatch;
@@ -181,7 +180,8 @@ public:
* @param[in] size DataBatch.getSize()
* @param[in] dataId sub dataprovider id (in MultiDataProvider)
*/
- void appendArguments(const std::vector<Argument>& argus, int size,
+ void appendArguments(const std::vector<Argument>& argus,
+ int size,
int dataId) {
size_ += size;
for (const auto& argu : argus) {
@@ -259,9 +259,7 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
class DoubleBuffer {
public:
- DoubleBuffer(DataProvider* dataPool,
- bool useGpu,
- int64_t batchSize = 0);
+ DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
virtual ~DoubleBuffer();
void removeOneBatch(DataBatch* dataBatch);
@@ -310,7 +308,7 @@ public:
/**
* @brief create only used for unittest.
*/
- inline static DataProvider* create(const DataConfig &config,
+ inline static DataProvider* create(const DataConfig& config,
bool useGpu = FLAGS_use_gpu) {
return create(config, ModelConfig(), useGpu);
}
@@ -462,7 +460,9 @@ protected:
*
* label[n] is the label for the n-th sample.
*/
- virtual int64_t fillBufferImp(real* data, int* label, int* info,
+ virtual int64_t fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size) = 0;
};
@@ -475,7 +475,9 @@ public:
protected:
void loadData(const std::string& fileName);
void loadDataFile(const std::string& fileName);
- virtual int64_t fillBufferImp(real* data, int* label, int* info,
+ virtual int64_t fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size);
protected:
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 0689f90f3e..6c178e29ee 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "DataProvider.h"
@@ -65,8 +64,8 @@ void DataProviderGroup::reset() {
provider_ = nullptr;
// shuffle file list
- std::shuffle(fileList_.begin(), fileList_.end(),
- ThreadLocalRandomEngine::get());
+ std::shuffle(
+ fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
startLoader();
DataProvider::reset();
@@ -113,8 +112,9 @@ void DataProviderGroup::startLoader() {
size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
std::vector<std::string> fileVec(fileList_.begin() + startPos,
fileList_.begin() + endPos);
- loader_->addJob([this, fileVec]()
- -> ProviderPtrType { return this->loadFile(fileVec); });
+ loader_->addJob([this, fileVec]() -> ProviderPtrType {
+ return this->loadFile(fileVec);
+ });
}
loader_->stopAddJob();
}
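
The reflowed lambda captures fileVec by value so the batch of file names outlives the loop iteration that created it. A generic sketch of the same hand-off with std::async standing in for the loader's job queue (illustrative only, not Paddle's MultiThreadWorker API):

    #include <future>
    #include <string>
    #include <vector>

    std::future<size_t> schedule(std::vector<std::string> files) {
      // 'files' is copied into the task, like fileVec in addJob above.
      return std::async(std::launch::async,
                        [files]() -> size_t { return files.size(); });
    }
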
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
index 8e4f53978a..51fb1f2666 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "paddle/utils/Util.h"
#include "MultiDataProvider.h"
#include "paddle/utils/Logging.h"
@@ -59,10 +58,8 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config,
"MultiDataProvider";
subConfig.set_async_load_data(false);
}
- subDataProviders_[i] =
- std::unique_ptr<DataProvider>(DataProvider::create(subConfig,
- modelConfig,
- useGpu_));
+ subDataProviders_[i] = std::unique_ptr<DataProvider>(
+ DataProvider::create(subConfig, modelConfig, useGpu_));
}
}
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index b498ba6516..876467c04f 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "DataProvider.h"
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index 344644755f..0a7ff80246 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "ProtoDataProvider.h"
#include "paddle/utils/Util.h"
#include "paddle/utils/StringUtil.h"
@@ -23,7 +22,8 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "DataProviderGroup.h"
-P_DEFINE_double(memory_threshold_on_load_data, 1.0,
+P_DEFINE_double(memory_threshold_on_load_data,
+ 1.0,
"stop loading data when memory is not sufficient");
namespace paddle {
@@ -32,7 +32,8 @@ REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
REGISTER_DATA_PROVIDER(proto_sequence_group,
DataProviderGroup<ProtoSequenceDataProvider>);
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config, bool useGpu,
+ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
if (loadDataAll) {
@@ -279,7 +280,8 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
}
slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
const unsigned int* ids = sample.vector_slots(i).ids().data();
- memcpy(slot.sparseNonValueData.data() + slot.indices.back(), ids,
+ memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
+ ids,
sizeof(*ids) * slotSize);
slot.indices.push_back(slot.indices.back() + slotSize);
if (subSlotSize) {
@@ -318,10 +320,11 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
slot.varDenseData[oldSize].data.resize(varDim);
const float* values = sample.vector_slots(i).values().data();
#ifdef PADDLE_TYPE_DOUBLE
- std::copy(values, values + varDim,
- slot.varDenseData[oldSize].data.data());
+ std::copy(
+ values, values + varDim, slot.varDenseData[oldSize].data.data());
#else
- memcpy(slot.varDenseData[oldSize].data.data(), values,
+ memcpy(slot.varDenseData[oldSize].data.data(),
+ values,
sizeof(real) * varDim);
#endif
slot.varDenseData[oldSize].dims.resize(
@@ -374,8 +377,9 @@ void ProtoDataProvider::reset() {
}
void ProtoDataProvider::shuffle() {
- std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
- ThreadLocalRandomEngine::get());
+ std::shuffle(shuffledSequenceIds_.begin(),
+ shuffledSequenceIds_.end(),
+ ThreadLocalRandomEngine::get());
}
/*
@@ -502,7 +506,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
if (!iidData()) {
ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
- numSequences + 1, /* useGpu= */ false);
+ numSequences + 1,
+ /* useGpu= */ false);
int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
int pos = 0;
int i = 0;
@@ -530,7 +535,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
switch (slotType) {
case SlotDef::VECTOR_DENSE: {
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -543,19 +550,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value = Matrix::createSparseMatrix(
- size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, SPARSE_CSR,
- false, useGpu_);
+ cpuArguments[slot].value =
+ Matrix::createSparseMatrix(size,
+ dim,
+ size /*DEFAULT_AVG_WIDTH = 1*/,
+ NO_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
- slots_[slot].sparseNonValueData.data(), HPPL_STREAM_1);
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
+ slots_[slot].sparseNonValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
slots_[slot].sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -571,19 +586,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value = Matrix::createSparseMatrix(
- size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, FLOAT_VALUE,
- SPARSE_CSR, false, useGpu_);
+ cpuArguments[slot].value =
+ Matrix::createSparseMatrix(size,
+ dim,
+ size /*DEFAULT_AVG_WIDTH = 1*/,
+ FLOAT_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
- std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
- dataPos.data(), slots_[slot].indices.data(),
- slots_[slot].sparseFloatValueData.data(), HPPL_STREAM_1);
+ std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
+ slots_[slot].sparseFloatValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
slots_[slot].sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -591,7 +614,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
break;
}
case SlotDef::INDEX: {
- IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ size,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
for (int i = 0; i < size; ++i) {
@@ -621,7 +645,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
if (oldWidth < height) {
totalDim = width * height * depth;
}
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, totalDim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ totalDim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -637,13 +663,13 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
}
} else {
- memcpy(buf, slots_[slot].varDenseData[dataPos[0]].data.data(),
+ memcpy(buf,
+ slots_[slot].varDenseData[dataPos[0]].data.data(),
sizeof(real) * totalDim);
}
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1, /* size == 1 currently */
+ /* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
@@ -653,16 +679,17 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
case SlotDef::VAR_MDIM_INDEX: {
CHECK_EQ(size, 1);
size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
- IVector::resizeOrCreate(cpuArguments[slot].ids, totalDim,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ totalDim,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
- memcpy(buf, slots_[slot].varIndices[dataPos[0]].data(),
+ memcpy(buf,
+ slots_[slot].varIndices[dataPos[0]].data(),
sizeof(int) * totalDim);
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1, /* size == 1 currently */
+ /* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
@@ -700,8 +727,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
gpuArguments[i].sequenceStartPositions =
cpuArguments[i].sequenceStartPositions;
} else {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
@@ -746,10 +773,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
sampleLoop(op, size);
// current slot: sequenceStartPositions
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1,
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1,
+ /* useGpu= */ false);
switch (slotType) {
case SlotDef::VECTOR_SPARSE_VALUE:
@@ -821,10 +847,10 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
};
int subSize = subSampleLoop(op, size, slot);
ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].subSequenceStartPositions, subSize + 1,
- false);
+ cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
int* currPosOfArgumentSubSeqStart =
- cpuArguments[slot].subSequenceStartPositions->getMutableData(false);
+ cpuArguments[slot].subSequenceStartPositions->getMutableData(
+ false);
int64_t* subSeqs = dataSubPos.data();
int64_t* subIndexs = slots_[slot].subIndices.data();
int allSubSequenceLength = 0;
@@ -849,7 +875,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::INDEX: {
// label slot
- IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ size,
/* useGpu= */ false);
// fill labels
int* buf = cpuArguments[slot].ids->getData();
@@ -863,7 +890,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
case SlotDef::VECTOR_DENSE: {
// copy values
size_t dim = header_.slot_defs(slot).dim();
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -887,8 +916,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < cpuArguments.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
*batch = gpuBatch;
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index 846dd7673a..ffdcc8fdc9 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -48,7 +47,8 @@ namespace paddle {
*/
class ProtoDataProvider : public DataProvider {
public:
- ProtoDataProvider(const DataConfig& config, bool useGpu,
+ ProtoDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
virtual void reset();
@@ -161,14 +161,16 @@ protected:
};
/**
- * @brief Special use for Proto data: instances should contain sparse-non-value slots
+ * @brief Special use for Proto data: instances should contain sparse-non-value
+ * slots
* and label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
- ProtoSequenceDataProvider(const DataConfig& config, bool useGpu,
+ ProtoSequenceDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index 3b1eb7e9ef..b8fca3cd7f 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -138,7 +137,8 @@ protected:
*
* @note this code depends on protobuf 2.4.0. There is nothing like
* CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
- * bytes has the object readed so far. Therefore, we calculated bytes ourselves.
+ * bytes has the object readed so far. Therefore, we calculated bytes
+ * ourselves.
*/
int approximateReadedBytes_;
};
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 1332c0ab63..bee6ca14a2 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PyDataProvider.h"
#include "paddle/utils/PythonUtil.h"
#include
#include "paddle/utils/Util.h"
#include "paddle/utils/Excepts.h"
-
namespace paddle {
#ifndef PADDLE_NO_PYTHON
REGISTER_DATA_PROVIDER(py, PyDataProvider);
#endif
-PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
+PyDataProvider::PyDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), batchSize_(0) {
PyGuard guard;
@@ -50,8 +49,8 @@ void PyDataProvider::loadData(const std::vector& fileList) {
classInstance_ =
createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
CHECK(classInstance_) << "Create class instance failed.";
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("getHeader"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("getHeader"), NULL));
CHECK_PY(obj) << "Call function getHeader failed.";
std::string headerInfo =
std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -90,7 +89,8 @@ void PyDataProvider::resetSlots() {
}
}
-void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
unsigned int dim = slot.dim;
slot.sampleNum = readT<unsigned int>(data, dataEnd);
@@ -102,14 +102,17 @@ void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
float* dat = reinterpret_cast<float*>(data);
std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
#else
- memcpyWithCheck(slot.denseData.data(), data,
- sizeof(real) * dim * slot.sampleNum, dataEnd);
+ memcpyWithCheck(slot.denseData.data(),
+ data,
+ sizeof(real) * dim * slot.sampleNum,
+ dataEnd);
#endif
// PyDataProvider always provide data in float
data += sizeof(float) * dim * slot.sampleNum;
}
-void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT<unsigned int>(data, dataEnd);
unsigned int* indexPtr = (unsigned int*)data;
@@ -121,12 +124,15 @@ void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
length = readT<unsigned int>(data, dataEnd);
slot.indices.push_back(length);
slot.sparseNonValueData.resize(length);
- memcpyWithCheck(slot.sparseNonValueData.data(), data,
- sizeof(unsigned int) * length, dataEnd);
+ memcpyWithCheck(slot.sparseNonValueData.data(),
+ data,
+ sizeof(unsigned int) * length,
+ dataEnd);
data += sizeof(unsigned int) * length;
}
-void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT<unsigned int>(data, dataEnd);
unsigned int* indexPtr = (unsigned int*)data;
@@ -153,7 +159,8 @@ void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
}
}
-void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT<unsigned int>(data, dataEnd);
CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
@@ -163,7 +170,8 @@ void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
data += sizeof(unsigned int) * slot.sampleNum;
}
-void PyDataProvider::fillStringSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillStringSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT<unsigned int>(data, dataEnd);
for (unsigned int i = 0; i < slot.sampleNum; ++i) {
@@ -225,9 +233,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
}
for (size_t i = 0; i < sequenceNum; ++i) {
size_t begin = slot.sequenceStartPositions[i];
- size_t end = (i < sequenceNum - 1)
- ? slot.sequenceStartPositions[i + 1]
- : slot.sampleNum;
+ size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
+ : slot.sampleNum;
for (size_t ii = begin; ii < end; ++ii) {
slot.sampleSequenceIdVec.push_back(ii);
}
@@ -255,8 +262,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
void PyDataProvider::reset() {
{ // Invoke PyDataProvider Reset
PyGuard guard;
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("reset"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("reset"), NULL));
CHECK_PY(obj) << "Call function reset failed.";
}
@@ -270,15 +277,18 @@ void PyDataProvider::reset() {
void PyDataProvider::shuffle() {
// py shuffle
PyGuard guard;
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("shuffle"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("shuffle"), NULL));
CHECK_PY(obj) << "Call function shuffle failed.";
}
-void PyDataProvider::handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments) {
unsigned int dim = slot.dim;
- Matrix::resizeOrCreate(cpuArguments[slotIndex].value, slot.sampleNum, dim,
+ Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
+ slot.sampleNum,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slotIndex].value->getData();
@@ -294,19 +304,27 @@ void PyDataProvider::handleSparseNonValueSlot(
ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
unsigned int dim = slot.dim;
if (!(cpuArguments[slotIndex].value)) {
- cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
- slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE,
- SPARSE_CSR, false, useGpu_);
+ cpuArguments[slotIndex].value =
+ Matrix::createSparseMatrix(slot.sampleNum,
+ dim,
+ slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+ NO_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slotIndex].value;
mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
- slot.sparseNonValueData.data(), HPPL_STREAM_1);
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
+ slot.sparseNonValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
slot.sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -317,28 +335,38 @@ void PyDataProvider::handleSparseValueSlot(
ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
unsigned int dim = slot.dim;
if (!(cpuArguments[slotIndex].value)) {
- cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
- slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
- FLOAT_VALUE, SPARSE_CSR, false, useGpu_);
+ cpuArguments[slotIndex].value =
+ Matrix::createSparseMatrix(slot.sampleNum,
+ dim,
+ slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+ FLOAT_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slotIndex].value;
mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
- slot.sparseFloatValueData.data(), HPPL_STREAM_DEFAULT);
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
+ slot.sparseFloatValueData.data(),
+ HPPL_STREAM_DEFAULT);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
slot.sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
}
}
-void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments) {
- IVector::resizeOrCreate(cpuArguments[slotIndex].ids, slot.sampleNum,
+ IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
+ slot.sampleNum,
/*useGpu_*/ false);
int* buf = cpuArguments[slotIndex].ids->getData();
for (size_t i = 0; i < slot.sampleNum; ++i) {
@@ -346,7 +374,8 @@ void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
}
}
-void PyDataProvider::handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleStringSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments) {
if (cpuArguments[slotIndex].strs) {
cpuArguments[slotIndex].strs->resize(slot.sampleNum);
@@ -364,7 +393,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
PyGuard guard;
PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
const_cast("getNextBatch"),
- const_cast("i"), size));
+ const_cast("i"),
+ size));
CHECK_PY(obj) << "Call function getNextBatch failed.";
const std::string& samples =
std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -381,23 +411,24 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
if (!iidData()) {
for (size_t j = 0; j < slotNum_; ++j) {
auto& slot = slots_[j];
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[j].sequenceStartPositions,
- slot.sequenceNum + 1, /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
+ slot.sequenceNum + 1,
+ /* useGpu= */ false);
int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
std::copy(slot.sequenceStartPositions.begin(),
- slot.sequenceStartPositions.end(), buf);
+ slot.sequenceStartPositions.end(),
+ buf);
buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
if (slot.subSequenceStartPositions.size()) {
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[j].subSequenceStartPositions,
- slot.subSequenceNum + 1,
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
+ slot.subSequenceNum + 1,
+ /* useGpu= */ false);
int* buf =
- cpuArguments[j].subSequenceStartPositions->getMutableData(false);
+ cpuArguments[j].subSequenceStartPositions->getMutableData(false);
std::copy(slot.subSequenceStartPositions.begin(),
- slot.subSequenceStartPositions.end(), buf);
+ slot.subSequenceStartPositions.end(),
+ buf);
buf[slot.subSequenceNum] = slot.sampleNum;
// check subSequenceStartPositions and sequenceStartPositions
cpuArguments[j].checkSubset();
@@ -452,8 +483,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
cpuArguments[i].subSequenceStartPositions;
}
} else {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
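
The fill*Slot bodies reformatted above all funnel raw sample bytes through memcpyWithCheck together with a dataEnd sentinel. For readers without the full file, here is a minimal sketch of such a bounds-checked copy helper, assuming it merely validates the source range before delegating to memcpy (the real helper is defined earlier in PyDataProvider.cpp and may differ in exact signature):

#include <cstddef>
#include <cstring>

#include "paddle/utils/Logging.h"  // assumed source of CHECK_LE

// Sketch only: copy `size` bytes from `src` to `dest`, failing fast when the
// source range would run past `dataEnd`, i.e. when the serialized buffer is
// shorter than its header claims.
static void memcpyWithCheck(void* dest,
                            const void* src,
                            size_t size,
                            const void* dataEnd) {
  CHECK_LE(reinterpret_cast<const char*>(src) + size,
           reinterpret_cast<const char*>(dataEnd))
      << "PyDataProvider sample data is truncated.";
  std::memcpy(dest, src, size);
}
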
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
index 939d9cf725..6bb7c831fd 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ b/paddle/gserver/dataproviders/PyDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -25,7 +24,8 @@ namespace paddle {
class PyDataProvider : public DataProvider {
public:
- PyDataProvider(const DataConfig& config, bool useGpu,
+ PyDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
virtual void reset();
@@ -48,21 +48,27 @@ protected:
void parseHeaderData(const std::string& headerData);
void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
- void fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+ void fillSparseNonValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd);
void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillSlotsByStr(const std::string& samples);
- void handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleDenseSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
- void handleSparseNonValueSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleSparseNonValueSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
- void handleSparseValueSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleSparseValueSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
- void handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleIndexSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
- void handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleStringSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
void resetSlots();
void loadData(const std::vector<std::string>& fileList);
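
All of the fill*Slot declarations above share a cursor-plus-sentinel calling convention (char*& data, const char* dataEnd), and the .cpp bodies read fixed-size headers through readT. A minimal sketch of that helper, assuming it reads one value, advances the cursor, and checks bounds the same way memcpyWithCheck does:

#include <cstring>

// Sketch only: read a single trivially-copyable T from the byte stream and
// advance the cursor. The CHECK_LE mirrors the bounds check visible in
// fillIndexSlot above.
template <typename T>
static T readT(char*& data, const char* dataEnd) {
  CHECK_LE(data + sizeof(T), dataEnd) << "Proto data is truncated.";
  T value;
  std::memcpy(&value, data, sizeof(T));  // memcpy avoids unaligned loads
  data += sizeof(T);
  return value;
}

// Usage, matching the call sites above:
//   slot.sampleNum = readT<unsigned int>(data, dataEnd);
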
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 90391a7c30..967fc9026a 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -34,7 +34,7 @@ namespace paddle {
namespace unittest {
static std::unique_ptr<std::function<void(size_t)>>
- OnPoolFilled;
+ OnPoolFilled;
namespace pydp2 {
@@ -43,15 +43,11 @@ void setOnPoolFilledHook(const std::function<void(size_t)>& callback) {
*OnPoolFilled = callback;
}
-void clearOnPoolFilledHook() {
- OnPoolFilled.reset();
-}
+void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
} // namespace pydp2
} // namespace unittest
-
-
/**
* Slot type
*/
@@ -65,17 +61,13 @@ enum SlotType {
/**
* Sequence type
*/
-enum SeqType {
- SQT_NONE = 0,
- SQT_SEQ,
- SQT_SUBSEQ
-};
+enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
/**
* Cache Type.
*/
enum CacheType {
- NO_CACHE = 0, // Each pass will load data from PyDataProvider2.
+ NO_CACHE = 0, // Each pass will load data from PyDataProvider2.
CACHE_PASS_IN_MEM = 1, // First pass will load data from PyDataProvider2,
// then cache all data in memory. Load data from
// memory in rest passes.
@@ -87,8 +79,8 @@ struct SlotHeader { // Slot Header will parse from python object's slots field.
SeqType seqType;
};
-inline std::ostream& operator << (std::ostream& os, const SlotHeader& header) {
- os <<"Dim = " << header.dim << " Type = " << header.slotType
+inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
+ os << "Dim = " << header.dim << " Type = " << header.slotType
<< " SeqType = " << header.seqType;
return os;
}
@@ -158,7 +150,6 @@ protected:
SlotHeader* headerPtr_;
};
-
/**
* Py Data Provider Cache Interface.
*/
@@ -209,17 +200,13 @@ public:
PyDataProvider2(const DataConfig& config,
const ModelConfig& modelConfig,
bool useGpu)
- :DataProvider(config, useGpu),
- callingContextCreated_(2) {
- if (PyArray_API == NULL)
- import_array();
+ : DataProvider(config, useGpu), callingContextCreated_(2) {
+ if (PyArray_API == NULL) import_array();
auto& args = config.load_data_args();
PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
if (!args.empty()) {
kwargs = callPythonFuncRetPyObj(
- "paddle.trainer.PyDataProvider2",
- "deserialize_args",
- {args});
+ "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
}
py::DictHelper kwargsDict(kwargs);
@@ -245,40 +232,38 @@ public:
* Dtor
* @note will stop loading thread when destructing
*/
- virtual ~PyDataProvider2() {
- resetImpl(false);
- }
+ virtual ~PyDataProvider2() { resetImpl(false); }
private:
void createPyDataObj(const std::string& model,
const std::string& className,
const std::string& fileListName,
- PyObjectPtr && kwargs) {
- LOG(INFO) << "loading dataprovider " << model <<"::" << className;
+ PyObjectPtr&& kwargs // NOLINT
+ ) {
+ LOG(INFO) << "loading dataprovider " << model << "::" << className;
PyObjectPtr module = py::import(model);
PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
- PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
- className.c_str()));
+ PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
CHECK_PY(cls) << "load class " << className.c_str() << "error";
// If there are multiple python instance share same module, the PyObjectPtr
// only for instance will make python reference-count error.
//
// So here, we increase reference count manually.
- if (gModuleClsPtrs_.find((uintptr_t) module.get())
- != gModuleClsPtrs_.end()) {
+ if (gModuleClsPtrs_.find((uintptr_t)module.get()) !=
+ gModuleClsPtrs_.end()) {
// Multi instance use same module
Py_XINCREF(module.get());
Py_XINCREF(moduleDict.get());
} else {
- gModuleClsPtrs_.insert((uintptr_t) module.get());
+ gModuleClsPtrs_.insert((uintptr_t)module.get());
}
- if (gModuleClsPtrs_.find((uintptr_t) cls.get()) != gModuleClsPtrs_.end()) {
+ if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) {
Py_XINCREF(cls.get());
} else {
- gModuleClsPtrs_.insert((uintptr_t) cls.get());
+ gModuleClsPtrs_.insert((uintptr_t)cls.get());
}
PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
@@ -294,8 +279,8 @@ private:
py::ObjectHelper self(this->instance_);
bool ok;
- this->skipShuffle_ = !self.getBoolAttr("should_shuffle",
- &ok /*isBoolType*/);
+ this->skipShuffle_ =
+ !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
if (!ok) {
this->skipShuffle_ = testing; // shuffle when is training, skip shuffle
// when is testing.
@@ -335,12 +320,12 @@ private:
PyObjectPtr headerPtrWrap(hdPtr);
py::ObjectHelper hd(headerPtrWrap);
header.dim = hd.getIntAttrWithError("dim");
- header.seqType = (SeqType) hd.getIntAttrWithError("seq_type");
- header.slotType = (SlotType) hd.getIntAttrWithError("type");
+ header.seqType = (SeqType)hd.getIntAttrWithError("seq_type");
+ header.slotType = (SlotType)hd.getIntAttrWithError("type");
}
DBG << "Data header size " << headers_.size();
- for (auto & header : headers_) {
+ for (auto& header : headers_) {
DBG << header;
}
cache_.reset(IPyDataProviderCache::create(
@@ -351,8 +336,7 @@ private:
loadFileList(fileListName, fileLists_);
PyObject* lst = PyList_New(fileLists_.size());
for (size_t i = 0; i < fileLists_.size(); ++i) {
- PyList_SET_ITEM(lst, i,
- PyString_FromString(fileLists_[i].c_str()));
+ PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
}
return PyObjectPtr(lst);
}
@@ -414,11 +398,12 @@ private:
CHECK(ok) << "CalcBatchSize must return int or long";
}
- if (this->loadThread_){ // wait poolActualSize < poolSize;
+ if (this->loadThread_) { // wait poolActualSize < poolSize;
std::unique_lock<std::mutex> l(mtx_);
- pushCV_.wait(l, [this, additionalBatchSize] {
- return this->poolActualSize_ < poolSize_;
- });
+ pushCV_.wait(l,
+ [this, additionalBatchSize] {
+ return this->poolActualSize_ < poolSize_;
+ });
}
{
@@ -487,14 +472,14 @@ private:
std::vector<std::string> fileLists_;
std::vector<SlotHeader> headers_;
static PyObjectPtr zeroTuple_;
- static std::unordered_set<uintptr_t > gModuleClsPtrs_;
+ static std::unordered_set<uintptr_t> gModuleClsPtrs_;
class PositionRandom {
public:
- inline explicit PositionRandom(bool skipRand):
- eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
+ inline explicit PositionRandom(bool skipRand)
+ : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
- inline size_t operator() (size_t len) {
+ inline size_t operator()(size_t len) {
if (!skipRand_) {
if (!dist_ || dist_->b() != len - 1) {
dist_.reset(new std::uniform_int_distribution<size_t>(0, len - 1));
@@ -525,32 +510,31 @@ public:
* Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random
* select data from datapool.
*/
- void shuffle() {
- }
+ void shuffle() {}
/**
* Not limited size.
*/
- int64_t getSize() {
- return -1;
- }
+ int64_t getSize() { return -1; }
/**
* Loading a batch of data.
*/
- int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
+ int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
std::lock_guard<std::mutex> guard(mutexForReset_);
REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0);
- size_t size = (size_t) size_;
+ size_t size = (size_t)size_;
if (loadThread_) { // loading from thread should wait for data pool ready.
// but, loading from cache, cache object should ensure
// data pool ready.
std::unique_lock<std::mutex> l(mtx_);
- pullCV_.wait(l, [this, &size] {
- return this->poolActualSize_ >= std::max(size, this->minPoolSize_)
- || callingContexts_.empty();
- });
+ pullCV_.wait(l,
+ [this, &size] {
+ return this->poolActualSize_ >=
+ std::max(size, this->minPoolSize_) ||
+ callingContexts_.empty();
+ });
if (unittest::OnPoolFilled) {
(*unittest::OnPoolFilled)(this->poolActualSize_);
@@ -633,35 +617,35 @@ public:
cpuBatch.setSize(bsize);
auto& inArgs = cpuBatch.getStreams();
inArgs.resize(headers_.size());
- std::vector<std::unique_ptr<IFieldScanner> > scanners;
+ std::vector<std::unique_ptr<IFieldScanner>> scanners;
scanners.reserve(headers_.size());
for (auto& header : headers_) {
scanners.emplace_back(IFieldScanner::create(&header));
}
DBG << "Scanner created.";
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->startPrepare(inArgs[i]);
}
- for (auto & d : data) {
+ for (auto& d : data) {
py::SequenceHelper s(d);
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->prepare(inArgs[i], s[i]);
}
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->finishPrepare(inArgs[i]);
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->startFill(inArgs[i]);
}
- for (auto & d : data) {
+ for (auto& d : data) {
py::SequenceHelper s(d);
for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->fill(inArgs[i], s[i]);
}
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->finishFill(inArgs[i]);
}
@@ -679,8 +663,8 @@ public:
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < headers_.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
} else {
@@ -690,31 +674,28 @@ public:
}
};
-std::unordered_set<uintptr_t > PyDataProvider2::gModuleClsPtrs_;
+std::unordered_set<uintptr_t> PyDataProvider2::gModuleClsPtrs_;
PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
-
/**
* Scanner for dense slot.
*/
-class DenseScanner: public IFieldScanner {
+class DenseScanner : public IFieldScanner {
public:
- explicit DenseScanner(SlotHeader* ptr):IFieldScanner(ptr), height_(0) {}
+ explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
/**
* Prepare.
* @param argument target argument
* @param obj each timestep of a sample.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
- ++height_;
- }
+ virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreate(argument.value, height_, headerPtr_->dim,
- false, false);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreate(
+ argument.value, height_, headerPtr_->dim, false, false);
height_ = 0;
}
@@ -723,24 +704,23 @@ public:
* @param argument
* @param obj
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
real* dat = argument.value->getData() + height_ * headerPtr_->dim;
if (PyArray_Check(obj)) {
- auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
- if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
- real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
- auto sz = PyArray_SIZE((PyArrayObject*)obj);
- std::copy(data, data + sz, dat);
- } else {
- LOG(FATAL) << "You should yield float" << sizeof(real) * 8
- << " array";
- }
- } else {
- py::SequenceHelper s(obj);
- // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
- for (size_t i=0; i < headerPtr_->dim; ++i) {
- dat[i] = (real) s.getDouble(i);
- }
+ auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
+ if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
+ real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
+ auto sz = PyArray_SIZE((PyArrayObject*)obj);
+ std::copy(data, data + sz, dat);
+ } else {
+ LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
+ }
+ } else {
+ py::SequenceHelper s(obj);
+ // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+ for (size_t i = 0; i < headerPtr_->dim; ++i) {
+ dat[i] = (real)s.getDouble(i);
+ }
}
++height_;
}
@@ -752,20 +732,18 @@ private:
/**
* Scanner for index slot
*/
-class IndexScanner: public IFieldScanner {
+class IndexScanner : public IFieldScanner {
public:
- explicit IndexScanner(SlotHeader* ptr):IFieldScanner(ptr), cnt_(0) {}
+ explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
/**
* Prepare memory space.
*
* @note obj is a single timestep of sample
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
- ++cnt_;
- }
+ virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
- virtual void finishPrepare(Argument &argument) {
+ virtual void finishPrepare(Argument& argument) {
IVector::resizeOrCreate(argument.ids, cnt_, false);
cnt_ = 0;
}
@@ -773,9 +751,9 @@ public:
/**
* Fill one index to argument.
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
bool ok;
- argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
+ argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
CHECK(ok) << "Cannot cast int " << py::repr(obj);
}
@@ -785,27 +763,25 @@ private:
class SparseNonValueScanner : public IFieldScanner {
public:
- explicit SparseNonValueScanner(SlotHeader* ptr): IFieldScanner(ptr),
- nnz_(0),
- height_(0) {}
+ explicit SparseNonValueScanner(SlotHeader* ptr)
+ : IFieldScanner(ptr), nnz_(0), height_(0) {}
/**
* Prepare memory space
* @note obj is a timestep of one sample.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
+ virtual void prepare(Argument& argument, PyObject* obj) {
++height_;
nnz_ += py::SequenceHelper(obj).size();
}
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
- headerPtr_->dim,
- nnz_, NO_VALUE);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreateSparseMatrix(
+ argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
}
- virtual void startFill(Argument & argument) {
- auto smat = (CpuSparseMatrix*) (argument.value.get());
+ virtual void startFill(Argument& argument) {
+ auto smat = (CpuSparseMatrix*)(argument.value.get());
smat->getRows()[0] = 0;
nnz_ = 0;
height_ = 1;
@@ -818,14 +794,14 @@ public:
virtual void fill(Argument& argument, PyObject* obj) {
py::SequenceHelper s(obj);
auto sz = s.size();
- auto smat = (CpuSparseMatrix*) (argument.value.get());
+ auto smat = (CpuSparseMatrix*)(argument.value.get());
int* row = smat->getRows();
int* col = smat->getCols();
real* dat = smat->getData();
- row[height_] = row[height_-1] + (int)sz;
+ row[height_] = row[height_ - 1] + (int)sz;
for (decltype(sz) i = 0; i < sz; ++i) {
- setData(col+nnz_, dat+nnz_, s[i]);
+ setData(col + nnz_, dat + nnz_, s[i]);
++nnz_;
}
++height_;
@@ -839,7 +815,7 @@ protected:
* @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
* For sparse_value is a Tuple (int, float).
*/
- virtual void setData(int* col, real * dat, PyObject* obj) {
+ virtual void setData(int* col, real* dat, PyObject* obj) {
bool ok;
*col = py::castInt(obj, &ok);
CHECK(ok);
@@ -851,26 +827,25 @@ protected:
class SparseValueScanner : public SparseNonValueScanner {
public:
- explicit SparseValueScanner(SlotHeader *ptr) : SparseNonValueScanner(ptr) {}
+ explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
- headerPtr_->dim,
- nnz_, FLOAT_VALUE);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreateSparseMatrix(
+ argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
}
protected:
- virtual void setData(int *col, real *dat, PyObject *obj) {
+ virtual void setData(int* col, real* dat, PyObject* obj) {
py::SequenceHelper s(obj);
SparseNonValueScanner::setData(col, dat, s[0]);
- *dat = (real) s.getDouble(1);
+ *dat = (real)s.getDouble(1);
}
};
/**
* Sequence Scanner. Scanner for sequence or sub-sequence.
*/
-class SequenceScanner: public IFieldScanner {
+class SequenceScanner : public IFieldScanner {
public:
/**
* Ctor
@@ -879,15 +854,18 @@ public:
* return a sequence start position or a sub-sequence
* start position.
*/
- SequenceScanner(std::unique_ptr<IFieldScanner>&& innerScanner,
- const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
- : IFieldScanner(nullptr), inner_(std::move(innerScanner)),
- cnt_(0), getSeqStartPos_(getSeqStartPos) {}
+ SequenceScanner(
+ std::unique_ptr<IFieldScanner>&& innerScanner,
+ const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
+ : IFieldScanner(nullptr),
+ inner_(std::move(innerScanner)),
+ cnt_(0),
+ getSeqStartPos_(getSeqStartPos) {}
/**
* Start prepare. Invoke inner->startPrepare too.
*/
- virtual void startPrepare(Argument &argument) {
+ virtual void startPrepare(Argument& argument) {
inner_->startPrepare(argument);
}
@@ -895,10 +873,10 @@ public:
* Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
* element of sequence obj.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
+ virtual void prepare(Argument& argument, PyObject* obj) {
py::SequenceHelper s(obj);
++cnt_;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
inner_->prepare(argument, s[i]);
}
}
@@ -906,7 +884,7 @@ public:
/**
* Finish prepare. invoke inner_->finishPrepare too.
*/
- virtual void finishPrepare(Argument &argument) {
+ virtual void finishPrepare(Argument& argument) {
ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
inner_->finishPrepare(argument);
}
@@ -914,7 +892,7 @@ public:
/**
* Start fill. invoke inner->startFill too.
*/
- virtual void startFill(Argument &argument) {
+ virtual void startFill(Argument& argument) {
getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
cnt_ = 1;
inner_->startFill(argument);
@@ -925,13 +903,13 @@ public:
* sequence obj. And set seqStartPos at same time. The seqStartPos will be
* calculated by getSeqStartPos callback passed in ctor.
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
- getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
- (int)getSize(obj);
+ getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
+ (int)getSize(obj);
py::SequenceHelper s(obj);
++cnt_;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
inner_->fill(argument, s[i]);
}
}
@@ -939,9 +917,7 @@ public:
/**
* Finish fill. will invoke inner->finishFill too.
*/
- virtual void finishFill(Argument &argument) {
- inner_->finishFill(argument);
- }
+ virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
protected:
size_t getSize(PyObject* obj) {
@@ -949,7 +925,7 @@ protected:
auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
if (sc) {
size_t sum = 0;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
sum += sc->getSize(s[i]);
}
return sum;
@@ -964,8 +940,7 @@ private:
std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
};
-
-IFieldScanner* IFieldScanner::create(SlotHeader *header) {
+IFieldScanner* IFieldScanner::create(SlotHeader* header) {
IFieldScanner* retv = nullptr;
switch (header->slotType) {
case ST_DENSE:
@@ -989,15 +964,15 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
break;
case SQT_SUBSEQ:
retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
- [](Argument& arg) -> ICpuGpuVectorPtr& {
- return arg.subSequenceStartPositions;
- });
- // fall through, not break;
+ [](Argument& arg) -> ICpuGpuVectorPtr& {
+ return arg.subSequenceStartPositions;
+ });
+ // fall through, not break;
case SQT_SEQ:
retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
- [](Argument& arg) -> ICpuGpuVectorPtr& {
- return arg.sequenceStartPositions;
- });
+ [](Argument& arg) -> ICpuGpuVectorPtr& {
+ return arg.sequenceStartPositions;
+ });
break;
default:
LOG(FATAL) << "Not implemented";
@@ -1010,19 +985,13 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
* No Cache Strategy. Will destruct old data immediately and load data from
* python every pass.
*/
-class NoCacheStrategy: public IPyDataProviderCache {
+class NoCacheStrategy : public IPyDataProviderCache {
public:
- virtual bool reset() {
- return true;
- }
+ virtual bool reset() { return true; }
- virtual void drop(std::deque<PyObjectPtr> *data) {
- data->clear();
- }
+ virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
- virtual std::deque<PyObjectPtr>* load() {
- return nullptr;
- }
+ virtual std::deque<PyObjectPtr>* load() { return nullptr; }
};
/**
@@ -1033,9 +1002,9 @@ public:
*/
class CacheOnePassInMemory : public IPyDataProviderCache {
public:
- CacheOnePassInMemory() : objPool_(new std::deque<PyObjectPtr>()),
- droppedPool_(new std::deque<PyObjectPtr>())
- {}
+ CacheOnePassInMemory()
+ : objPool_(new std::deque<PyObjectPtr>()),
+ droppedPool_(new std::deque<PyObjectPtr>()) {}
virtual bool reset() {
if (objPool_->empty() && droppedPool_->empty()) {
@@ -1048,25 +1017,22 @@ public:
}
}
- virtual void drop(std::deque<PyObjectPtr> *data) {
+ virtual void drop(std::deque<PyObjectPtr>* data) {
size_t orgSize = droppedPool_->size();
droppedPool_->resize(orgSize + data->size());
- for (size_t i=0; i < data->size(); ++i) {
+ for (size_t i = 0; i < data->size(); ++i) {
std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
}
data->clear();
}
- virtual std::deque<PyObjectPtr>* load() {
- return objPool_.get();
- }
+ virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
private:
- std::unique_ptr<std::deque<PyObjectPtr> > objPool_;
- std::unique_ptr<std::deque<PyObjectPtr> > droppedPool_;
+ std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
+ std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
};
-
IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) {
switch (ct) {
case NO_CACHE:
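
The scanner rewrites in PyDataProvider2.cpp keep the two-pass field protocol intact: a prepare pass measures the batch so buffers can be allocated once, then a fill pass copies the data. A hedged sketch of driving a single scanner over one field's column of Python objects (scanOneField is a hypothetical helper; the real loop above interleaves all fields of each sample):

#include <vector>

// Sketch only, mirroring the prepare/fill loop structure visible above.
void scanOneField(IFieldScanner* scanner,
                  Argument& arg,
                  const std::vector<PyObjectPtr>& rows) {
  scanner->startPrepare(arg);
  for (auto& row : rows) scanner->prepare(arg, row.get());  // pass 1: count
  scanner->finishPrepare(arg);  // allocate matrices / vectors once
  scanner->startFill(arg);
  for (auto& row : rows) scanner->fill(arg, row.get());  // pass 2: copy data
  scanner->finishFill(arg);
}
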
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index e397c71c87..8f7d2fb80e 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "Evaluator.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
@@ -33,7 +32,8 @@ private:
str.clear();
int prevLabel = -1;
for (std::vector