Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantize_transpiler_update

Branch: revert-13637-optimize-opyreader
Author: Dang Qingqing
Commit: 182b24ce3c

@@ -213,9 +213,11 @@ include(configure)  # add paddle env configuration
 if(WITH_GPU)
     include(cuda)
     include(tensorrt)
+endif()
+
+if(WITH_MKL OR WITH_MKLML)
     include(external/anakin)
 elseif()
-    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()

 include(generic)  # simplify cmake module

@@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle
@@ -27,5 +28,6 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
 ADD models/ /workspace/models/

@@ -17,7 +17,8 @@ import argparse

 __all__ = ['parse_args', ]

 BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
+    "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]
@@ -67,12 +68,12 @@ def parse_args():
         '--cpus',
         type=int,
         default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+        help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
     parser.add_argument(
         '--data_set',
         type=str,
         default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
         help='Optional dataset for benchmark.')
     parser.add_argument(
         '--infer_only', action='store_true', help='If set, run forward only.')
@@ -122,6 +123,11 @@ def parse_args():
         type=str,
         default="",
         help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the test data (NOT recordio).')
     parser.add_argument(
         '--use_inference_transpiler',
         action='store_true',
@@ -130,5 +136,9 @@ def parse_args():
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
+    parser.add_argument(
+        '--use_lars',
+        action='store_true',
+        help='If set, use lars for optimizers, ONLY support resnet module.')
     args = parser.parse_args()
     return args
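
For orientation, a hypothetical invocation exercising the new options together (the model name comes from BENCHMARK_MODELS; the data paths are placeholders, not values taken from this diff):

    python fluid_benchmark.py --model resnet --data_set imagenet \
        --data_path /path/to/imagenet/train --test_data_path /path/to/imagenet/val \
        --use_lars --no_random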

[File diff suppressed because it is too large]

[File diff suppressed because it is too large]

@@ -163,6 +163,19 @@ def gen_job():
         volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
         volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})

+    # add ceph volumes
+    volumes.append({
+        "name": "ceph-data",
+        "cephfs": {
+            "monitors": ["192.168.16.23:6789"],
+            "secretRef": {
+                "name": "ceph-secret"
+            },
+            "user": "admin",
+        }
+    })
+    volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
+
     tn["spec"]["template"]["spec"]["volumes"] = volumes
     tn_container["volumeMounts"] = volumeMounts

@@ -13,5 +13,6 @@
 # limitations under the License.

 __all__ = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
+    "resnet_with_preprocess"
 ]

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """seq2seq model for fluid."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
     return ndarray

-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
     if args.use_reader_op:
         raise Exception("machine_translation do not support reader op for now.")
     embedding_dim = 512
@@ -190,6 +191,9 @@ def get_model(args):
     dict_size = 30000
     beam_size = 3
     max_length = 250
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
             avg_cost, feeding_list = seq_to_seq_net(
                 embedding_dim,
                 encoder_size,
@@ -199,21 +203,15 @@ def get_model(args):
                 False,
                 beam_size=beam_size,
                 max_length=max_length)
-    # clone from default main program
-    inference_program = fluid.default_main_program().clone()
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    train_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size * args.gpus)
-    test_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
-    return avg_cost, inference_program, optimizer, train_batch_generator, \
-        test_batch_generator, None
+            if is_train:
+                optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)
+    batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size)
+            if is_train else paddle.dataset.wmt14.test(dict_size),
+            buf_size=1000),
+        batch_size=args.batch_size * args.gpus)
+    return avg_cost, optimizer, [], batch_generator, None

@@ -65,61 +65,50 @@ def cnn_model(data):
     return predict

-def get_model(args):
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1, 1, 28, 28], (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = cnn_model(pd.read_input(images))
-            label = pd.read_input(label)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        # Evaluator
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
-
-    # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
+def get_model(args, is_train, main_prog, startup_prog):
+    # NOTE: mnist is small, we don't implement data sharding yet.
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1, 1, 28, 28], (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                input, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='pixel', shape=[1, 28, 28], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+            predict = cnn_model(images)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            # Evaluator
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+            # Optimization
+            if is_train:
+                opt = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, beta1=0.9, beta2=0.999)
+                opt.minimize()
+            if args.memory_optimize:
+                fluid.memory_optimize(main_prog)
+
+    # Reader
+    if is_train:
+        reader = paddle.dataset.mnist.train()
+    else:
+        reader = paddle.dataset.mnist.test()
+    batched_reader = paddle.batch(
+        reader, batch_size=args.batch_size * args.gpus)
+    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
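
This hunk and the matching ones below move every benchmark model to a single contract: get_model(args, is_train, main_prog, startup_prog) builds its layers inside the passed program pair and returns (loss, optimizer, metrics, batched_reader, reader_handle). A minimal sketch of a caller under that contract (hypothetical driver code, assuming only the signature and return order shown in the diff):

    import paddle.fluid as fluid
    from models.mnist import get_model

    train_prog, startup_prog = fluid.Program(), fluid.Program()
    # optimizer is only constructed when is_train=True; reader_handle is the
    # open_files/py_reader handle, or None when --use_reader_op is not set.
    loss, opt, metrics, batched_reader, reader_handle = get_model(
        args, True, train_prog, startup_prog)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)
    for batch in batched_reader():
        # feeding through fluid.DataFeeder is elided in this sketch
        exe.run(train_prog, fetch_list=[loss.name] + [m.name for m in metrics])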

@@ -27,10 +27,17 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
-from recordio_converter import imagenet_train, imagenet_test
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train, val

-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
     conv1 = fluid.layers.conv2d(
         input=input,
         filter_size=filter_size,
@@ -39,29 +46,31 @@ def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
         padding=padding,
         act=None,
         bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
+    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)

-def shortcut(input, ch_out, stride):
+def shortcut(input, ch_out, stride, is_train=True):
     ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
     if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        return conv_bn_layer(
+            input, ch_out, 1, stride, 0, None, is_train=is_train)
     else:
         return input

-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+def basicblock(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+def bottleneck(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
+    conv3 = conv_bn_layer(
+        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
@@ -72,7 +81,11 @@ def layer_warp(block_func, input, ch_out, count, stride):
     return res_out

-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):
     cfg = {
         18: ([2, 2, 2, 1], basicblock),
@@ -115,8 +128,9 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
     return out

-def get_model(args):
+def _model_reader_dshape_classdim(args, is_train):
     model = resnet_cifar10
+    reader = None
     if args.data_set == "cifar10":
         class_dim = 10
         if args.data_format == 'NCHW':
@@ -124,8 +138,10 @@ def get_model(args):
         else:
             dshape = [32, 32, 3]
         model = resnet_cifar10
-        train_reader = paddle.dataset.cifar.train10()
-        test_reader = paddle.dataset.cifar.test10()
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
     elif args.data_set == "flowers":
         class_dim = 102
         if args.data_format == 'NCHW':
@@ -133,8 +149,10 @@ def get_model(args):
         else:
             dshape = [224, 224, 3]
         model = resnet_imagenet
-        train_reader = paddle.dataset.flowers.train()
-        test_reader = paddle.dataset.flowers.test()
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
     elif args.data_set == "imagenet":
         class_dim = 1000
         if args.data_format == 'NCHW':
@@ -145,64 +163,89 @@ def get_model(args):
         if not args.data_path:
             raise Exception(
                 "Must specify --data_path when training with imagenet")
-        train_reader = imagenet_train(args.data_path)
-        test_reader = imagenet_test(args.data_path)
-
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + dshape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        input, label = fluid.layers.read_file(data_file)
-    else:
-        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = model(pd.read_input(input), class_dim)
-            label = pd.read_input(label)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        predict = model(input, class_dim)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-
-    batched_train_reader = paddle.batch(
-        train_reader if args.no_random else paddle.reader.shuffle(
-            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus,
-        drop_last=True)
-    batched_test_reader = paddle.batch(
-        test_reader, batch_size=args.batch_size, drop_last=True)
-
-    return avg_cost, inference_program, optimizer, batched_train_reader,\
-        batched_test_reader, batch_acc
+        if not args.use_reader_op:
+            if is_train:
+                reader = train()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train(xmap=False)
+            else:
+                reader = val(xmap=False)
+    return model, reader, dshape, class_dim

+def get_model(args, is_train, main_prog, startup_prog):
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+            predict = model(input, class_dim, is_train=is_train)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+            # configure optimize
+            optimizer = None
+            if is_train:
+                if args.use_lars:
+                    lars_decay = 1.0
+                else:
+                    lars_decay = 0.0
+                total_images = 1281167 / trainer_count
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
+    else:
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader if args.no_random else paddle.reader.shuffle(
+                    reader, buf_size=5120),
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
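
The reader-op path above now builds a fluid.layers.py_reader and feeds it by decorating a batched paddle reader, replacing the old open_files pipeline. A sketch of the loop that drives it (hypothetical driver code; the start/EOFException/reset protocol is the standard py_reader usage rather than something shown in this diff):

    pyreader.start()  # launch the feeding thread for this pass
    try:
        while True:
            # no feed= argument: batches arrive from the decorated reader
            loss_v, acc1, acc5 = exe.run(
                main_prog,
                fetch_list=[avg_cost.name, batch_acc1.name, batch_acc5.name])
    except fluid.core.EOFException:
        pyreader.reset()  # drain the queue before starting the next pass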

[File diff suppressed because it is too large]

[File diff suppressed because it is too large]

@@ -26,7 +26,6 @@ import numpy
 import paddle
 import paddle.dataset.imdb as imdb
 import paddle.fluid as fluid
-import paddle.batch as batch
 import paddle.fluid.profiler as profiler

 word_dict = imdb.word_dict()
@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
     return __impl__

-def get_model(args):
-    if args.use_reader_op:
-        raise Exception(
-            "stacked_dynamic_lstm do not support reader op for now.")
-    lstm_size = 512
-    emb_dim = 512
-    crop_size = 1500
-    data = fluid.layers.data(
-        name="words", shape=[1], lod_level=1, dtype='int64')
-    sentence = fluid.layers.embedding(
-        input=data, size=[len(word_dict), emb_dim])
+def lstm_net(sentence, lstm_size):
     sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

     rnn = fluid.layers.DynamicRNN()
@@ -97,6 +84,24 @@ def get_model(args):
     last = fluid.layers.sequence_pool(rnn(), 'last')
     logit = fluid.layers.fc(input=last, size=2, act='softmax')
+    return logit

+def get_model(args, is_train, main_prog, startup_prog):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            data = fluid.layers.data(
+                name="words", shape=[1], lod_level=1, dtype='int64')
+            sentence = fluid.layers.embedding(
+                input=data, size=[len(word_dict), emb_dim])
+            logit = lstm_net(sentence, lstm_size)
             loss = fluid.layers.cross_entropy(
                 input=logit,
                 label=fluid.layers.data(
@@ -108,20 +113,18 @@ def get_model(args):
             batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
                 shape=[1], dtype='int64'), total=batch_size_tensor)

-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    adam = fluid.optimizer.Adam()
-
-    train_reader = batch(
-        paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size * args.gpus)
-    test_reader = batch(
-        paddle.reader.shuffle(
-            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)
-
-    return loss, inference_program, adam, train_reader, test_reader, batch_acc
+            if is_train:
+                adam = fluid.optimizer.Adam()
+                adam.minimize(loss)
+
+    if is_train:
+        reader = crop_sentence(imdb.train(word_dict), crop_size)
+    else:
+        reader = crop_sentence(imdb.test(word_dict), crop_size)
+    batched_reader = paddle.batch(
+        paddle.reader.shuffle(
+            reader, buf_size=25000),
+        batch_size=args.batch_size * args.gpus)
+
+    return loss, adam, [batch_acc], batched_reader, None

@@ -25,7 +25,7 @@ import functools
 import os

-def vgg16_bn_drop(input):
+def vgg16_bn_drop(input, is_train=True):
     def conv_block(input, num_filter, groups, dropouts):
         return fluid.nets.img_conv_group(
             input=input,
@@ -46,13 +46,13 @@ def vgg16_bn_drop(input):
     drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
     fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
     fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
     return fc2

-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
     if args.data_set == "cifar10":
         classdim = 10
         if args.data_format == 'NCHW':
@@ -65,29 +65,31 @@ def get_model(args):
         data_shape = [3, 224, 224]
     else:
         data_shape = [224, 224, 3]
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + data_shape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(
-            name='data', shape=data_shape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1] + data_shape, (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                images, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='data', shape=data_shape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
             # Train program
-    net = vgg16_bn_drop(images)
+            net = vgg16_bn_drop(images, is_train=is_train)
             predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
             cost = fluid.layers.cross_entropy(input=predict, label=label)
             avg_cost = fluid.layers.mean(x=cost)
@@ -96,26 +98,23 @@ def get_model(args):
             batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
             batch_acc = fluid.layers.accuracy(
                 input=predict, label=label, total=batch_size_tensor)
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
             # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+            if is_train:
+                optimizer = fluid.optimizer.Adam(
+                    learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)
     # data reader
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
-    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
+    if is_train:
+        reader = paddle.dataset.cifar.train10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
+    else:
+        reader = paddle.dataset.cifar.test10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
+    batched_reader = paddle.batch(
+        paddle.reader.shuffle(
+            reader, buf_size=5120),
+        batch_size=args.batch_size * args.gpus)
+    return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle

@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
 set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
 set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)

-# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
-
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)
 include_directories(${ANAKIN_INCLUDE}/saber/core/)
@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-reorder
     -Wno-error=cpp)

+if(WITH_GPU)
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
+else()
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
+endif()
 ExternalProject_Add(
     extern_anakin
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS ${MKLML_PROJECT}
     GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
-    GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2"
+    GIT_TAG "3c8554f4978628183566ab7dd6c1e7e66493c7cd"
     PREFIX ${ANAKIN_SOURCE_DIR}
     UPDATE_COMMAND ""
-    CMAKE_ARGS -DUSE_GPU_PLACE=YES
+    CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
               -DUSE_X86_PLACE=YES
               -DBUILD_WITH_UNIT_TEST=NO
               -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
               -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-              -DCUDNN_ROOT=${CUDNN_ROOT}
-              -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
               -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
               ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}

@@ -145,12 +145,12 @@ copy(memory_lib
 set(inference_deps paddle_fluid_shared paddle_fluid)

 set(module "inference/api")
-if (WITH_ANAKIN AND WITH_GPU)
+if (WITH_ANAKIN AND WITH_MKL)
     copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
         SRCS
         ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api*  # compiled anakin api
         ${ANAKIN_INSTALL_DIR}  # anakin release
-        DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
+        DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
     list(APPEND inference_deps anakin_inference_lib)
 endif()

@@ -822,6 +822,14 @@ pad
 .. autofunction:: paddle.fluid.layers.pad
     :noindex:

+.. _api_fluid_layers_pad_constant_like:
+
+pad_constant_like
+-----------------
+
+.. autofunction:: paddle.fluid.layers.pad_constant_like
+    :noindex:
+
 .. _api_fluid_layers_label_smooth:

 label_smooth
@@ -1145,6 +1153,14 @@ sigmoid
 .. autofunction:: paddle.fluid.layers.sigmoid
     :noindex:

+.. _api_fluid_layers_hsigmoid:
+
+hsigmoid
+--------
+
+.. autofunction:: paddle.fluid.layers.hsigmoid
+    :noindex:
+
 .. _api_fluid_layers_logsigmoid:

 logsigmoid

[File diff suppressed because one or more lines are too long]

@@ -66,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla
 paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
 paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspilerConfig.__init__
-paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
+paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None))
 paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None

@@ -1,14 +1,21 @@
 set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
 file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
 file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
-function(pass_library TARGET)
+
+# Usage: pass_library(target inference) will append to paddle_inference_pass.h
+function(pass_library TARGET DEST)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass)
-    file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
-    set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
+    # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
+    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
+        message(STATUS "add pass ${TARGET} ${DEST}")
+        file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
+        set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+    endif()
 endfunction()

 cc_library(node SRCS node.cc DEPS proto_desc)
@@ -18,13 +25,15 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)

-pass_library(graph_to_program_pass)
-pass_library(graph_viz_pass)
-pass_library(fc_fuse_pass)
-pass_library(attention_lstm_fuse_pass)
-pass_library(infer_clean_graph_pass)
-pass_library(fc_lstm_fuse_pass)
-pass_library(seq_concat_fc_fuse_pass)
+pass_library(graph_to_program_pass base)
+pass_library(graph_viz_pass base)
+pass_library(fc_fuse_pass inference)
+pass_library(attention_lstm_fuse_pass inference)
+pass_library(infer_clean_graph_pass inference)
+pass_library(fc_lstm_fuse_pass inference)
+pass_library(fc_gru_fuse_pass inference)
+pass_library(seq_concat_fc_fuse_pass inference)
+
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)

@@ -29,39 +29,27 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
   std::unordered_set<Node*> nodes2delete;

   GraphPatternDetector gpd;
-  // BuildFCPattern(gpd.mutable_pattern());
   auto* x = gpd.mutable_pattern()
                 ->NewNode("fc_fuse/x")
                 ->AsInput()
                 ->assert_is_op_input("mul", "X");
-  patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/);
-
-#define GET_NODE(id)                                                         \
-  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
-                 "pattern has no Node called %s", #id);                      \
-  auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id));        \
-  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);
+  patterns::FC fc_pattern(gpd.mutable_pattern(), "fc_fuse");
+  fc_pattern(x, true /*with bias*/);

   int found_fc_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
     VLOG(4) << "handle FC fuse";
-    // Currently, there is no FC op available, so I will just simulate the
-    // scenerio.
-    // FC's fusion is simple, just op fuse, no need to process the
-    // parameters.
-    GET_NODE(x);                // x
-    GET_NODE(w);                // Y
-    GET_NODE(fc_bias);          // bias
-    GET_NODE(fc_out);           // Out
-    GET_NODE(mul);              // MUL op
-    GET_NODE(elementwise_add);  // ELEMENT_ADD op
-    GET_NODE(mul_out);          // tmp
-#undef GET_NODE
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);

     // Create an FC Node.
     OpDesc desc;
-    std::string fc_x_in = x->Name();
+    std::string fc_x_in = subgraph.at(x)->Name();
     std::string fc_Y_in = w->Name();
     std::string fc_bias_in = fc_bias->Name();
     std::string fc_out_out = fc_out->Name();
@@ -73,7 +61,8 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
     auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
     GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});

-    IR_NODE_LINK_TO(x, fc_node);
+    PADDLE_ENFORCE(subgraph.count(x));
+    IR_NODE_LINK_TO(subgraph.at(x), fc_node);
     IR_NODE_LINK_TO(w, fc_node);
     IR_NODE_LINK_TO(fc_bias, fc_node);
     IR_NODE_LINK_TO(fc_node, fc_out);

@@ -0,0 +1,185 @@ (new file: paddle/fluid/framework/ir/fc_gru_fuse_pass.cc)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
static int BuildFusion(Graph* graph, const std::string& name_scope,
Scope* scope, bool with_fc_bias) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
// Create pattern.
patterns::FC fc_pattern(pattern, name_scope);
patterns::GRU gru_pattern(pattern, name_scope);
PDNode* x =
pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable();
auto* fc_out = fc_pattern(x, with_fc_bias);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
gru_pattern(fc_out);
// Create New OpDesc
auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h,
Node* bias, Node* hidden, Node* fc_bias) {
OpDesc op_desc;
op_desc.SetType("fusion_gru");
#define NEW_NAME(x) name_scope + "/at." #x ".new"
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
SET_IN(X, x);
SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h);
if (with_fc_bias) {
op_desc.SetInput("Bias", {NEW_NAME(bias) + bias->Name()});
} else {
SET_IN(Bias, bias);
}
#undef SET_IN
op_desc.SetInput("H0", {});
op_desc.SetOutput("Hidden", {hidden->Name()});
op_desc.SetAttr("is_reverse", gru->Op()->GetAttr("is_reverse"));
// TODO(TJ): This should be a option for infer
op_desc.SetAttr("use_seq", true);
#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)})
SET_IMTERMEDIATE_OUT(ReorderedH0);
SET_IMTERMEDIATE_OUT(XX);
SET_IMTERMEDIATE_OUT(BatchedInput);
SET_IMTERMEDIATE_OUT(BatchedOut);
#undef SET_IMTERMEDIATE_OUT
auto* op = graph->CreateOpNode(&op_desc);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
PADDLE_ENFORCE(scope);
if (with_fc_bias) {
// Fusion GRU bias = fcbias + grubias
auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias->Name());
auto* out_bias_tensor =
fusion_bias_var->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(fusion_bias_var);
auto* gru_bias_var = scope->FindVar(bias->Name());
auto* fc_bias_var = scope->FindVar(fc_bias->Name());
PADDLE_ENFORCE(gru_bias_var);
PADDLE_ENFORCE(fc_bias_var);
const auto& gru_bias_tenosr = gru_bias_var->Get<framework::LoDTensor>();
const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
// new bias = fc bias + gru bias
out_bias_tensor->Resize(gru_bias_tenosr.dims());
auto* data = out_bias_tensor->mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < out_bias_tensor->numel(); i++) {
data[i] =
fc_bias_tensor.data<float>()[i] + gru_bias_tenosr.data<float>()[i];
}
}
#undef GET_NODE
#define NEW_IMTERMEDIATE_OUT(key) \
scope->Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
NEW_IMTERMEDIATE_OUT(ReorderedH0);
NEW_IMTERMEDIATE_OUT(XX);
NEW_IMTERMEDIATE_OUT(BatchedInput);
NEW_IMTERMEDIATE_OUT(BatchedOut);
#undef NEW_NAME
#undef NEW_IMTERMEDIATE_OUT
IR_NODE_LINK_TO(x, op);
IR_NODE_LINK_TO(weight_x, op);
IR_NODE_LINK_TO(weight_h, op);
IR_NODE_LINK_TO(bias, op); // actually should link to new bias if have
IR_NODE_LINK_TO(op, hidden);
// h0?
return op;
};
int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
auto* x_n = subgraph.at(x);
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, gru_pattern);
// nodes need be removed
GET_IR_NODE_FROM_SUBGRAPH(BatchGate, BatchGate, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(BatchResetHiddenPrev, BatchGate, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchGate, gru_pattern);
if (with_fc_bias) {
GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul, gru, elementwise_add, fc_bias, fc_out, mul_out, BatchGate,
BatchResetHiddenPrev, BatchHidden});
GraphSafeRemoveNodes(graph, marked_nodes);
} else {
gru_creater(gru, x_n, w, Weight, Bias, Hidden, nullptr);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul, gru, BatchGate, BatchResetHiddenPrev, BatchHidden});
GraphSafeRemoveNodes(graph, marked_nodes);
}
#undef GET_NODE
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
false /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);

@@ -0,0 +1,50 @@ (new file: paddle/fluid/framework/ir/fc_gru_fuse_pass.h)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// The FCGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op.
class FCGRUFusePass : public FusePassBase {
public:
virtual ~FCGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_gru_fuse"};
};
// Just FC without bias
class MulGRUFusePass : public FusePassBase {
public:
virtual ~MulGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_nobias_gru_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle

@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
 #include <string>
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -19,44 +20,29 @@ namespace paddle {
 namespace framework {
 namespace ir {

-std::string GenNodeName(const std::string& prefix, const std::string& name) {
-  return prefix + "/" + name;
-}
-
-void BuildPattern(PDPattern* pattern, const std::string& name_scope,
-                  bool with_fc_bias) {
-  PDNode* x = pattern->NewNode(name_scope, "x")
-                  ->assert_is_op_input("mul")
-                  ->assert_var_not_persistable();
-  auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
-  fc_out->AsIntermediate();  // fc_out is a tmp var, will be removed after fuse.
-  patterns::LSTM(pattern, name_scope, fc_out);
-  // LOG(INFO) << "\n" << pattern->DotString();
-}
-
 int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
                 bool with_fc_bias) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
-  BuildPattern(pattern, name_scope, with_fc_bias);
+
+  // Build pattern
+  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
+                  ->assert_is_op_input("mul")
+                  ->assert_var_not_persistable();
+  patterns::FC fc_pattern(pattern, name_scope);
+
+  // fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
+  auto* fc_out = fc_pattern(x, with_fc_bias)->AsIntermediate();
+  patterns::LSTM lstm_pattern(pattern, name_scope);
+  lstm_pattern(fc_out);

   // Create New OpDesc
-  auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
-                          int bias, int hidden, int cell, int xx, int fc_bias) {
-#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
-    GET_NODE(input);
-    GET_NODE(weight_x);
-    GET_NODE(weight_h);
-    GET_NODE(bias);
-    GET_NODE(hidden);
-    GET_NODE(cell);
-    GET_NODE(xx);
-    GET_NODE(lstm);
+  auto lstm_creator = [&](Node* lstm, Node* input, Node* weight_x,
+                          Node* weight_h, Node* bias, Node* hidden, Node* cell,
+                          Node* xx, Node* fc_bias) {
     OpDesc op_desc;
     op_desc.SetType("fusion_lstm");
-#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()});
+#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
     SET_IN(X, input);
     SET_IN(WeightX, weight_x);
     SET_IN(WeightH, weight_h);
@@ -69,13 +55,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
       auto* bias_var = scope->Var(new_bias_var);
       PADDLE_ENFORCE(bias_var);
       auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
-      auto* lstm_bias_var = scope->FindVar(bias_n->Name());
+      auto* lstm_bias_var = scope->FindVar(bias->Name());
       PADDLE_ENFORCE(lstm_bias_var);
       const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
       bias_tensor->Resize(lstm_bias_tensor.dims());
-      GET_NODE(fc_bias);
-      auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
+      auto* fc_bias_var = scope->FindVar(fc_bias->Name());
       const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();

       auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace());
@@ -86,31 +71,36 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
       }
       op_desc.SetInput("Bias", {new_bias_var});
     }
-#undef GET_NODE

     // Create temp variables.
-    scope->Var(name_scope + "/BatchedInput.new")
-        ->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/BatchCellPreAct.new")
-        ->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/BatchedGate.new")
-        ->GetMutable<framework::LoDTensor>();
+    const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
+    const std::string BatchedCellPreAct =
+        patterns::UniqueKey("BatchedCellPreAct");
+    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+
+    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();

     op_desc.SetInput("H0", {});
     op_desc.SetInput("C0", {});
-    op_desc.SetOutput("Hidden", {hidden_n->Name()});
-    op_desc.SetOutput("Cell", {cell_n->Name()});
-    op_desc.SetOutput("XX", {xx_n->Name()});
-    op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
-    op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
-    op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
-    op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
-    op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
+    op_desc.SetOutput("Hidden", {hidden->Name()});
+    op_desc.SetOutput("Cell", {cell->Name()});
+    op_desc.SetOutput("XX", {xx->Name()});
+    op_desc.SetOutput("BatchedGate", {BatchedGate});
+    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
+    op_desc.SetOutput("BatchedInput", {BatchedInput});
+    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
+    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
     // TODO(TJ): get from attr
     op_desc.SetAttr("use_seq", true);

-#define TMP_NAME(x) "at.new.tmp." #x
-#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+#define OP_SET_OUT(x)                            \
+  const std::string x = patterns::UniqueKey(#x); \
+  op_desc.SetOutput(#x, {x});                    \
+  scope->Var(x)->GetMutable<LoDTensor>()
     OP_SET_OUT(BatchedCell);
     OP_SET_OUT(BatchedHidden);
     OP_SET_OUT(ReorderedH0);
@@ -118,22 +108,11 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
 #undef OP_SET_OUT

     auto* op = graph->CreateOpNode(&op_desc);
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
-
-#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>()
-    TMP_NEW(BatchedCell);
-    TMP_NEW(BatchedHidden);
-    TMP_NEW(ReorderedH0);
-    TMP_NEW(ReorderedC0);
-#undef TMP_NEW
-#undef TMP_NAME
-
-    IR_NODE_LINK_TO(input_n, op);
-    IR_NODE_LINK_TO(weight_x_n, op);
-    IR_NODE_LINK_TO(weight_h_n, op);
-    IR_NODE_LINK_TO(bias_n, op);
-    IR_NODE_LINK_TO(op, hidden_n);
+    IR_NODE_LINK_TO(input, op);
+    IR_NODE_LINK_TO(weight_x, op);
+    IR_NODE_LINK_TO(weight_h, op);
+    IR_NODE_LINK_TO(bias, op);
+    IR_NODE_LINK_TO(op, hidden);

     return op;
   };
@@ -141,39 +120,32 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-#define GET_NODE(name__)                                \
-  std::string name__##key = name_scope + "/" + #name__; \
-  auto* name__##n = pattern->RetrieveNode(name__##key); \
-  PADDLE_ENFORCE(name__##n);                            \
-  PADDLE_ENFORCE(subgraph.count(name__##n));            \
-  Node* name__##_n = subgraph.at(name__##n);            \
-  int name__ __attribute__((unused)) = name__##_n->id();
-
-    GET_NODE(x);
-    GET_NODE(w);
-    GET_NODE(mul);
-    GET_NODE(fc_out);
-    GET_NODE(Weight);
-    GET_NODE(lstm);
-    GET_NODE(Bias);
-    GET_NODE(Hidden);
-    GET_NODE(Cell);
+    GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);

     if (with_fc_bias) {
-      GET_NODE(fc_bias);
-      GET_NODE(elementwise_add);
-      lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+      lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
+                   fc_bias);
       // Remove unneeded nodes.
       std::unordered_set<const Node*> marked_nodes(
-          {mul_n, lstm_n, elementwise_add_n});
+          {mul, lstm, elementwise_add});
       GraphSafeRemoveNodes(graph, marked_nodes);
     } else {
-      lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
+      lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
+                   nullptr);
       // Remove unneeded nodes.
-      std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
+      std::unordered_set<const Node*> marked_nodes({mul, lstm});
       GraphSafeRemoveNodes(graph, marked_nodes);
     }
-#undef GET_NODE

     ++fusion_count;
   };

@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {
 namespace framework {
@@ -106,8 +107,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
   for (auto& pdnode : pattern_.nodes()) {
     if (!pdnodes2nodes_.count(pdnode.get())) {
       VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
-      // return false;
+      return false;
     }
   }
   for (auto& item : pdnodes2nodes_) {
@@ -517,61 +517,51 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
   return false;
 }

-PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
-                     PDNode* x, bool with_bias) {
-  // Create Operators
-  PDNode* elementwise_add_op{nullptr};
-  auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
-  if (with_bias) {
-    elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
-                             ->assert_is_op("elementwise_add");
-  }
-  // Create variables
-  // w
-  auto* mul_weight_var = pattern->NewNode(name_scope, "w")
-                             ->AsInput()
-                             ->assert_is_persistable_var()
-                             ->assert_is_op_nth_input("mul", "Y", 0);
-  PDNode* mul_out_var{nullptr};
-  if (with_bias) {
-    // intermediate variable, will be removed in the IR after fuse.
-    mul_out_var = pattern->NewNode(name_scope, "mul_out")
-                      ->AsIntermediate()
-                      ->assert_is_only_output_of_op("mul")
-                      ->assert_is_op_input("elementwise_add");
-  }
-  PDNode *bias{nullptr}, *fc_out{nullptr};
-  if (with_bias) {
-    // bias
-    bias = pattern->NewNode(name_scope, "fc_bias")
-               ->assert_is_op_input("elementwise_add")
-               ->AsInput();
-    // output
-    fc_out = pattern->NewNode(name_scope, "fc_out")
-                 ->AsOutput()
-                 ->assert_is_op_output("elementwise_add");
-  } else {
-    fc_out = pattern->NewNode(name_scope, "fc_out")
-                 ->AsOutput()
-                 ->assert_is_op_output("mul");
-  }
-  if (with_bias) {
-    mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
-    elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
-  } else {
-    mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
-  }
-  return fc_out;
-}
+PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
+                                 bool with_bias) {
+  // Create shared nodes.
+  x->assert_is_op_input("mul", "X");
+  auto* mul = pattern->NewNode(mul_repr())->assert_is_op("mul");
+
+  auto* mul_w_var = pattern->NewNode(w_repr())
+                        ->AsInput()
+                        ->assert_is_persistable_var()
+                        ->assert_is_op_input("mul", "Y");
+
+  auto* mul_out_var =
+      pattern->NewNode(mul_out_repr())->assert_is_op_output("mul");
+
+  if (!with_bias) {  // not with bias
+    // Add links.
+    mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var});
+    return mul_out_var;
+  } else {  // with bias
+    mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+    // Create operators.
+    auto* elementwise_add = pattern->NewNode(elementwise_add_repr())
+                                ->assert_is_op("elementwise_add");
+    // Create variables.
+    auto* bias = pattern->NewNode(bias_repr())
+                     ->assert_is_op_input("elementwise_add")
+                     ->AsInput();
+
+    auto* fc_out = pattern->NewNode(Out_repr())
+                       ->AsOutput()
+                       ->assert_is_op_output("elementwise_add");
+
+    mul->LinksFrom({mul_w_var, x}).LinksTo({mul_out_var});
+    elementwise_add->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
+    return fc_out;
+  }
+}

-PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
-                       PDNode* x) {
+PDNode* patterns::LSTM::operator()(PDNode* x) {
   x->assert_is_op_input("lstm", "Input");
-  auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
+  auto* lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");

-#define NEW_NODE(arg__, io__)                        \
-  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
-                    ->assert_is_op_##io__("lstm", #arg__);
+#define NEW_NODE(arg__, io__) \
+  auto* arg__ =               \
+      pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__);

   // Currently, the H0 and C0 are optional
   // TODO(Superjomn) upgrade the fuse framework to support optional.
@@ -584,11 +574,42 @@ PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
   NEW_NODE(Cell, output);
   NEW_NODE(BatchGate, output);
   NEW_NODE(BatchCellPreAct, output);
+#undef NEW_NODE

   lstm_op->LinksFrom({x, Weight, Bias});
   lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
   return Hidden;
 }

+PDNode* patterns::GRU::operator()(PDNode* x) {
+  x->assert_is_op_input("gru", "Input");
+  auto* gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru");
+#define NEW_NODE(arg__, io__) \
+  auto* arg__ =               \
+      pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__);
+
+  NEW_NODE(Weight, input);
+  // TODO(Superjomn): upgrade the fuse framework to support optional.
+  // H0 and bias are optional
+  NEW_NODE(Bias, input);  // also optional
+  // NEW_NODE(H0, input);
+
+  NEW_NODE(Hidden, output);
+  // below are intermediate
+  NEW_NODE(BatchGate, output);
+  NEW_NODE(BatchResetHiddenPrev, output);
+  NEW_NODE(BatchHidden, output);
+#undef NEW_NODE
+
+  BatchGate->AsIntermediate();
+  BatchResetHiddenPrev->AsIntermediate();
+  BatchHidden->AsIntermediate();
+
+  gru_op->LinksFrom({x, Weight, Bias});
+  gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
+  return Hidden;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

[Some files were not shown because too many files have changed in this diff]
