Benchmark tool for imgnet (#12305)

* support test using executor without reader

* run imgnet

* update fluid benchmark

* wip

* update

* update all models

* support pyreader

* update

* clean up

* make profile batches contollable

* update API.spec

* update scripts

* clean dockerfile

* update

* clean comments

* add scope argument docstring

* use num_trainers to determine nccl init comms
fix-develop-build.sh
Wu Yi 7 years ago committed by GitHub
parent 8a6b46404f
commit f90c7865f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
@ -27,5 +28,6 @@ ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
ADD models/ /workspace/models/

@ -17,7 +17,8 @@ import argparse
__all__ = ['parse_args', ]
BENCHMARK_MODELS = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
"machine_translation", "resnet", "se_resnext", "vgg", "mnist",
"stacked_dynamic_lstm", "resnet_with_preprocess"
]
@ -67,12 +68,12 @@ def parse_args():
'--cpus',
type=int,
default=1,
help='If cpus > 1, will use ParallelDo to run, else use Executor.')
help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers'],
choices=['cifar10', 'flowers', 'imagenet'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
@ -122,6 +123,11 @@ def parse_args():
type=str,
default="",
help='Directory that contains all the training recordio files.')
parser.add_argument(
'--test_data_path',
type=str,
default="",
help='Directory that contains all the test data (NOT recordio).')
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
@ -130,5 +136,9 @@ def parse_args():
'--no_random',
action='store_true',
help='If set, keep the random seed and do not shuffle the data.')
parser.add_argument(
'--use_lars',
action='store_true',
help='If set, use lars for optimizers, ONLY support resnet module.')
args = parser.parse_args()
return args

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -163,6 +163,19 @@ def gen_job():
volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
# add ceph volumes
volumes.append({
"name": "ceph-data",
"cephfs": {
"monitors": ["192.168.16.23:6789"],
"secretRef": {
"name": "ceph-secret"
},
"user": "admin",
}
})
volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts

@ -13,5 +13,6 @@
# limitations under the License.
__all__ = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
"resnet_with_preprocess"
]

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""seq2seq model for fluid."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
return ndarray
def get_model(args):
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception("machine_translation do not support reader op for now.")
embedding_dim = 512
@ -190,30 +191,27 @@ def get_model(args):
dict_size = 30000
beam_size = 3
max_length = 250
avg_cost, feeding_list = seq_to_seq_net(
embedding_dim,
encoder_size,
decoder_size,
dict_size,
dict_size,
False,
beam_size=beam_size,
max_length=max_length)
# clone from default main program
inference_program = fluid.default_main_program().clone()
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=args.batch_size * args.gpus)
test_batch_generator = paddle.batch(
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost, feeding_list = seq_to_seq_net(
embedding_dim,
encoder_size,
decoder_size,
dict_size,
dict_size,
False,
beam_size=beam_size,
max_length=max_length)
if is_train:
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(dict_size), buf_size=1000),
batch_size=args.batch_size)
paddle.dataset.wmt14.train(dict_size)
if is_train else paddle.dataset.wmt14.test(dict_size),
buf_size=1000),
batch_size=args.batch_size * args.gpus)
return avg_cost, inference_program, optimizer, train_batch_generator, \
test_batch_generator, None
return avg_cost, optimizer, [], batch_generator, None

@ -65,61 +65,50 @@ def cnn_model(data):
return predict
def get_model(args):
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.device == 'CPU' and args.cpus > 1:
places = fluid.layers.get_places(args.cpus)
pd = fluid.layers.ParallelDo(places)
with pd.do():
predict = cnn_model(pd.read_input(images))
label = pd.read_input(label)
def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet.
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
input, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_acc = fluid.layers.accuracy(input=predict, label=label)
pd.write_output(avg_cost)
pd.write_output(batch_acc)
avg_cost, batch_acc = pd()
avg_cost = fluid.layers.mean(avg_cost)
batch_acc = fluid.layers.mean(batch_acc)
else:
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_acc = fluid.layers.accuracy(input=predict, label=label)
# inference program
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
# Optimization
if is_train:
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize()
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
if is_train:
reader = paddle.dataset.mnist.train()
else:
reader = paddle.dataset.mnist.test()
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus)
return avg_cost, opt, [batch_acc], batched_reader, data_file_handle

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -26,7 +26,6 @@ import numpy
import paddle
import paddle.dataset.imdb as imdb
import paddle.fluid as fluid
import paddle.batch as batch
import paddle.fluid.profiler as profiler
word_dict = imdb.word_dict()
@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
return __impl__
def get_model(args):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm do not support reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
def lstm_net(sentence, lstm_size):
sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
rnn = fluid.layers.DynamicRNN()
@ -97,31 +84,47 @@ def get_model(args):
last = fluid.layers.sequence_pool(rnn(), 'last')
logit = fluid.layers.fc(input=last, size=2, act='softmax')
loss = fluid.layers.cross_entropy(
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
return logit
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
adam = fluid.optimizer.Adam()
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm do not support reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
train_reader = batch(
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
logit = lstm_net(sentence, lstm_size)
loss = fluid.layers.cross_entropy(
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
if is_train:
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if is_train:
reader = crop_sentence(imdb.train(word_dict), crop_size)
else:
reader = crop_sentence(imdb.test(word_dict), crop_size)
batched_reader = paddle.batch(
paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
reader, buf_size=25000),
batch_size=args.batch_size * args.gpus)
test_reader = batch(
paddle.reader.shuffle(
crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
batch_size=args.batch_size)
return loss, inference_program, adam, train_reader, test_reader, batch_acc
return loss, adam, [batch_acc], batched_reader, None

@ -25,7 +25,7 @@ import functools
import os
def vgg16_bn_drop(input):
def vgg16_bn_drop(input, is_train=True):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
@ -46,13 +46,13 @@ def vgg16_bn_drop(input):
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2
def get_model(args):
def get_model(args, is_train, main_prog, startup_prog):
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
@ -65,57 +65,56 @@ def get_model(args):
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images, is_train=is_train)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
# Optimization
if is_train:
optimizer = fluid.optimizer.Adam(
learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
# data reader
train_reader = paddle.batch(
if is_train:
reader = paddle.dataset.cifar.train10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
else:
reader = paddle.dataset.cifar.test10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
batched_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle

@ -66,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None

@ -100,14 +100,13 @@ struct NCCLContextMap {
return;
}
std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
// if pass nccl_id here, can assume we are doing multi node training
if (nccl_id == nullptr) {
// if num_trainers == 1, should create a new nccl id for local comms.
if (num_trainers == 1) {
std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
comms.get(), static_cast<int>(order_.size()), order_.data()));
} else {
PADDLE_ENFORCE_GT(num_trainers, 1);
// TODO(wuyi): need to ensure each node have same number of GPUs
PADDLE_ENFORCE_NOT_NULL(nccl_id);
{
int nranks = num_trainers * order_.size();
NCCLGroupGuard gurad;

@ -43,8 +43,9 @@ class ParallelExecutor(object):
num_trainers(int): If greater than 1, NCCL will be initialized with
multiple rank of nodes, each node should have same number of GPUs.
Distributed training will be enabled then. Default 1.
trainer_id(int: Must use together with num_trainers. trainer_id is the
trainer_id(int): Must use together with num_trainers. trainer_id is the
"rank" of current node starts from 0. Default 0.
scope(Scope): scope to run with, default use fluid.global_scope().
Returns:
ParallelExecutor: The initialized ParallelExecutor object.
@ -73,6 +74,7 @@ class ParallelExecutor(object):
build_strategy=None,
num_trainers=1,
trainer_id=0,
scope=None,
**kwargs):
if len(kwargs) != 0:
err_msg = ""
@ -131,7 +133,8 @@ class ParallelExecutor(object):
main = main_program
main = main if main else framework.default_main_program()
scope = executor.global_scope()
if scope == None:
scope = executor.global_scope()
# FIXME(Yancey1989): it's a temporary approach to determinate the distribute
# train program, call self.bcast_param() at the end of each mini-batch.
self.is_dist = True if "recv" in [

Loading…
Cancel
Save