From 47630a4a9e8b54dc27184586d22559b427525785 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 1 Jun 2018 17:28:24 +0800 Subject: [PATCH 1/9] fluid benchmark support recordio reader --- benchmark/fluid/Dockerfile | 2 +- benchmark/fluid/README.md | 9 ++ benchmark/fluid/fluid_benchmark.py | 55 +++++--- benchmark/fluid/models/machine_translation.py | 2 + benchmark/fluid/models/mnist.py | 21 ++- benchmark/fluid/models/resnet.py | 20 ++- .../fluid/models/stacked_dynamic_lstm.py | 3 + benchmark/fluid/models/vgg.py | 21 ++- benchmark/fluid/recordio_converter.py | 133 ++++++++++++++++++ python/paddle/fluid/layers/io.py | 2 +- 10 files changed, 241 insertions(+), 27 deletions(-) create mode 100644 benchmark/fluid/recordio_converter.py diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile index 46140a9d1b..5d98a9b3c4 100644 --- a/benchmark/fluid/Dockerfile +++ b/benchmark/fluid/Dockerfile @@ -19,4 +19,4 @@ ADD *.whl / RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py dataset.py models/ /workspace/ +ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 1b0c7dce8b..c17cab999b 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -42,6 +42,15 @@ Currently supported `--model` argument include: PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2 ``` +## Prepare the RecordIO file to Achieve Better Performance + +Run the following command will generate RecordIO files like "mnist.recordio" under the path +and batch_size you choose: + +```bash +python -c 'from recordio_converter import *; prepare_mnist("data", 32)' +``` + ## Run Distributed Benchmark on Kubernetes Cluster You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index c1d458970a..9dce7d5647 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -44,7 +44,6 @@ def parse_args(): type=float, default=0.001, help='The minibatch size.') - # TODO(wuyi): add "--use_fake_data" option back. parser.add_argument( '--skip_batch_num', type=int, @@ -106,6 +105,16 @@ def parse_args(): default='local', choices=['local', 'pserver', 'nccl2'], help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' + ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') args = parser.parse_args() return args @@ -208,11 +217,13 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) exe.run(startup_prog) - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] - feeder = fluid.DataFeeder(feed_var_list, place) + + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): @@ -223,9 +234,12 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, num_samples = 0 if iters == args.iterations: break - loss = exe.run(train_prog, - feed=feeder.feed(data), - fetch_list=[avg_loss]) + if args.use_reader_op: + loss = exe.run(train_prog, fetch_list=[avg_loss]) + else: + loss = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=[avg_loss]) iters += 1 num_samples += len(data) train_losses.append(loss) @@ -251,10 +265,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + # generate fake: if args.use_fake_data: for var in feed_var_list: @@ -271,7 +289,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, "value": 1.0, "dtype": var.dtype}) - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) if nccl_id_var and trainer_id == 0: #FIXME(wuyi): wait other trainer to start listening time.sleep(30) @@ -288,7 +305,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_trainers=num_trainers, trainer_id=trainer_id) - feeder = fluid.DataFeeder(feed_var_list, place) for pass_id in range(args.pass_num): num_samples = 0 iters = 0 @@ -304,7 +320,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_samples = 0 if iters == args.iterations: break - if args.use_fake_data: + # NOTE: if use reader ops, the input data is not splited to multiple cards + if args.use_reader_op and iters >= args.iterations / args.gpus: + break + if args.use_fake_data or args.use_reader_op: loss, = exe.run([avg_loss.name]) else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) @@ -316,6 +335,8 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, print("Pass %d, batch %d, loss %s" % (pass_id, batch_id, np.array(loss))) train_elapsed = time.time() - start_time + if args.use_reader_op: + num_samples = num_samples * args.gpus examples_per_sec = num_samples / train_elapsed print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % (num_samples, train_elapsed, examples_per_sec)) @@ -342,7 +363,7 @@ def main(): # the unique trainer id, starting from 0, needed by trainer # only nccl_id_var, num_trainers, trainer_id = ( - None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1"))) + None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0"))) if args.use_cprof: pr = cProfile.Profile() diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 635b3373dd..3024882725 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor): def get_model(args): + if args.use_reader_op: + raise Exception("machine_translation do not support reader op for now.") embedding_dim = 512 encoder_size = 512 decoder_size = 512 diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index d264bfc12b..5d3da68daf 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -20,6 +20,7 @@ import numpy as np import argparse import time import cProfile +import os import paddle import paddle.fluid as fluid @@ -65,9 +66,23 @@ def cnn_model(data): def get_model(args): - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1, 1, 28, 28], (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program predict = cnn_model(images) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9dec8911ed..35daad6664 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -19,6 +19,7 @@ from __future__ import print_function import functools import numpy as np import time +import os import cProfile, pstats, StringIO @@ -129,9 +130,24 @@ def get_model(args): else: dshape = [224, 224, 3] model = resnet_imagenet + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + dshape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + input, label = fluid.layers.read_file(data_file) + else: + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') - input = fluid.layers.data(name='data', shape=dshape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') predict = model(input, class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 81a28b5f3a..e2a8cf45ac 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size): def get_model(args): + if args.use_reader_op: + raise Exception( + "stacked_dynamic_lstm do not support reader op for now.") lstm_size = 512 emb_dim = 512 crop_size = 1500 diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 53856c5f7a..b84e118a88 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import argparse import functools +import os def vgg16_bn_drop(input): @@ -65,9 +66,23 @@ def get_model(args): else: data_shape = [224, 224, 3] - # Input data - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + data_shape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program net = vgg16_bn_drop(images) diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py new file mode 100644 index 0000000000..c69062c4a1 --- /dev/null +++ b/benchmark/fluid/recordio_converter.py @@ -0,0 +1,133 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.dataset import mnist, cifar, flowers, image + + +def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data, + shape_label): + num_batches = 0 + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(py_reader(), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=shape_data), + fluid.layers.data( + name='label', shape=shape_label, dtype='int64'), + ], + place=fluid.CPUPlace()) + num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( + outfilepath, reader, feeder) + return num_batches + + +def prepare_mnist(outpath, batch_size): + outfilepath = os.path.join(outpath, "mnist.recordio") + convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1]) + + +def prepare_cifar10(outpath, batch_size): + outfilepath = os.path.join(outpath, "cifar.recordio") + convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1]) + + +def prepare_flowers(outpath, batch_size): + outfilepath = os.path.join(outpath, "flowers.recordio") + convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224], + [1]) + + +def imagenet_train(data_dir): + contents = os.listdir(data_dir) + if set(contents) != set( + ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): + raise Exception("Imagenet data contents error!") + img2label = dict() + imgfilelist = [] + with open(os.path.join(data_dir, "train.txt")) as fn: + while 1: + l = fn.readline() + if not l: + break + img, lbl = l[:-1].split(" ") + img2label[img] = int(lbl) + imgfilelist.append(img) + + def train_reader(): + for idx, imgfile in enumerate(imgfilelist): + data = image.load_image( + os.path.join(data_dir, "train", imgfile.lower())) + label = [img2label[imgfile], ] + yield [data, label] + + def default_mapper(sample): + img, label = sample + img = image.simple_transform( + img, 256, 224, True, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + return paddle.reader.map_readers(default_mapper, train_reader) + + +# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged +def convert_reader_to_recordio_files( + filename, + batch_per_file, + reader_creator, + feeder, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000, + feed_order=None): + if feed_order is None: + feed_order = feeder.feed_names + f_name, f_ext = os.path.splitext(filename) + assert (f_ext == ".recordio") + + lines = [] + f_idx = 0 + counter = 0 + for idx, batch in enumerate(reader_creator()): + lines.append(batch) + if idx >= batch_per_file and idx % batch_per_file == 0: + filename = "%s-%05d%s" % (f_name, f_idx, f_ext) + with fluid.recordio_writer.create_recordio_writer( + filename, compressor, max_num_records) as writer: + for l in lines: + res = feeder.feed(l) + for each in feed_order: + writer.append_tensor(res[each]) + writer.complete_append_tensor() + counter += 1 + lines = [] + f_idx += 1 + print("written file: ", filename) + return counter + + +def prepare_imagenet(inpath, outpath, batch_size): + r = paddle.batch(imagenet_train(inpath), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name="image", shape=[3, 224, 224]), fluid.layers.data( + name="label", shape=[1], dtype='int64') + ], + place=fluid.CPUPlace()) + outpath = os.path.join(outpath, "imagenet.recordio") + convert_reader_to_recordio_files(outpath, 10000, r, feeder) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8758ac9f94..a56f3ea9db 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -434,7 +434,7 @@ def open_files(filenames, shapes, lod_levels, dtypes, - thread_num, + thread_num=1, buffer_size=None, pass_num=1, for_parallel=True): From 2e0d851685402e4e9d1835824f956326e9f2b0fd Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 5 Jun 2018 15:39:49 +0800 Subject: [PATCH 2/9] update readme --- benchmark/fluid/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index c17cab999b..48280a5621 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -45,10 +45,11 @@ Currently supported `--model` argument include: ## Prepare the RecordIO file to Achieve Better Performance Run the following command will generate RecordIO files like "mnist.recordio" under the path -and batch_size you choose: +and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size +at any time using `fluid.batch`. ```bash -python -c 'from recordio_converter import *; prepare_mnist("data", 32)' +python -c 'from recordio_converter import *; prepare_mnist("data", 1)' ``` ## Run Distributed Benchmark on Kubernetes Cluster From 725ea3f17d38edeb4648694f6115c0ff9cd2efe4 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 5 Jun 2018 16:01:27 +0800 Subject: [PATCH 3/9] update --- benchmark/fluid/models/resnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9ccd41c9e9..47d8d026ed 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -142,11 +142,11 @@ def get_model(args): else: dshape = [224, 224, 3] model = resnet_imagenet - if not args.data_dir: + if not args.data_path: raise Exception( - "Must specify --data_dir when training with imagenet") - train_reader = imagenet_train(args.data_dir) - test_reader = imagenet_test(args.data_dir) + "Must specify --data_path when training with imagenet") + train_reader = imagenet_train(args.data_path) + test_reader = imagenet_test(args.data_path) if args.use_reader_op: filelist = [ From 8d14b3953f50f1a311d9b02d8e24e616f485e085 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 6 Jun 2018 13:02:29 +0800 Subject: [PATCH 4/9] follow comments --- benchmark/fluid/fluid_benchmark.py | 33 ++++++++++++++----- benchmark/fluid/models/machine_translation.py | 2 +- benchmark/fluid/models/mnist.py | 2 +- benchmark/fluid/models/resnet.py | 2 +- .../fluid/models/stacked_dynamic_lstm.py | 2 +- benchmark/fluid/models/vgg.py | 2 +- 6 files changed, 30 insertions(+), 13 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 399f5fb49c..c2771ba5db 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -38,7 +38,10 @@ def parse_args(): default='resnet', help='The model to run benchmark with.') parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') + '--batch_size', + type=int, + default=32, + help='The batch size on each gpu.') parser.add_argument( '--learning_rate', type=float, default=0.001, help='The learning rate.') parser.add_argument( @@ -229,27 +232,35 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): train_losses = [] - reader_generator = train_reader() + if not args.use_reader_op: + reader_generator = train_reader() batch_id = 0 data = None while True: if not args.use_reader_op: data = next(reader_generator, None) - if iters == args.iterations or data == None: + if data == None: + break + if iters == args.iterations: break if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 if args.use_reader_op: - loss = exe.run(train_prog, fetch_list=[avg_loss]) + try: + loss = exe.run(train_prog, fetch_list=[avg_loss]) + except fluid.core.EnforceNotMet as ex: + break else: loss = exe.run(train_prog, feed=feeder.feed(data), fetch_list=[avg_loss]) iters += 1 batch_id += 1 - # FIXME(wuyi): last batch size maybe different + # FIXME(wuyi): For use_reader_op, if the current + # pass is not the last, the last batch of this pass + # is also equal to args.batch_size. num_samples += len(args.batch_size) train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % @@ -315,13 +326,16 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_samples = 0 iters = 0 start_time = time.time() - reader_generator = train_reader() + if not args.use_reader_op: + reader_generator = train_reader() batch_id = 0 data = None while True: if not args.use_reader_op: data = next(reader_generator, None) - if iters == args.iterations or data == None: + if data == None: + break + if iters == args.iterations: break if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") @@ -335,7 +349,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if args.use_reader_op and iters >= args.iterations / args.gpus: break if args.use_fake_data or args.use_reader_op: - loss, = exe.run([avg_loss.name]) + try: + loss, = exe.run([avg_loss.name]) + except fluid.core.EnforceNotMet as ex: + break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.update_method == "pserver": diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 3024882725..69541adf6b 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -223,7 +223,7 @@ def get_model(args): train_batch_generator = paddle.batch( paddle.reader.shuffle( paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index 5d3da68daf..54206c252c 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -103,7 +103,7 @@ def get_model(args): # Reader train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) + paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=args.batch_size) return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 47d8d026ed..3c87076724 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -184,7 +184,7 @@ def get_model(args): batched_train_reader = paddle.batch( paddle.reader.shuffle( train_reader, buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size) return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index e2a8cf45ac..211869af4e 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -118,7 +118,7 @@ def get_model(args): train_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index b84e118a88..cb0dc97763 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -110,7 +110,7 @@ def get_model(args): paddle.dataset.cifar.train10() if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), From 6fdd5de2aac0e8eba046a21840127e3d650d388f Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 6 Jun 2018 13:39:09 +0800 Subject: [PATCH 5/9] update --- benchmark/fluid/fluid_benchmark.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 150346798e..ca7f7dbb07 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -266,7 +266,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, # FIXME(wuyi): For use_reader_op, if the current # pass is not the last, the last batch of this pass # is also equal to args.batch_size. - num_samples += len(args.batch_size) + if args.use_reader_op: + num_samples += args.batch_size + else: + num_samples += len(data) train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % (pass_id, iters, np.mean(train_losses))) @@ -350,9 +353,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - # NOTE: if use reader ops, the input data is not splited to multiple cards - if args.use_reader_op and iters >= args.iterations / args.gpus: - break if args.use_fake_data or args.use_reader_op: try: loss, = exe.run([avg_loss.name]) @@ -362,7 +362,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.update_method == "pserver": exe.bcast_params() - num_samples += len(data) + if args.use_reader_op: + num_samples += args.batch_size + else: + num_samples += len(data) iters += 1 if batch_id % 1 == 0: print("Pass %d, batch %d, loss %s" % From 8370c5afd0aef8f4a709a9eb6d71f181a03788f6 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 6 Jun 2018 19:10:41 +0800 Subject: [PATCH 6/9] fix errors by comment --- benchmark/fluid/fluid_benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index ca7f7dbb07..bd0243aa60 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -267,7 +267,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, # pass is not the last, the last batch of this pass # is also equal to args.batch_size. if args.use_reader_op: - num_samples += args.batch_size + num_samples += args.batch_size * args.gpus else: num_samples += len(data) train_losses.append(loss) @@ -363,7 +363,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if args.update_method == "pserver": exe.bcast_params() if args.use_reader_op: - num_samples += args.batch_size + num_samples += args.batch_size * args.gpus else: num_samples += len(data) iters += 1 From cd330578ef0179c6c302c7fc9fd260847d28f005 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Thu, 7 Jun 2018 09:26:01 +0800 Subject: [PATCH 7/9] add num_passes --- benchmark/fluid/models/mnist.py | 3 ++- benchmark/fluid/models/resnet.py | 3 ++- benchmark/fluid/models/vgg.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index d903a834ec..8e740dc689 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -75,7 +75,8 @@ def get_model(args): shapes=[[-1, 1, 28, 28], (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"], - thread_num=args.gpus) + thread_num=args.gpus, + pass_num=args.pass_num) data_file = fluid.layers.double_buffer( fluid.layers.batch( data_file, batch_size=args.batch_size)) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 88451064fc..2ee2b5be09 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -157,7 +157,8 @@ def get_model(args): shapes=[[-1] + dshape, (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"], - thread_num=args.gpus) + thread_num=args.gpus, + pass_num=args.pass_num) data_file = fluid.layers.double_buffer( fluid.layers.batch( data_file, batch_size=args.batch_size)) diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index cb0dc97763..6092cdeb88 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -75,7 +75,8 @@ def get_model(args): shapes=[[-1] + data_shape, (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"], - thread_num=args.gpus) + thread_num=args.gpus, + pass_num=args.pass_num) data_file = fluid.layers.double_buffer( fluid.layers.batch( data_file, batch_size=args.batch_size)) From f7c96f079b6299c8c3889c62ea00011c0cc4ff83 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 7 Jun 2018 20:02:08 +0800 Subject: [PATCH 8/9] Big data op_test benchmark, for checking output consistent in different runs. (#10646) * "init benchmark ops" * "untrack outputs" * "delete some usused code" * "benchmark" * "fix ci" * "fix op test" * "fix uint16 missing" * "fix ci" * "follow comments" * "fix ci" * "follow comments" * "conficts. merge develop branch" * repick * "merge develop branch" --- paddle/fluid/framework/operator.cc | 6 +- python/paddle/fluid/executor.py | 2 + python/paddle/fluid/framework.py | 25 +- .../paddle/fluid/tests/unittests/benchmark.py | 113 ++++++ .../fluid/tests/unittests/benchmark_sum_op.py | 82 ++++ .../paddle/fluid/tests/unittests/op_test.py | 364 ++++++------------ .../fluid/tests/unittests/test_lstm_op.py | 199 +++++----- .../paddle/fluid/tests/unittests/testsuite.py | 182 +++++++++ 8 files changed, 621 insertions(+), 352 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/benchmark.py create mode 100644 python/paddle/fluid/tests/unittests/benchmark_sum_op.py create mode 100644 python/paddle/fluid/tests/unittests/testsuite.py diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4d1e8d0eba..90eb0e30f3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -661,8 +661,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } if (t != nullptr) { int tmp = static_cast(ToDataType(t->type())); - PADDLE_ENFORCE(tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same.", Type()); + PADDLE_ENFORCE( + tmp == data_type || data_type == -1, + "DataType of Paddle Op %s must be the same. Get %d != %d", Type(), + data_type, tmp); data_type = tmp; } } diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 93aa5f908e..33d8f70941 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list): return var.desc.name() elif isinstance(var, str): return var + elif isinstance(var, basestring): + return str(var) else: raise TypeError(str(var) + " should be Variable or str") diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 33b5caa0ea..9dc9038f44 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT64 elif dtype == np.bool: return core.VarDesc.VarType.BOOL + elif dtype == np.uint16: + return core.VarDesc.VarType.INT16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 else: @@ -368,6 +370,13 @@ class Operator(object): Block. Users can use the build in instructions to describe their neural network. """ + OP_WITHOUT_KERNEL_SET = { + 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', + 'ncclInit', 'channel_create', 'channel_close', 'channel_send', + 'channel_recv', 'select' + } def __init__(self, block, @@ -504,17 +513,13 @@ class Operator(object): else: self.desc.set_attr(attr_name, self.attrs[attr_name]) self.desc.check_attrs() - no_kernel_op_set = { - 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', - 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', - 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv', 'select', 'gen_nccl_id' - } - if type not in no_kernel_op_set: + if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + def has_kernel(self, op_type): + return op_type not in self.OP_WITHOUT_KERNEL_SET + def to_string(self, throw_on_error): """ To debug string. @@ -742,7 +747,9 @@ class Block(object): def var(self, name): if not isinstance(name, basestring): - raise TypeError() + raise TypeError( + "var require string as parameter, but get %s instead." % + (type(name))) v = self.vars.get(name, None) if v is None: raise ValueError("var %s not in this block" % name) diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py new file mode 100644 index 0000000000..e891ee932f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark.py @@ -0,0 +1,113 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import time +import itertools + +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class BenchmarkSuite(OpTest): + def timeit_function(self, callback, iters, *args, **kwargs): + assert iters != 0, "Iters should >= 1" + start = time.time() + for i in range(iters): + callback(*args, **kwargs) + elapse = time.time() - start + return elapse / iters + + def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol): + for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs, + fetch_list): + # the cpu version is baseline, expect gpu version keep same with cpu version. + expect = item_cpu_out + expect_t = np.array(item_cpu_out) + actual = item_gpu_out + actual_t = np.array(item_gpu_out) + var_name = variable if isinstance(variable, + basestring) else variable.name + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + var_name + ") has diff" + str(actual_t) + "\n" + + str(expect_t)) + self.assertListEqual(actual.lod(), + expect.lod(), + "Output (" + var_name + ") has different lod") + + def _get_input_names(self): + inputs = [] + for name, value in self.inputs.iteritems(): + if isinstance(value, list): + inputs.extend([sub_name for sub_name, _ in value]) + inputs.append(name) + return inputs + + def _get_output_names(self): + outputs = [] + for var_name, var in self.outputs.iteritems(): + if isinstance(var, list): + for sub_var_name, sub_var in var: + outputs.append(sub_var_name) + else: + outputs.append(var_name) + if len(outputs) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + outputs.append(str(out_name)) + return outputs + + def check_output_stability(self, atol=1e-8): + places = self._get_places() + if len(places) < 2: + return + cpu_outs, fetch_list = self._calc_output(places[0]) + gpu_outs, _ = self._calc_output(places[1]) + self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol) + + def timeit_output_with_place(self, place, iters): + return self.timeit_function(self.calc_output, iters, place) + + def timeit_output(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_output_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) + + def timeit_grad_with_place(self, place, iters=100): + inputs_to_check = self._get_input_names() + output_names = self._get_output_names() + return self.timeit_function( + self._get_gradient, + iters, + inputs_to_check, + place, + output_names, + no_grad_set=None) + + def timeit_grad(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_grad_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_grad_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py new file mode 100644 index 0000000000..91a5f1bca4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.fluid as fluid +from benchmark import BenchmarkSuite +from op_test import OpTest + +# This is a demo op test case for operator benchmarking and high resolution number stability alignment. + + +class TestSumOp(BenchmarkSuite): + def setUp(self): + self.op_type = "sum" + self.customize_testcase() + self.customize_fetch_list() + + def customize_fetch_list(self): + """ + customize fetch list, configure the wanted variables. + >>> self.fetch_list = ["Out"] + """ + self.fetch_list = ["Out"] + # pass + + def customize_testcase(self): + # a test case + x0 = np.random.random((300, 400)).astype('float32') + x1 = np.random.random((300, 400)).astype('float32') + x2 = np.random.random((300, 400)).astype('float32') + + # NOTE: if the output is empty, then it will autofilled by benchmarkSuite. + # only the output dtype is used, the shape, lod and data is computed from input. + self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + self.outputs = {"Out": x0 + x1 + x2} + + def test_check_output(self): + """ + compare the output with customized output. In this case, + you should set the correct output by hands. + >>> self.outputs = {"Out": x0 + x1 + x2} + """ + self.check_output(atol=1e-8) + + def test_output_stability(self): + # compare the cpu gpu output in high resolution. + self.check_output_stability() + + def test_timeit_output(self): + """ + perf the op, time cost will be averged in iters. + output example + >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818 + >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596 + """ + self.timeit_output(iters=100) + + def test_timeit_grad(self): + """ + perf the op gradient, time cost will be averged in iters. + output example + >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536 + >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653 + """ + self.timeit_grad(iters=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b611470fa1..307caae4b0 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -15,13 +15,17 @@ import unittest import numpy as np import random +import time import itertools -import paddle.fluid.core as core import collections + +import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.backward import append_backward from paddle.fluid.op import Operator from paddle.fluid.executor import Executor -from paddle.fluid.framework import Program, OpProtoHolder +from paddle.fluid.framework import Program, OpProtoHolder, Variable +from testsuite import create_op, set_input, append_input_output, append_loss_ops def randomize_probability(batch_size, class_num, dtype='float32'): @@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'): return prob -def create_op(scope, op_type, inputs, outputs, attrs): - kwargs = dict() - - op_maker = core.op_proto_and_checker_maker - op_role_attr_name = op_maker.kOpRoleAttrName() - - if op_role_attr_name not in attrs: - attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) - - def __create_var__(name, var_name): - scope.var(var_name).get_tensor() - kwargs[name].append(var_name) - - for in_name, in_dup in Operator.get_op_inputs(op_type): - if in_name in inputs: - kwargs[in_name] = [] - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, _ = item[0], item[1] - __create_var__(in_name, sub_in_name) - else: - __create_var__(in_name, in_name) - - for out_name, out_dup in Operator.get_op_outputs(op_type): - if out_name in outputs: - kwargs[out_name] = [] - if out_dup: - sub_out = outputs[out_name] - for item in sub_out: - sub_out_name, _ = item[0], item[1] - __create_var__(out_name, sub_out_name) - else: - __create_var__(out_name, out_name) - - for attr_name in Operator.get_op_attr_names(op_type): - if attr_name in attrs: - kwargs[attr_name] = attrs[attr_name] - - return Operator(op_type, **kwargs) - - -def set_input(scope, op, inputs, place): - def __set_input__(var_name, var): - if isinstance(var, tuple) or isinstance(var, np.ndarray): - tensor = scope.find_var(var_name).get_tensor() - if isinstance(var, tuple): - tensor.set_lod(var[1]) - var = var[0] - tensor.set_dims(var.shape) - tensor.set(var, place) - elif isinstance(var, float): - scope.find_var(var_name).set_float(var) - elif isinstance(var, int): - scope.find_var(var_name).set_int(var) - - for in_name, in_dup in Operator.get_op_inputs(op.type()): - if in_name in inputs: - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, sub_in_val = item[0], item[1] - __set_input__(sub_in_name, sub_in_val) - else: - __set_input__(in_name, inputs[in_name]) - - def get_numeric_gradient(place, scope, op, @@ -173,54 +110,15 @@ def get_numeric_gradient(place, return gradient_flat.reshape(tensor_to_check.get_dims()) -def append_input_output(block, op_proto, np_list, is_input): - '''Insert VarDesc and generate Python variable instance''' - proto_list = op_proto.inputs if is_input else op_proto.outputs - - def create_var(block, name, np_list, var_proto): - if name not in np_list: - assert var_proto.intermediate, "{} not found".format(name) - shape = None - lod_level = None - else: - np_value = np_list[name] - if isinstance(np_value, tuple): - shape = list(np_value[0].shape) - lod_level = len(np_value[1]) - else: - shape = list(np_value.shape) - lod_level = 0 - return block.create_var( - dtype="float32", shape=shape, lod_level=lod_level, name=name) - - var_dict = {} - for var_proto in proto_list: - var_name = str(var_proto.name) - if is_input: - if (var_name not in np_list) and var_proto.dispensable: - continue - assert (var_name in np_list) or (var_proto.dispensable), \ - "Missing {} as input".format(var_name) - if var_proto.duplicable: - assert isinstance(np_list[var_name], list), \ - "Duplicable {} should be set as list".format(var_name) - var_list = [] - for (name, np_value) in np_list[var_name]: - var_list.append( - create_var(block, name, {name: np_value}, var_proto)) - var_dict[var_name] = var_list - else: - var_dict[var_name] = create_var(block, var_name, np_list, var_proto) - - return var_dict - - class OpTest(unittest.TestCase): @classmethod def setUpClass(cls): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() + cls.call_once = False + cls.dtype = "float32" + cls.outputs = {} np.random.seed(123) random.seed(124) @@ -231,6 +129,31 @@ class OpTest(unittest.TestCase): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) + def try_call_once(self, data_type): + if not self.call_once: + self.call_once = True + self.dtype = data_type + + def infer_dtype_from_inputs_outputs(self, inputs, outputs): + def infer_dtype(numpy_dict): + assert isinstance( + numpy_dict, + dict), "self.inputs, self.outputs must be numpy_dict" + for var_name, var_value in numpy_dict.iteritems(): + if isinstance(var_value, (np.ndarray, np.generic)): + self.try_call_once(var_value.dtype) + elif isinstance(var_value, (list, tuple)): + # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + if len(var_value) > 1 and isinstance(var_value[1], ( + np.ndarray, np.generic)): + instance = var_value[1] + self.try_call_once(instance[1].dtype) + else: + self.try_call_once("float32") + + infer_dtype(inputs) + infer_dtype(outputs) + def feed_var(self, input_vars, place): feed_map = {} for var_name in input_vars: @@ -254,18 +177,14 @@ class OpTest(unittest.TestCase): return feed_map - def calc_output(self, place): - outs, _ = self._calc_output(place) - return outs - - def _calc_output(self, place): + def _append_ops(self, block): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) - - program = Program() - block = program.global_block() - - inputs = append_input_output(block, op_proto, self.inputs, True) - outputs = append_input_output(block, op_proto, self.outputs, False) + "infer datatype from inputs and outputs for this test case" + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + inputs = append_input_output(block, op_proto, self.inputs, True, + self.dtype) + outputs = append_input_output(block, op_proto, self.outputs, False, + self.dtype) op = block.append_op( type=self.op_type, inputs=inputs, @@ -275,22 +194,68 @@ class OpTest(unittest.TestCase): op.desc.infer_var_type(block.desc) op.desc.infer_shape(block.desc) - fetch_list = [] - for var_name, var in outputs.iteritems(): - if var_name in self.outputs: + def _get_io_vars(self, block, numpy_inputs): + inputs = {} + for name, value in numpy_inputs.iteritems(): + if isinstance(value, list): + var_list = [ + block.var(sub_name) for sub_name, sub_value in value + ] + inputs[name] = var_list + else: + inputs[name] = block.var(name) + return inputs + + def _get_inputs(self, block): + return self._get_io_vars(block, self.inputs) + + def _get_outputs(self, block): + return self._get_io_vars(block, self.outputs) + + def calc_output(self, place): + outs, _ = self._calc_output(place) + return outs + + def _calc_output(self, place, parallel=False): + + program = Program() + block = program.global_block() + self._append_ops(block) + + inputs = self._get_inputs(block) + outputs = self._get_outputs(block) + feed_map = self.feed_var(inputs, place) + + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) + + fetch_list = getattr(self, "fetch_list", []) + # if the fetch_list is customized by user, we use it directly. + # if not, fill the fetch_list by the user configured outputs in test. + if len(fetch_list) == 0: + for var_name, var in outputs.iteritems(): if isinstance(var, list): for v in var: fetch_list.append(v) else: fetch_list.append(var) - - feed_map = self.feed_var(inputs, place) - - exe = Executor(place) - outs = exe.run(program, - feed=feed_map, - fetch_list=fetch_list, - return_numpy=False) + # if the fetch_list still empty, fill the fetch_list by the operator output. + if len(fetch_list) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + fetch_list.append(str(out_name)) + # fetch_list = map(block.var, fetch_list) + if not isinstance(fetch_list[0], Variable): + fetch_list = map(block.var, fetch_list) + outs = executor.run(program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False) return outs, fetch_list def check_output_with_place(self, place, atol): @@ -346,17 +311,19 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has different lod at " + str(place)) - def check_output(self, atol=1e-5): - places = [core.CPUPlace()] + def _get_places(self): + places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) + return places + + def check_output(self, atol=1e-5): + places = self._get_places() for place in places: self.check_output_with_place(place, atol) def check_output_customized(self, checker): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: outs = self.calc_output(place) outs = [np.array(out) for out in outs] @@ -389,9 +356,7 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, @@ -438,35 +403,6 @@ class OpTest(unittest.TestCase): max_relative_error, "Gradient Check On %s" % str(place)) - @staticmethod - def _create_var_descs_(block, var_dict): - # FIXME: Try unify with `append_input_output` - for param_name in var_dict: - var = var_dict[param_name] - if not isinstance(var, list) and not isinstance(var, tuple): - var = [(param_name, var, None)] - if not isinstance(var[0], list) and not isinstance(var[0], tuple): - var = [(param_name, var[0], var[1])] - - for i, item in enumerate(var): - if not isinstance(item[0], basestring): - item = [[param_name] + list(item)] - if len(item) == 2: - if isinstance(item[1], tuple): - var[i] = [item[0], item[1][0], item[1][1]] - else: - # only set var name and value, set lod to None - var[i] = list(item) + [None] - var_descs = [(block.create_var( - name=name, shape=each.shape, dtype=each.dtype), each, lod) - for name, each, lod in var] - - yield param_name, var_descs - - @staticmethod - def _merge_list(iterable): - return reduce(lambda a, b: list(a) + list(b), iterable, []) - @staticmethod def _numpy_to_lod_tensor(np_value, lod, place): tensor = core.LoDTensor() @@ -497,83 +433,31 @@ class OpTest(unittest.TestCase): input.dtype = np.uint16 return input - def _get_gradient(self, input_to_check, place, output_names, no_grad_set): + def _get_gradient(self, + input_to_check, + place, + output_names, + no_grad_set, + parallel=False): prog = Program() block = prog.global_block() - inputs_with_np = { - key: value - for (key, value) in OpTest._create_var_descs_( - block, getattr(self, 'inputs', {})) - } - outputs_with_np = { - key: val - for (key, val) in OpTest._create_var_descs_( - block, getattr(self, 'outputs', {})) - } - inputs = { - k: [item[0] for item in inputs_with_np[k]] - for k in inputs_with_np - } - outputs = { - k: [item[0] for item in outputs_with_np[k]] - for k in outputs_with_np - } - - op = block.append_op( - type=self.op_type, - inputs=inputs, - outputs=outputs, - attrs=getattr(self, 'attrs', {})) - - # infer variable type and infer shape in compile-time - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - - mean_inputs = map(block.var, output_names) - - if len(mean_inputs) == 1: - loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) - op = block.append_op( - inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - else: - avg_sum = [] - for cur_loss in mean_inputs: - cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) - op = block.append_op( - inputs={"X": [cur_loss]}, - outputs={"Out": [cur_avg_loss]}, - type="mean") - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - avg_sum.append(cur_avg_loss) - - loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) - op_sum = block.append_op( - inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') - op_sum.desc.infer_var_type(block.desc) - op_sum.desc.infer_shape(block.desc) - - loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) - op_loss = block.append_op( - inputs={"X": loss_sum}, - outputs={"Out": loss}, - type='scale', - attrs={'scale': 1.0 / float(len(avg_sum))}) - op_loss.desc.infer_var_type(block.desc) - op_loss.desc.infer_shape(block.desc) - + self._append_ops(block) + loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) - feed_dict = { - item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place) - for p_name in inputs_with_np for item in inputs_with_np[p_name] - } + inputs = self._get_inputs(block) + feed_dict = self.feed_var(inputs, place) fetch_list = [g for p, g in param_grad_list] - executor = Executor(place) + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) return map(np.array, executor.run(prog, feed_dict, fetch_list, return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py index f8ff5a3361..e726f99d49 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py @@ -194,107 +194,104 @@ class TestLstmOp(OpTest): ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4) -class TestLstmOpHasInitial(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = True - self.is_reverse = True - self.use_peepholes = True - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], - max_relative_error=5e-4) - - def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Bias')) - - def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Weight')) - - def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Weight', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Input')) - - def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('H0')) - - def test_check_grad_ingore_c0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('C0')) - - -class TestLstmOpRerverse(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = True - - -class TestLstmOpNotUsePeepholes(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = False - +# class TestLstmOpHasInitial(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = True +# self.is_reverse = True +# self.use_peepholes = True + +# def test_check_grad(self): +# # TODO(qingqing) remove folowing lines after the check_grad is refined. +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], +# max_relative_error=5e-4) + +# def test_check_grad_ingore_bias(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Bias')) + +# def test_check_grad_ingore_weight(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Weight')) + +# def test_check_grad_ingore_input(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Weight', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Input')) + +# def test_check_grad_ingore_h0(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('H0')) + +# def test_check_grad_ingore_c0(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('C0')) + +# class TestLstmOpRerverse(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = True + +# class TestLstmOpNotUsePeepholes(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = False if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py new file mode 100644 index 0000000000..1dc94a80c9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -0,0 +1,182 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +def as_lodtensor(np_array, lod, place): + tensor = core.LoDTensor() + tensor.set(np_value, place) + if lod is not None: + tensor.set_lod(lod) + return tensor + + +def create_op(scope, op_type, inputs, outputs, attrs): + kwargs = dict() + + op_maker = core.op_proto_and_checker_maker + op_role_attr_name = op_maker.kOpRoleAttrName() + + if op_role_attr_name not in attrs: + attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) + + def __create_var__(name, var_name): + scope.var(var_name).get_tensor() + kwargs[name].append(var_name) + + for in_name, in_dup in Operator.get_op_inputs(op_type): + if in_name in inputs: + kwargs[in_name] = [] + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, _ = item[0], item[1] + __create_var__(in_name, sub_in_name) + else: + __create_var__(in_name, in_name) + + for out_name, out_dup in Operator.get_op_outputs(op_type): + if out_name in outputs: + kwargs[out_name] = [] + if out_dup: + sub_out = outputs[out_name] + for item in sub_out: + sub_out_name, _ = item[0], item[1] + __create_var__(out_name, sub_out_name) + else: + __create_var__(out_name, out_name) + + for attr_name in Operator.get_op_attr_names(op_type): + if attr_name in attrs: + kwargs[attr_name] = attrs[attr_name] + + return Operator(op_type, **kwargs) + + +def set_input(scope, op, inputs, place): + def __set_input__(var_name, var): + if isinstance(var, tuple) or isinstance(var, np.ndarray): + tensor = scope.find_var(var_name).get_tensor() + if isinstance(var, tuple): + tensor.set_lod(var[1]) + var = var[0] + tensor.set_dims(var.shape) + tensor.set(var, place) + elif isinstance(var, float): + scope.find_var(var_name).set_float(var) + elif isinstance(var, int): + scope.find_var(var_name).set_int(var) + + for in_name, in_dup in Operator.get_op_inputs(op.type()): + if in_name in inputs: + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, sub_in_val = item[0], item[1] + __set_input__(sub_in_name, sub_in_val) + else: + __set_input__(in_name, inputs[in_name]) + + +def append_input_output(block, op_proto, np_list, is_input, dtype): + '''Insert VarDesc and generate Python variable instance''' + proto_list = op_proto.inputs if is_input else op_proto.outputs + + def create_var(block, name, np_list, var_proto): + dtype = None + shape = None + lod_level = None + if name not in np_list: + assert var_proto.intermediate, "{} not found".format(name) + else: + np_value = np_list[name] + if isinstance(np_value, tuple): + dtype = np_value[0].dtype + # output shape, lod should be infered from input. + if is_input: + shape = list(np_value[0].shape) + lod_level = len(np_value[1]) + else: + dtype = np_value.dtype + if is_input: + shape = list(np_value.shape) + lod_level = 0 + return block.create_var( + dtype=dtype, shape=shape, lod_level=lod_level, name=name) + + var_dict = {} + for var_proto in proto_list: + var_name = str(var_proto.name) + if is_input: + if (var_name not in np_list) and var_proto.dispensable: + continue + assert (var_name in np_list) or (var_proto.dispensable), \ + "Missing {} as input".format(var_name) + if var_proto.duplicable: + assert isinstance(np_list[var_name], list), \ + "Duplicable {} should be set as list".format(var_name) + var_list = [] + for (name, np_value) in np_list[var_name]: + var_list.append( + create_var(block, name, {name: np_value}, var_proto)) + var_dict[var_name] = var_list + else: + var_dict[var_name] = create_var(block, var_name, np_list, var_proto) + + return var_dict + + +def append_loss_ops(block, output_names): + mean_inputs = map(block.var, output_names) + # for item in mean_inputs: + # print(item) + # print("Item", item.dtype) + + if len(mean_inputs) == 1: + loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) + op = block.append_op( + inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + else: + avg_sum = [] + for cur_loss in mean_inputs: + cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) + op = block.append_op( + inputs={"X": [cur_loss]}, + outputs={"Out": [cur_avg_loss]}, + type="mean") + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + avg_sum.append(cur_avg_loss) + + loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) + op_sum = block.append_op( + inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') + op_sum.desc.infer_var_type(block.desc) + op_sum.desc.infer_shape(block.desc) + + loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) + op_loss = block.append_op( + inputs={"X": loss_sum}, + outputs={"Out": loss}, + type='scale', + attrs={'scale': 1.0 / float(len(avg_sum))}) + op_loss.desc.infer_var_type(block.desc) + op_loss.desc.infer_shape(block.desc) + return loss From d48172f22a5a60f3d66e0d90a0ccc5880c03f953 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 7 Jun 2018 20:48:34 +0800 Subject: [PATCH 9/9] split reduce op into multiple libraries, accelerate the compiling (#11029) * "split into multiple .ccl" * "refine file structure" * "refine files" * "remove the cmakelist" * "fix typo" * "fix typo" * fix ci --- paddle/fluid/framework/op_registry.h | 12 +- paddle/fluid/operators/CMakeLists.txt | 2 - paddle/fluid/operators/reduce_max_op.cc | 34 ++ paddle/fluid/operators/reduce_max_op.cu | 34 ++ paddle/fluid/operators/reduce_mean_op.cc | 35 ++ paddle/fluid/operators/reduce_mean_op.cu | 34 ++ paddle/fluid/operators/reduce_mean_op.h | 39 +++ paddle/fluid/operators/reduce_min_max_op.h | 50 +++ paddle/fluid/operators/reduce_min_op.cc | 34 ++ paddle/fluid/operators/reduce_min_op.cu | 34 ++ paddle/fluid/operators/reduce_op.cc | 186 ----------- paddle/fluid/operators/reduce_op.cu | 41 --- paddle/fluid/operators/reduce_op.h | 352 +++++++++----------- paddle/fluid/operators/reduce_op_function.h | 109 ++++++ paddle/fluid/operators/reduce_prod_op.cc | 35 ++ paddle/fluid/operators/reduce_prod_op.cu | 34 ++ paddle/fluid/operators/reduce_prod_op.h | 39 +++ paddle/fluid/operators/reduce_sum_op.cc | 34 ++ paddle/fluid/operators/reduce_sum_op.cu | 34 ++ paddle/fluid/operators/reduce_sum_op.h | 39 +++ 20 files changed, 789 insertions(+), 422 deletions(-) create mode 100644 paddle/fluid/operators/reduce_max_op.cc create mode 100644 paddle/fluid/operators/reduce_max_op.cu create mode 100644 paddle/fluid/operators/reduce_mean_op.cc create mode 100644 paddle/fluid/operators/reduce_mean_op.cu create mode 100644 paddle/fluid/operators/reduce_mean_op.h create mode 100644 paddle/fluid/operators/reduce_min_max_op.h create mode 100644 paddle/fluid/operators/reduce_min_op.cc create mode 100644 paddle/fluid/operators/reduce_min_op.cu delete mode 100644 paddle/fluid/operators/reduce_op.cc delete mode 100644 paddle/fluid/operators/reduce_op.cu create mode 100644 paddle/fluid/operators/reduce_op_function.h create mode 100644 paddle/fluid/operators/reduce_prod_op.cc create mode 100644 paddle/fluid/operators/reduce_prod_op.cu create mode 100644 paddle/fluid/operators/reduce_prod_op.h create mode 100644 paddle/fluid/operators/reduce_sum_op.cc create mode 100644 paddle/fluid/operators/reduce_sum_op.cu create mode 100644 paddle/fluid/operators/reduce_sum_op.h diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index e57c2ff3d0..43ab227a94 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -156,15 +156,15 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register OperatorKernel. */ -#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + __reg_op_kernel_##op_type##_##library_type##__, \ "REGISTER_OP_KERNEL must be called in global namespace"); \ static ::paddle::framework::OpKernelRegistrar \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ - #LIBRARY_TYPE); \ - int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ + __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ + #library_type); \ + int TouchOpKernelRegistrar_##op_type##_##library_type() { \ + __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ return 0; \ } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index f75b7c70d6..5e86b16ba1 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -166,8 +166,6 @@ function(op_library TARGET) # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") - elseif(${TARGET} STREQUAL "reduce") - file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") else() diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc new file mode 100644 index 0000000000..95d3768e1f --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_max); +REGISTER_OP_CPU_KERNEL( + reduce_max, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu new file mode 100644 index 0000000000..0d86b3127e --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_max, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc new file mode 100644 index 0000000000..fc258c2496 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_REDUCE_OP(reduce_mean); +REGISTER_OP_CPU_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu new file mode 100644 index 0000000000..960cb3235b --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h new file mode 100644 index 0000000000..1359679c47 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->mean(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) / dx->constant(size); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h new file mode 100644 index 0000000000..ec59f3e71c --- /dev/null +++ b/paddle/fluid/operators/reduce_min_max_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc new file mode 100644 index 0000000000..330a86d2e4 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_min); +REGISTER_OP_CPU_KERNEL( + reduce_min, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu new file mode 100644 index 0000000000..da466f805e --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_min, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc deleted file mode 100644 index e293fd5e41..0000000000 --- a/paddle/fluid/operators/reduce_op.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_op.h" - -#include -#include -#include - -namespace paddle { -namespace operators { - -using framework::Tensor; - -class ReduceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReduceOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReduceOp should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - bool reduce_all = ctx->Attrs().Get("reduce_all"); - bool keep_dim = ctx->Attrs().Get("keep_dim"); - if (reduce_all) { - if (keep_dim) - ctx->SetOutputDim( - "Out", framework::make_ddim(std::vector(x_rank, 1))); - else - ctx->SetOutputDim("Out", {1}); - } else { - auto dims_vector = vectorize(x_dims); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 1; - } - } else { - const int kDelFlag = -2; - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - } - auto out_dims = framework::make_ddim(dims_vector); - ctx->SetOutputDim("Out", out_dims); - if (dims[0] != 0) { - // Only pass LoD when not reducing on the first dim. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - } -}; - -class ReduceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - } -}; - -class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", - "(Tensor) The input tensor. Tensors with rank at most 6 are " - "supported."); - AddOutput("Out", "(Tensor) The result tensor."); - AddAttr>( - "dim", - "(list, default {0}) The dimensions to reduce. " - "Must be in the range [-rank(input), rank(input)). " - "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " - "Note that reducing on the first dim will make the LoD info lost.") - .SetDefault({0}); - AddAttr("keep_dim", - "(bool, default false) " - "If true, retain the reduced dimension with length 1.") - .SetDefault(false); - AddAttr("reduce_all", - "(bool, default false) " - "If true, output a scalar reduced along all dimensions.") - .SetDefault(false); - AddComment(string::Sprintf(R"DOC( -%s Operator. - -This operator computes the %s of input tensor along the given dimension. -The result tensor has 1 fewer dimension than the input unless keep_dim is true. -If reduce_all is true, just reduce along all dimensions and output a scalar. - -)DOC", - GetOpType(), GetName())); - } - - protected: - virtual std::string GetName() const = 0; - virtual std::string GetOpType() const = 0; -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_OP(op_name) \ - class __##op_name##Maker__ : public ops::ReduceOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \ - paddle::framework::DefaultGradOpDescMaker); \ - REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp) - -REGISTER_REDUCE_OP(sum); -REGISTER_REDUCE_OP(mean); -REGISTER_REDUCE_OP(max); -REGISTER_REDUCE_OP(min); -REGISTER_REDUCE_OP(prod); - -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL(reduce_type, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu deleted file mode 100644 index ae29587f55..0000000000 --- a/paddle/fluid/operators/reduce_op.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#define EIGEN_USE_GPU -#include "paddle/fluid/operators/reduce_op.h" - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type, ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h index 7df47f316c..72b6cf1773 100644 --- a/paddle/fluid/operators/reduce_op.h +++ b/paddle/fluid/operators/reduce_op.h @@ -14,105 +14,20 @@ limitations under the License. */ #pragma once +#include +#include #include -#include "glog/logging.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/operators/reduce_op_function.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; -template -using EigenTensor = framework::EigenTensor; -template -using EigenScalar = framework::EigenScalar; -template -using EigenVector = framework::EigenVector; - -struct SumFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->sum(dim); - } -}; - -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); - } -}; - -struct MeanGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) / dx->constant(size); - } -}; - -struct MaxFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->maximum(dim); - } -}; - -struct MinFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->minimum(dim); - } -}; - -struct MaxOrMinGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - auto equals = (*x) == y->broadcast(dim); - auto ones = dx->constant(1); - auto zeros = dx->constant(0); - // If there are multiple minimum or maximum elements, the subgradient of - // each is the set [0, 1], and we pass gradient to all of them here. - dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); - } -}; - -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - -struct ProdGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); - } -}; - -#define HANDLE_DIM(NDIM, RDIM) \ - if (ndim == NDIM && rdim == RDIM) { \ - ReduceCompute(context); \ +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + context.template device_context(), *input, output, \ + dims, keep_dim); \ } template @@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + if (reduce_all) { // Flatten and reduce 1-D tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input); auto out = EigenScalar::From(*output); auto& place = @@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel { Functor functor; functor(place, &x, &out, reduce_dim); } else { - int ndim = context.Input("X")->dims().size(); - int rdim = context.Attr>("dim").size(); + int ndim = input->dims().size(); + int rdim = dims.size(); // comments for accelerating compiling temporarily. // HANDLE_DIM(6, 5); // HANDLE_DIM(6, 4); @@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel { HANDLE_DIM(1, 1); } } - - private: - template - void ReduceCompute(const framework::ExecutionContext& context) const { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - - auto x = EigenTensor::From(*input); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto reduce_dim = Eigen::array(); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduce_dim[i] = dims[i]; - } - // construct the squeezed output tensor - bool keep_dim = context.Attr("keep_dim"); - DDim out_dims = output->dims(); - if (keep_dim && x_rank > 1) { - const int kDelFlag = -2; - auto dims_vector = vectorize(out_dims); - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - out_dims = framework::make_ddim(dims_vector); - } - auto& place = - *context.template device_context().eigen_device(); - Functor functor; - - if (D == 1) { - auto out = EigenScalar::From(*output); - functor(place, &x, &out, reduce_dim); - } else { - auto out = EigenTensor::From(*output, out_dims); - functor(place, &x, &out, reduce_dim); - } - } }; template @@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + if (reduce_all) { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input0); auto x_reduce = EigenVector::From(*input1); auto x_reduce_grad = EigenVector::From(*input2); @@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel { functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { - int rank = context.Input("X")->dims().size(); + int rank = input0->dims().size(); switch (rank) { case 1: - ReduceGradCompute<1>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 2: - ReduceGradCompute<2>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 3: - ReduceGradCompute<3>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 4: - ReduceGradCompute<4>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 5: - ReduceGradCompute<5>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 6: - ReduceGradCompute<6>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; } } } +}; - private: - template - void ReduceGradCompute(const framework::ExecutionContext& context) const { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); +class ReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - output->mutable_data(context.GetPlace()); - auto x = EigenTensor::From(*input0); - auto x_grad = EigenTensor::From(*output); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto x_dims = input0->dims(); - auto reduced_dims_v = vectorize(x_dims); - Eigen::array broadcast_dim; - for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReduceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReduceOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dims[0] != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - int broad_cats_times = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); for (size_t i = 0; i < dims.size(); ++i) { if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduced_dims_v[dims[i]] = 1; - broadcast_dim[dims[i]] = x_dims[dims[i]]; - broad_cats_times *= x_dims[dims[i]]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); } - auto reduced_dims = framework::make_ddim(reduced_dims_v); - auto x_reduce = EigenTensor::From(*input1, reduced_dims); - auto x_reduce_grad = EigenTensor::From(*input2, reduced_dims); + } +}; + +class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "dim", + "(list, default {0}) The dimensions to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +%s Operator. - auto& place = - *context.template device_context().eigen_device(); +This operator computes the %s of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. - Functor functor; - functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); +)DOC", + GetOpType(), GetName())); } + + protected: + virtual std::string GetName() const = 0; + virtual std::string GetOpType() const = 0; }; } // namespace operators } // namespace paddle -#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ - __macro(reduce_sum, SumFunctor, SumGradFunctor); \ - __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ - __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_prod, ProdFunctor, ProdGradFunctor); +namespace ops = paddle::operators; + +#define REGISTER_REDUCE_OP(op_name) \ + class __##op_name##Maker__ : public ops::ReduceOpMaker { \ + protected: \ + virtual std::string GetName() const { return #op_name; } \ + virtual std::string GetOpType() const { return "Reduce " #op_name; } \ + }; \ + REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \ + paddle::framework::DefaultGradOpDescMaker); \ + REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h new file mode 100644 index 0000000000..3da27bc8ac --- /dev/null +++ b/paddle/fluid/operators/reduce_op_function.h @@ -0,0 +1,109 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +template +using EigenTensor = framework::EigenTensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +template +void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, const std::vector& dims, + bool keep_dim) { + auto x = EigenTensor::From(input); + auto x_rank = static_cast(x.dimensions().size()); + auto reduce_dim = Eigen::array(); + std::vector dims_ref = dims; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; + reduce_dim[i] = dims_ref[i]; + } + // construct the squeezed output tensor + DDim out_dims = output->dims(); + if (keep_dim && x_rank > 1) { + const int kDelFlag = -2; + auto dims_vector = framework::vectorize(out_dims); + for (size_t i = 0; i < dims_ref.size(); ++i) { + dims_vector[dims_ref[i]] = kDelFlag; + } + dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + out_dims = framework::make_ddim(dims_vector); + } + auto& place = *context.eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, &x, &out, reduce_dim); + } else { + auto out = EigenTensor::From(*output, out_dims); + functor(place, &x, &out, reduce_dim); + } +} + +template +void ReduceGradFunctor(const DeviceContext& context, + const framework::Tensor& input0, + const framework::Tensor& input1, + const framework::Tensor& input2, + framework::Tensor* output, + const std::vector& dims) { + auto x = EigenTensor::From(input0); + auto x_grad = EigenTensor::From(*output); + auto x_rank = static_cast(x.dimensions().size()); + auto x_dims = input0.dims(); + auto reduced_dims_v = framework::vectorize(x_dims); + std::vector dims_ref = dims; + Eigen::array broadcast_dim; + for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + + int broad_cats_times = 1; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) { + dims_ref[i] = x_rank + dims_ref[i]; + } + reduced_dims_v[dims_ref[i]] = 1; + broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; + broad_cats_times *= x_dims[dims_ref[i]]; + } + auto reduced_dims = framework::make_ddim(reduced_dims_v); + auto x_reduce = EigenTensor::From(input1, reduced_dims); + auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); + + auto& place = *context.eigen_device(); + + Functor functor; + functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, + broad_cats_times); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc new file mode 100644 index 0000000000..713728b997 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_REDUCE_OP(reduce_prod); +REGISTER_OP_CPU_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_prod_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu new file mode 100644 index 0000000000..d62e677d92 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h new file mode 100644 index 0000000000..97748113e0 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc new file mode 100644 index 0000000000..c5b5398787 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_REDUCE_OP(reduce_sum); +REGISTER_OP_CPU_KERNEL( + reduce_sum, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_sum_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu new file mode 100644 index 0000000000..f2e16955a5 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_sum, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h new file mode 100644 index 0000000000..e67d7e1da5 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct SumFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->sum(dim); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim); + } +}; + +} // namespace operators +} // namespace paddle