Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update_crop_op

7 years ago · 24649a780d
parent 9c61409a18 d48172f22a
commit 24649a780d
38 changed files with 1764 additions and 829 deletions
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@ -19,4 +19,4 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py dataset.py models/ /workspace/
+ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@ -44,6 +44,16 @@ Currently supported `--model` argument include:
    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
    ```
 ## Prepare the RecordIO file to Achieve Better Performance
 Running the following command generates RecordIO files such as "mnist.recordio" under the path
 you choose, using the batch_size you choose. You can use batch_size=1 so that the reader can
 later change the batch size at any time using `fluid.batch`.
 ```bash
 python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
 ```
 ## Run Distributed Benchmark on Kubernetes Cluster
 You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@ -38,10 +38,12 @@ def parse_args():
        default='resnet',
        help='The model to run benchmark with.')
    parser.add_argument(
-        '--batch_size', type=int, default=32, help='The minibatch size.')
+        '--batch_size',
        type=int,
        default=32,
        help='The batch size on each gpu.')
    parser.add_argument(
        '--learning_rate', type=float, default=0.001, help='The learning rate.')
    # TODO(wuyi): add "--use_fake_data" option back.
    parser.add_argument(
        '--skip_batch_num',
        type=int,
@ -49,7 +51,10 @@ def parse_args():
        help='The first num of minibatch num to skip, for better performance test'
    )
    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
+        '--iterations',
        type=int,
        default=80,
        help='The number of minibatches, set to -1 to run all batches.')
    parser.add_argument(
        '--pass_num', type=int, default=100, help='The number of passes.')
    parser.add_argument(
@ -69,6 +74,7 @@ def parse_args():
        type=int,
        default=1,
        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
    # this option is available only for vgg and resnet.
    parser.add_argument(
        '--cpus',
        type=int,
@ -78,7 +84,7 @@ def parse_args():
        '--data_set',
        type=str,
        default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
        help='Optional dataset for benchmark.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
@ -108,6 +114,16 @@ def parse_args():
        default='local',
        choices=['local', 'pserver', 'nccl2'],
        help='Choose parameter update method, can be local, pserver, nccl2.')
    parser.add_argument(
        '--use_reader_op',
        action='store_true',
        help='Whether to use reader op, and must specify the data path if set this to true.'
    )
    parser.add_argument(
        '--data_path',
        type=str,
        default="",
        help='Directory that contains all the training recordio files.')
    args = parser.parse_args()
    return args
@ -210,26 +226,50 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_prog)
-    feed_var_list = [
+
-        var for var in train_prog.global_block().vars.itervalues()
+    if not args.use_reader_op:
-        if var.is_data
+        feed_var_list = [
-    ]
+            var for var in train_prog.global_block().vars.itervalues()
-    feeder = fluid.DataFeeder(feed_var_list, place)
+            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
+        if not args.use_reader_op:
            reader_generator = train_reader()
        batch_id = 0
        data = None
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data == None:
                    break
            if iters == args.iterations:
                break
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
-            if iters == args.iterations:
+
-                break
+            if args.use_reader_op:
-            loss = exe.run(train_prog,
+                try:
-                           feed=feeder.feed(data),
+                    loss = exe.run(train_prog, fetch_list=[avg_loss])
-                           fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
                    break
            else:
                loss = exe.run(train_prog,
                               feed=feeder.feed(data),
                               fetch_list=[avg_loss])
            iters += 1
-            num_samples += len(data)
+            batch_id += 1
            # FIXME(wuyi): For use_reader_op, if the current
            # pass is not the last, the last batch of this pass
            # is also equal to args.batch_size.
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)
            train_losses.append(loss)
            print("Pass: %d, Iter: %d, Loss: %f\n" %
                  (pass_id, iters, np.mean(train_losses)))
@ -250,10 +290,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
 def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
                   num_trainers, trainer_id):
-    feed_var_list = [
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-        var for var in train_prog.global_block().vars.itervalues()
+    if not args.use_reader_op:
-        if var.is_data
+        feed_var_list = [
-    ]
+            var for var in train_prog.global_block().vars.itervalues()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
@ -270,7 +314,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                       "value": 1.0,
                       "dtype": var.dtype})
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    if nccl_id_var and trainer_id == 0:
        #FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)
@ -287,12 +330,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
        num_trainers=num_trainers,
        trainer_id=trainer_id)
    feeder = fluid.DataFeeder(feed_var_list, place)
    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
-        for batch_id, data in enumerate(train_reader()):
+        if not args.use_reader_op:
            reader_generator = train_reader()
        batch_id = 0
        data = None
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data == None:
                    break
            if iters == args.iterations:
                break
            if args.profile and pass_id == 0 and batch_id == 5:
                profiler.start_profiler("All")
            elif args.profile and pass_id == 0 and batch_id == 10:
@ -301,19 +353,26 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
-            if iters == args.iterations:
+            if args.use_fake_data or args.use_reader_op:
-                break
+                try:
-            if args.use_fake_data:
+                    loss, = exe.run([avg_loss.name])
-                loss, = exe.run([avg_loss.name])
+                except fluid.core.EnforceNotMet as ex:
                    break
            else:
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
            if args.update_method == "pserver":
                exe.bcast_params()
-            num_samples += len(data)
+            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)
            iters += 1
            if batch_id % 1 == 0:
                print("Pass %d, batch %d, loss %s" %
                      (pass_id, batch_id, np.array(loss)))
            batch_id += 1
        if args.use_reader_op:
            num_samples = num_samples * args.gpus
        print_train_time(start_time, time.time(), num_samples)
        if not args.no_test and batch_acc:
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor):
 def get_model(args):
    if args.use_reader_op:
        raise Exception("machine_translation do not support reader op for now.")
    embedding_dim = 512
    encoder_size = 512
    decoder_size = 512
@ -221,7 +223,7 @@ def get_model(args):
    train_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_batch_generator = paddle.batch(
        paddle.reader.shuffle(
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@ -20,6 +20,7 @@ import numpy as np
 import argparse
 import time
 import cProfile
 import os
 import paddle
 import paddle.fluid as fluid
@ -65,9 +66,24 @@ def cnn_model(data):
 def get_model(args):
-    # Input data
+    if args.use_reader_op:
-    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        filelist = [
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1, 1, 28, 28], (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        images, label = fluid.layers.read_file(data_file)
    else:
        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    if args.device == 'CPU' and args.cpus > 1:
        places = fluid.layers.get_places(args.cpus)
@ -103,7 +119,7 @@ def get_model(args):
    # Reader
    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@ -19,6 +19,7 @@ from __future__ import print_function
 import functools
 import numpy as np
 import time
 import os
 import cProfile, pstats, StringIO
@ -26,6 +27,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
 from recordio_converter import imagenet_train, imagenet_test
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
@ -122,16 +124,48 @@ def get_model(args):
        else:
            dshape = [32, 32, 3]
        model = resnet_cifar10
-    else:
+        train_reader = paddle.dataset.cifar.train10()
        test_reader = paddle.dataset.cifar.test10()
    elif args.data_set == "flowers":
        class_dim = 102
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet
-
+        train_reader = paddle.dataset.flowers.train()
-    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        test_reader = paddle.dataset.flowers.test()
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    elif args.data_set == "imagenet":
        class_dim = 1000
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet
        if not args.data_path:
            raise Exception(
                "Must specify --data_path when training with imagenet")
        train_reader = imagenet_train(args.data_path)
        test_reader = imagenet_test(args.data_path)
    if args.use_reader_op:
        filelist = [
            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1] + dshape, (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        input, label = fluid.layers.read_file(data_file)
    else:
        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    if args.device == 'CPU' and args.cpus > 1:
        places = fluid.layers.get_places(args.cpus)
@ -162,15 +196,10 @@ def get_model(args):
    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-    train_reader = paddle.batch(
+    batched_train_reader = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
+            train_reader, buf_size=5120),
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+        batch_size=args.batch_size * args.gpus)
-            buf_size=5120),
+    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
-        batch_size=args.batch_size)
+
-    test_reader = paddle.batch(
+    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)
    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size):
 def get_model(args):
    if args.use_reader_op:
        raise Exception(
            "stacked_dynamic_lstm do not support reader op for now.")
    lstm_size = 512
    emb_dim = 512
    crop_size = 1500
@ -114,7 +117,7 @@ def get_model(args):
    train_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
--- a/benchmark/fluid/models/vgg.py
+++ b/benchmark/fluid/models/vgg.py
@ -22,6 +22,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse
 import functools
 import os
 def vgg16_bn_drop(input):
@ -65,9 +66,24 @@ def get_model(args):
        else:
            data_shape = [224, 224, 3]
-    # Input data
+    if args.use_reader_op:
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+        filelist = [
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1] + data_shape, (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        images, label = fluid.layers.read_file(data_file)
    else:
        images = fluid.layers.data(name='data', shape=dshape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Train program
    net = vgg16_bn_drop(images)
@ -95,7 +111,7 @@ def get_model(args):
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
--- a/benchmark/fluid/recordio_converter.py
+++ b/benchmark/fluid/recordio_converter.py
@ -0,0 +1,164 @@
 # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import random
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.dataset import mnist, cifar, flowers, image
 def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
                       shape_label):
    """Batch the samples of ``py_reader`` and write them to one RecordIO file.

    The conversion runs inside a fresh pair of fluid Programs so the data
    layers created for the feeder do not end up in the default programs.

    Args:
        py_reader: reader creator; it is called once and the result is batched.
        outfilepath: path of the RecordIO file to write.
        batch_size: number of samples per batch passed to ``paddle.batch``.
        shape_data: shape given to the 'image' data layer.
        shape_label: shape given to the 'label' (int64) data layer.

    Returns:
        Whatever ``convert_reader_to_recordio_file`` returns (presumably the
        number of batches written -- confirm against the fluid API).
    """
    written = 0
    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        batched_reader = paddle.batch(py_reader(), batch_size=batch_size)
        # Feed order matters: image first, then label.
        image_var = fluid.layers.data(name='image', shape=shape_data)
        label_var = fluid.layers.data(
            name='label', shape=shape_label, dtype='int64')
        data_feeder = fluid.DataFeeder(
            feed_list=[image_var, label_var], place=fluid.CPUPlace())
        written = fluid.recordio_writer.convert_reader_to_recordio_file(
            outfilepath, batched_reader, data_feeder)
    return written
 def prepare_mnist(outpath, batch_size):
    """Write the MNIST training set to ``<outpath>/mnist.recordio``.

    Samples are declared as flat 784-element images with a single label.
    """
    convert_2_recordio(
        py_reader=mnist.train,
        outfilepath=os.path.join(outpath, "mnist.recordio"),
        batch_size=batch_size,
        shape_data=[784],
        shape_label=[1])
 def prepare_cifar10(outpath, batch_size):
    """Write the CIFAR-10 training set to ``<outpath>/cifar.recordio``.

    Samples are declared with image shape [3, 32, 32] and a single label.
    """
    convert_2_recordio(
        py_reader=cifar.train10,
        outfilepath=os.path.join(outpath, "cifar.recordio"),
        batch_size=batch_size,
        shape_data=[3, 32, 32],
        shape_label=[1])
 def prepare_flowers(outpath, batch_size):
    """Write the flowers training set to ``<outpath>/flowers.recordio``.

    Samples are declared with image shape [3, 224, 224] and a single label.
    """
    convert_2_recordio(
        py_reader=flowers.train,
        outfilepath=os.path.join(outpath, "flowers.recordio"),
        batch_size=batch_size,
        shape_data=[3, 224, 224],
        shape_label=[1])
 def default_mapper(sample):
    """Transform one ``(image, label)`` sample into a flat float32 image.

    The image goes through ``image.simple_transform`` with the positional
    arguments 256, 224, True and a fixed per-channel mean (presumably resize
    size, crop size and a training flag -- confirm against
    ``paddle.dataset.image``). The label is passed through unchanged.
    """
    raw_img, label = sample
    channel_mean = [103.94, 116.78, 123.68]
    transformed = image.simple_transform(
        raw_img, 256, 224, True, mean=channel_mean)
    flat = transformed.flatten()
    return flat.astype('float32'), label
 def imagenet_train(data_dir):
    """Build a reader creator over the ImageNet training images in data_dir.

    ``data_dir`` must contain exactly the entries "train", "train.txt",
    "val", "val_set", "val.txt" and "unzip.sh". Each line of ``train.txt``
    is "<image file> <integer label>" separated by a single space.

    Args:
        data_dir: root directory of the unpacked ImageNet data.

    Returns:
        The reader produced by ``paddle.reader.map_readers`` applying
        ``default_mapper`` to every ``[image, [label]]`` sample.

    Raises:
        Exception: if the directory contents differ from the expected set.
        ValueError: if a line of ``train.txt`` is malformed.
    """
    expected = set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"])
    if set(os.listdir(data_dir)) != expected:
        raise Exception("Imagenet data contents error!")

    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "train.txt")) as fn:
        for line in fn:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would corrupt the label of a final line that
            # has no trailing newline.
            img, lbl = line.rstrip("\r\n").split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)
    # shuffle all, this is slow
    random.shuffle(imgfilelist)

    def train_reader():
        for imgfile in imgfilelist:
            # File names on disk are looked up in lower case.
            data = image.load_image(
                os.path.join(data_dir, "train", imgfile.lower()))
            label = [img2label[imgfile], ]
            yield [data, label]

    return paddle.reader.map_readers(default_mapper, train_reader)
 def imagenet_test(data_dir):
    """Build a reader creator over the ImageNet validation images in data_dir.

    ``data_dir`` must contain exactly the entries "train", "train.txt",
    "val", "val_set", "val.txt" and "unzip.sh". Each line of ``val.txt`` is
    "<image file> <integer label>" separated by a single space. Images are
    loaded from ``<data_dir>/val/<name before the first dot>.jpeg``.

    Args:
        data_dir: root directory of the unpacked ImageNet data.

    Returns:
        The reader produced by ``paddle.reader.map_readers`` applying
        ``default_mapper`` to every ``[image, [label]]`` sample.

    Raises:
        Exception: if the directory contents differ from the expected set.
        ValueError: if a line of ``val.txt`` is malformed.
    """
    expected = set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"])
    if set(os.listdir(data_dir)) != expected:
        raise Exception("Imagenet data contents error!")

    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "val.txt")) as fn:
        for line in fn:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would corrupt the label of a final line that
            # has no trailing newline.
            img, lbl = line.rstrip("\r\n").split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)

    def test_reader():
        for imgfile in imgfilelist:
            # The listed extension is replaced by "jpeg".
            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
            image_path = ".".join([base_path, "jpeg"])
            data = image.load_image(image_path)
            label = [img2label[imgfile], ]
            yield [data, label]

    return paddle.reader.map_readers(default_mapper, test_reader)
 # FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
 def convert_reader_to_recordio_files(
        filename,
        batch_per_file,
        reader_creator,
        feeder,
        compressor=core.RecordIOWriter.Compressor.Snappy,
        max_num_records=1000,
        feed_order=None):
    """Split the batches of a reader across numbered RecordIO shard files.

    Shards are named ``<stem>-<5-digit index><ext>`` after ``filename``. Each
    shard holds ``batch_per_file`` batches; the last shard holds whatever is
    left over. Earlier behaviour silently dropped that remainder and put one
    extra batch in the first shard.

    Args:
        filename: output name template; its extension must be ".recordio".
        batch_per_file: maximum number of batches per shard, at least 1.
        reader_creator: callable returning an iterable of batches.
        feeder: object whose ``feed(batch)`` returns a mapping from feed
            name to tensor and which exposes ``feed_names``.
        compressor: compressor passed to ``create_recordio_writer``.
        max_num_records: passed to ``create_recordio_writer``.
        feed_order: names to write, in order; defaults to
            ``feeder.feed_names``.

    Returns:
        The total number of batches written across all shards.

    Raises:
        ValueError: if ``batch_per_file`` is smaller than 1.
    """
    if batch_per_file < 1:
        raise ValueError("batch_per_file must be at least 1")
    if feed_order is None:
        feed_order = feeder.feed_names
    f_name, f_ext = os.path.splitext(filename)
    assert (f_ext == ".recordio")

    def _write_shard(batches, shard_index):
        # Write one shard file and return how many batches went into it.
        shard_name = "%s-%05d%s" % (f_name, shard_index, f_ext)
        with fluid.recordio_writer.create_recordio_writer(
                shard_name, compressor, max_num_records) as writer:
            for batch in batches:
                res = feeder.feed(batch)
                for each in feed_order:
                    writer.append_tensor(res[each])
                writer.complete_append_tensor()
        print("written file: ", shard_name)
        return len(batches)

    pending = []
    f_idx = 0
    counter = 0
    for batch in reader_creator():
        pending.append(batch)
        if len(pending) >= batch_per_file:
            counter += _write_shard(pending, f_idx)
            pending = []
            f_idx += 1
    # Flush the trailing batches that did not fill a whole shard.
    if pending:
        counter += _write_shard(pending, f_idx)
    return counter
 def prepare_imagenet(inpath, outpath, batch_size):
    """Convert the ImageNet training set under ``inpath`` to RecordIO shards.

    Shards are written next to ``<outpath>/imagenet.recordio`` with 10000
    passed as the per-file batch count.
    """
    batched_reader = paddle.batch(
        imagenet_train(inpath), batch_size=batch_size)
    image_var = fluid.layers.data(name="image", shape=[3, 224, 224])
    label_var = fluid.layers.data(name="label", shape=[1], dtype='int64')
    data_feeder = fluid.DataFeeder(
        feed_list=[image_var, label_var], place=fluid.CPUPlace())
    target = os.path.join(outpath, "imagenet.recordio")
    convert_reader_to_recordio_files(target, 10000, batched_reader,
                                     data_feeder)
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@ -156,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
 /**
 * Macro to register OperatorKernel.
 */
-#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                      \
+      __reg_op_kernel_##op_type##_##library_type##__,                      \
      "REGISTER_OP_KERNEL must be called in global namespace");            \
  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,       \
+      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,       \
-                                                           #LIBRARY_TYPE); \
+                                                           #library_type); \
-  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                \
+  int TouchOpKernelRegistrar_##op_type##_##library_type() {                \
-    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();          \
+    __op_kernel_registrar_##op_type##_##library_type##__.Touch();          \
    return 0;                                                              \
  }
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -693,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
        }
        if (t != nullptr) {
          int tmp = static_cast<int>(ToDataType(t->type()));
-          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+          PADDLE_ENFORCE(
-                         "DataType of Paddle Op %s must be the same.", Type());
+              tmp == data_type || data_type == -1,
              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
              data_type, tmp);
          data_type = tmp;
        }
      }
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@ -166,8 +166,6 @@ function(op_library TARGET)
      # NOTE(*): activation use macro to regist the kernels, set use_op manually.
      if(${TARGET} STREQUAL "activation")
        file(APPEND ${pybind_file} "USE_OP(relu);\n")
      elseif(${TARGET} STREQUAL "reduce")
        file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
      elseif(${TARGET} STREQUAL "fake_dequantize")
        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
      else()
--- a/paddle/fluid/operators/reduce_max_op.cc
+++ b/paddle/fluid/operators/reduce_max_op.cc
@ -0,0 +1,34 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_min_max_op.h"
 // Register the reduce_max operator through REGISTER_REDUCE_OP. The macro is
 // not defined in this file; presumably it comes from the included header.
 REGISTER_REDUCE_OP(reduce_max);
 // CPU kernels for reduce_max: ReduceKernel instantiated with MaxFunctor for
 // float, double, int and int64_t.
 REGISTER_OP_CPU_KERNEL(
    reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
                                  ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
                      ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                      ops::MaxFunctor>);
 // CPU kernels for reduce_max_grad: ReduceGradKernel instantiated with
 // MaxOrMinGradFunctor for the same four element types.
 REGISTER_OP_CPU_KERNEL(
    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                           float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_max_op.cu
@ -0,0 +1,34 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_min_max_op.h"
 // CUDA kernels for reduce_max: ReduceKernel instantiated with MaxFunctor for
 // float, double, int and int64_t.
 REGISTER_OP_CUDA_KERNEL(reduce_max,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::MaxFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           double, ops::MaxFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int, ops::MaxFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::MaxFunctor>);
 // CUDA kernels for reduce_max_grad: ReduceGradKernel instantiated with
 // MaxOrMinGradFunctor for the same four element types.
 REGISTER_OP_CUDA_KERNEL(
    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_mean_op.cc
@ -0,0 +1,35 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_mean_op.h"
 // Register the reduce_mean operator through REGISTER_REDUCE_OP. The macro is
 // not defined in this file; presumably it comes from the included header.
 REGISTER_REDUCE_OP(reduce_mean);
 // CPU kernels for reduce_mean: ReduceKernel instantiated with MeanFunctor for
 // float, double, int and int64_t.
 REGISTER_OP_CPU_KERNEL(reduce_mean,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          float, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          double, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          int, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          int64_t, ops::MeanFunctor>);
 // CPU kernels for reduce_mean_grad: ReduceGradKernel instantiated with
 // MeanGradFunctor for the same four element types.
 REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
                        ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                              float, ops::MeanGradFunctor>,
                        ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                              double, ops::MeanGradFunctor>,
                        ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                              int, ops::MeanGradFunctor>,
                        ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                              int64_t, ops::MeanGradFunctor>);
--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@ -0,0 +1,34 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_mean_op.h"
 // CUDA forward kernels for reduce_mean, one per supported element type
 // (float, double, int, int64_t), all driven by MeanFunctor.
 REGISTER_OP_CUDA_KERNEL(reduce_mean,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          float, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          double, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int64_t, ops::MeanFunctor>);
 // CUDA backward kernels for the same element types, driven by
 // MeanGradFunctor.
 REGISTER_OP_CUDA_KERNEL(
    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MeanGradFunctor>);
--- a/paddle/fluid/operators/reduce_mean_op.h
+++ b/paddle/fluid/operators/reduce_mean_op.h
@ -0,0 +1,39 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "paddle/fluid/operators/reduce_op.h"
 namespace paddle {
 namespace operators {
 // Forward functor for reduce_mean: assigns the mean of `*x` taken over the
 // axes in `dim` to `*y`, evaluating the expression on the device `place`.
 struct MeanFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    auto& source = *x;
    auto& target = *y;
    target.device(place) = source.mean(dim);
  }
 };
 // Backward functor for reduce_mean: broadcasts the output gradient `*dy` by
 // `dim` and divides it element-wise by `size` before storing it in `*dx`.
 // `x` and `y` are part of the common grad-functor signature but are not read.
 struct MeanGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    // Constant expression shaped like dx, holding the divisor.
    auto divisor = dx->constant(size);
    auto spread_grad = dy->broadcast(dim);
    dx->device(place) = spread_grad / divisor;
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/reduce_min_max_op.h
+++ b/paddle/fluid/operators/reduce_min_max_op.h
@ -0,0 +1,50 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "paddle/fluid/operators/reduce_op.h"
 namespace paddle {
 namespace operators {
 // Forward functor for reduce_max: assigns the maximum of `*x` taken over the
 // axes in `dim` to `*y`, evaluating the expression on the device `place`.
 struct MaxFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    auto& source = *x;
    auto& target = *y;
    target.device(place) = source.maximum(dim);
  }
 };
 // Forward functor for reduce_min: assigns the minimum of `*x` taken over the
 // axes in `dim` to `*y`, evaluating the expression on the device `place`.
 struct MinFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    auto& source = *x;
    auto& target = *y;
    target.device(place) = source.minimum(dim);
  }
 };
 // Backward functor shared by reduce_max and reduce_min. The output gradient
 // `*dy` is broadcast by `dim` and kept only at positions where the input
 // `*x` equals the broadcast reduced value `*y`; every other position gets 0.
 // `size` belongs to the common grad-functor signature and is not read.
 struct MaxOrMinGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    auto pass = dx->constant(1);
    auto block = dx->constant(0);
    auto is_extremum = (*x) == y->broadcast(dim);
    // When several elements tie for the minimum or maximum, each one's
    // subgradient is the set [0, 1]; the gradient is handed to all of them.
    auto mask = is_extremum.select(pass, block);
    dx->device(place) = dy->broadcast(dim) * mask;
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/reduce_min_op.cc
+++ b/paddle/fluid/operators/reduce_min_op.cc
@ -0,0 +1,34 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_min_max_op.h"
 // Operator registration for reduce_min. REGISTER_REDUCE_OP and the `ops`
 // namespace alias are not defined in this file; presumably they come from the
 // included reduce headers -- TODO confirm.
 REGISTER_REDUCE_OP(reduce_min);
 // CPU forward kernels, one per supported element type
 // (float, double, int, int64_t), all driven by MinFunctor.
 REGISTER_OP_CPU_KERNEL(
    reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
                                  ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
                      ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                      ops::MinFunctor>);
 // CPU backward kernels for the same element types; min shares its gradient
 // functor (MaxOrMinGradFunctor) with max.
 REGISTER_OP_CPU_KERNEL(
    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                           float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_min_op.cu
@ -0,0 +1,34 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_min_max_op.h"
 // CUDA forward kernels for reduce_min, one per supported element type
 // (float, double, int, int64_t), all driven by MinFunctor.
 REGISTER_OP_CUDA_KERNEL(reduce_min,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          float, ops::MinFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          double, ops::MinFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int, ops::MinFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int64_t, ops::MinFunctor>);
 // CUDA backward kernels for the same element types, driven by
 // MaxOrMinGradFunctor.
 REGISTER_OP_CUDA_KERNEL(
    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
@ -1,186 +0,0 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/reduce_op.h"
 #include <algorithm>
 #include <string>
 #include <vector>
 namespace paddle {
 namespace operators {
 using framework::Tensor;
 // Forward operator shared by every reduce_* op. It only performs shape
 // inference: it validates the "dim" attribute against the rank of input X and
 // derives the shape (and LoD sharing) of output Out.
 class ReduceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Infers the shape of "Out" from "X" and the attributes "dim", "keep_dim"
  // and "reduce_all".
  //
  // * reduce_all && keep_dim : Out has the rank of X with every extent 1.
  // * reduce_all && !keep_dim: Out has shape {1}.
  // * otherwise              : the reduced axes are set to 1 (keep_dim) or
  //                            removed (!keep_dim).
  //
  // Fails (PADDLE_ENFORCE*) when X/Out are missing, when rank(X) > 6, when a
  // dim lies outside [-rank(X), rank(X)), or when "dim" is empty while
  // reduce_all is false.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ReduceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ReduceOp should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) dims[i] = x_rank + dims[i];
      // A value below -rank is still negative after normalization; reject it
      // here, otherwise it would index dims_vector out of bounds below.
      PADDLE_ENFORCE(
          dims[i] >= 0,
          "The dim should be in the range [-rank(input), rank(input)).");
      PADDLE_ENFORCE_LT(
          dims[i], x_rank,
          "The dim should be in the range [-rank(input), rank(input)).");
    }
    // Sorted so that dims[0] below is the smallest reduced axis.
    std::sort(dims.begin(), dims.end());
    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
    if (reduce_all) {
      if (keep_dim)
        ctx->SetOutputDim(
            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
      else
        ctx->SetOutputDim("Out", {1});
    } else {
      // dims[0] is read below; an empty list would be undefined behavior.
      PADDLE_ENFORCE(
          !dims.empty(),
          "The dim to reduce should not be empty unless reduce_all is true.");
      auto dims_vector = vectorize(x_dims);
      if (keep_dim) {
        for (size_t i = 0; i < dims.size(); ++i) {
          dims_vector[dims[i]] = 1;
        }
      } else {
        // Mark the reduced axes with a sentinel, then erase every marked
        // entry in one pass.
        const int kDelFlag = -2;
        for (size_t i = 0; i < dims.size(); ++i) {
          dims_vector[dims[i]] = kDelFlag;
        }
        dims_vector.erase(
            std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
            dims_vector.end());
      }
      auto out_dims = framework::make_ddim(dims_vector);
      ctx->SetOutputDim("Out", out_dims);
      if (dims[0] != 0) {
        // Only pass LoD when not reducing on the first dim.
        ctx->ShareLoD("X", /*->*/ "Out");
      }
    }
  }
 };
 // Backward operator shared by every reduce_*_grad op. It only performs shape
 // inference: the gradient of X has exactly the shape (and LoD) of X.
 class ReduceGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates the inputs and the "dim" attribute, then gives X@GRAD (when
  // that output exists) the dims and LoD of X.
  //
  // Fails (PADDLE_ENFORCE*) when X or Out@GRAD is missing, when rank(X) > 6,
  // or when a dim lies outside [-rank(X), rank(X)).
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
    // "dim" is only validated here; the output shape does not depend on it,
    // so no normalized or sorted copy needs to be kept.
    const auto &dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (int dim : dims) {
      if (dim < 0) dim += x_rank;
      // A value below -rank is still negative after normalization.
      PADDLE_ENFORCE(
          dim >= 0,
          "The dim should be in the range [-rank(input), rank(input)).");
      PADDLE_ENFORCE_LT(
          dim, x_rank,
          "The dim should be in the range [-rank(input), rank(input)).");
    }
    auto x_grad_name = framework::GradVarName("X");
    if (ctx->HasOutput(x_grad_name)) {
      ctx->SetOutputDim(x_grad_name, x_dims);
      ctx->ShareLoD("X", /*->*/ x_grad_name);
    }
  }
 };
 // Proto/checker maker shared by the reduce operators. It declares input "X",
 // output "Out" and the attributes "dim", "keep_dim" and "reduce_all".
 // Subclasses supply the two strings substituted into the operator comment.
 class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  // Declares the operator interface. Marked final, so subclasses customize
  // only GetName() and GetOpType().
  void Make() final {
    AddInput("X",
             "(Tensor) The input tensor. Tensors with rank at most 6 are "
             "supported.");
    AddOutput("Out", "(Tensor) The result tensor.");
    AddAttr<std::vector<int>>(
        "dim",
        "(list<int>, default {0}) The dimensions to reduce. "
        "Must be in the range [-rank(input), rank(input)). "
        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
        "Note that reducing on the first dim will make the LoD info lost.")
        .SetDefault({0});
    AddAttr<bool>("keep_dim",
                  "(bool, default false) "
                  "If true, retain the reduced dimension with length 1.")
        .SetDefault(false);
    AddAttr<bool>("reduce_all",
                  "(bool, default false) "
                  "If true, output a scalar reduced along all dimensions.")
        .SetDefault(false);
    // The first %s receives GetOpType(), the second receives GetName().
    AddComment(string::Sprintf(R"DOC(
 %s Operator.
 This operator computes the %s of input tensor along the given dimension.
 The result tensor has 1 fewer dimension than the input unless keep_dim is true.
 If reduce_all is true, just reduce along all dimensions and output a scalar.
 )DOC",
                               GetOpType(), GetName()));
  }
 protected:
  // Name of the reduction, substituted into the second %s of the comment.
  virtual std::string GetName() const = 0;
  // Operator title, substituted into the first %s of the comment.
  virtual std::string GetOpType() const = 0;
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 // Registers the operator pair reduce_<op_name> / reduce_<op_name>_grad.
 // A ReduceOpMaker subclass is generated per op so that the operator comment
 // carries the reduction name (#op_name) and the title "Reduce <op_name>".
 // The forward op uses ReduceOp, the generated maker and
 // DefaultGradOpDescMaker<true>; the grad op uses ReduceGradOp.
 // NOTE(review): the generated class name starts with a double underscore,
 // which is an identifier form reserved to the C++ implementation.
 #define REGISTER_REDUCE_OP(op_name)                                        \
  class __##op_name##Maker__ : public ops::ReduceOpMaker {                 \
   protected:                                                              \
    virtual std::string GetName() const { return #op_name; }               \
    virtual std::string GetOpType() const { return "Reduce " #op_name; }   \
  };                                                                       \
  REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \
                    paddle::framework::DefaultGradOpDescMaker<true>);      \
  REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp)
 // Instantiate the forward/grad operator pair for each supported reduction:
 // reduce_sum, reduce_mean, reduce_max, reduce_min and reduce_prod.
 REGISTER_REDUCE_OP(sum);
 REGISTER_REDUCE_OP(mean);
 REGISTER_REDUCE_OP(max);
 REGISTER_REDUCE_OP(min);
 REGISTER_REDUCE_OP(prod);
 // Registers the CPU kernels of one reduction: ReduceKernel with `functor` for
 // `reduce_type` and ReduceGradKernel with `grad_functor` for
 // `reduce_type`_grad, each instantiated for float, double, int and int64_t.
 #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
                                           float, ops::functor>,               \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
                                           double, ops::functor>,              \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
                                           int, ops::functor>,                 \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
                                           int64_t, ops::functor>);            \
  REGISTER_OP_CPU_KERNEL(                                                      \
      reduce_type##_grad,                                                      \
      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
                            ops::grad_functor>,                                \
      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,        \
                            ops::grad_functor>,                                \
      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,           \
                            ops::grad_functor>,                                \
      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,       \
                            ops::grad_functor>);
 // FOR_EACH_KERNEL_FUNCTOR is not defined in this file; presumably it applies
 // the macro above to every (reduce_type, functor, grad_functor) triple
 // declared in reduce_op.h -- TODO confirm.
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
--- a/paddle/fluid/operators/reduce_op.cu
+++ b/paddle/fluid/operators/reduce_op.cu
@ -1,41 +0,0 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/reduce_op.h"
 namespace ops = paddle::operators;
 // Registers the CUDA kernels of one reduction: ReduceKernel with `functor`
 // for `reduce_type` and ReduceGradKernel with `grad_functor` for
 // `reduce_type`_grad, each instantiated for float, double, int and int64_t.
 #define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)    \
  REGISTER_OP_CUDA_KERNEL(                                                \
      reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
                                     float, ops::functor>,                \
      ops::ReduceKernel<paddle::platform::CUDADeviceContext, double,      \
                        ops::functor>,                                    \
      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int,         \
                        ops::functor>,                                    \
      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t,     \
                        ops::functor>);                                   \
  REGISTER_OP_CUDA_KERNEL(                                                \
      reduce_type##_grad,                                                 \
      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,   \
                            ops::grad_functor>,                           \
      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,  \
                            ops::grad_functor>,                           \
      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,     \
                            ops::grad_functor>,                           \
      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
                            ops::grad_functor>);
 // FOR_EACH_KERNEL_FUNCTOR is not defined in this file; presumably it applies
 // the macro above to every (reduce_type, functor, grad_functor) triple
 // declared in reduce_op.h -- TODO confirm.
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
--- a/paddle/fluid/operators/reduce_op_function.h
+++ b/paddle/fluid/operators/reduce_op_function.h
@ -0,0 +1,109 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {
 // Short names for the framework tensor and dimension types.
 using Tensor = framework::Tensor;
 using DDim = framework::DDim;
 // Alias of framework::EigenTensor for element type T and rank D, defaulting
 // to row-major layout and Eigen::DenseIndex indices.
 template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 // Alias of framework::EigenScalar with the same defaults.
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
 // Alias of framework::EigenVector with the same defaults.
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 // Applies the reduction `Functor` to `input` over the axes listed in `dims`
 // and writes the result into `output`.
 //
 // Template parameters:
 //   D       rank used to view `input` as an Eigen tensor.
 //   R_D     capacity of the Eigen array holding the reduced axes; `dims`
 //           must not contain more than R_D entries, since every entry is
 //           written into that array without a bounds check.
 //   Functor callable as functor(device, &x, &out, reduce_dim).
 //
 // Parameters:
 //   context   device context; its eigen_device() evaluates the expression.
 //   input     tensor to reduce.
 //   output    destination; its dims() are read here, so they are expected to
 //             be set by the caller -- TODO confirm it is also allocated.
 //   dims      axes to reduce; negative values count from the last axis.
 //   keep_dim  whether `output` still carries the reduced axes (extent 1).
 template <typename DeviceContext, typename T, size_t D, size_t R_D,
          typename Functor>
 void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
                   framework::Tensor* output, const std::vector<int>& dims,
                   bool keep_dim) {
  auto x = EigenTensor<T, D>::From(input);
  auto x_rank = static_cast<int>(x.dimensions().size());
  auto reduce_dim = Eigen::array<int, R_D>();
  std::vector<int> dims_ref = dims;
  for (size_t i = 0; i < dims_ref.size(); ++i) {
    // Normalize negative axes to their non-negative equivalent.
    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
    reduce_dim[i] = dims_ref[i];
  }
  // construct the squeezed output tensor
  DDim out_dims = output->dims();
  if (keep_dim && x_rank > 1) {
    // The Eigen result has rank D - R_D, so drop the kept axes from the view
    // of `output`: mark them with a sentinel and erase the marked entries.
    const int kDelFlag = -2;
    auto dims_vector = framework::vectorize(out_dims);
    for (size_t i = 0; i < dims_ref.size(); ++i) {
      dims_vector[dims_ref[i]] = kDelFlag;
    }
    dims_vector.erase(
        std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
        dims_vector.end());
    out_dims = framework::make_ddim(dims_vector);
  }
  auto& place = *context.eigen_device();
  Functor functor;
  if (D == 1) {
    // Reducing a rank-1 input: view the output as a scalar.
    auto out = EigenScalar<T>::From(*output);
    functor(place, &x, &out, reduce_dim);
  } else {
    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
    functor(place, &x, &out, reduce_dim);
  }
 }
 // Runs the gradient `Functor` of a reduction.
 //
 // `input0` is viewed with its own dims and passed to the functor as x;
 // `input1` and `input2` are viewed with the dims of `input0` where every
 // reduced axis is set to 1 and passed as y and dy; `output` is passed as dx.
 // The functor additionally receives a per-axis broadcast factor (the extent
 // of `input0` on reduced axes, 1 elsewhere) and the product of the reduced
 // extents. Negative entries of `dims` count from the last axis.
 template <typename DeviceContext, typename T, size_t D, typename Functor>
 void ReduceGradFunctor(const DeviceContext& context,
                       const framework::Tensor& input0,
                       const framework::Tensor& input1,
                       const framework::Tensor& input2,
                       framework::Tensor* output,
                       const std::vector<int>& dims) {
  auto x_dims = input0.dims();
  auto x = EigenTensor<T, D>::From(input0);
  auto x_grad = EigenTensor<T, D>::From(*output);
  auto x_rank = static_cast<int>(x.dimensions().size());

  // Start from "broadcast nothing" and the full input shape, then adjust
  // the entries that belong to reduced axes.
  Eigen::array<int, D> broadcast_dim;
  for (size_t axis = 0; axis < D; ++axis) broadcast_dim[axis] = 1;
  auto collapsed_dims_v = framework::vectorize(x_dims);
  int reduced_count = 1;
  for (int axis : dims) {
    if (axis < 0) axis = x_rank + axis;
    collapsed_dims_v[axis] = 1;
    broadcast_dim[axis] = x_dims[axis];
    reduced_count *= x_dims[axis];
  }

  auto collapsed_dims = framework::make_ddim(collapsed_dims_v);
  auto x_reduce = EigenTensor<T, D>::From(input1, collapsed_dims);
  auto x_reduce_grad = EigenTensor<T, D>::From(input2, collapsed_dims);

  Functor functor;
  auto& place = *context.eigen_device();
  functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
          reduced_count);
 }
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/reduce_prod_op.cc
+++ b/paddle/fluid/operators/reduce_prod_op.cc
@ -0,0 +1,35 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_prod_op.h"
 // Operator registration for reduce_prod. REGISTER_REDUCE_OP and the `ops`
 // namespace alias are not defined in this file; presumably they come from the
 // included reduce headers -- TODO confirm.
 REGISTER_REDUCE_OP(reduce_prod);
 // CPU forward kernels, one per supported element type
 // (float, double, int, int64_t), all driven by ProdFunctor.
 REGISTER_OP_CPU_KERNEL(reduce_prod,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         float, ops::ProdFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         double, ops::ProdFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         int, ops::ProdFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         int64_t, ops::ProdFunctor>);
 // CPU backward kernels for the same element types, driven by
 // ProdGradFunctor.
 REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                             float, ops::ProdGradFunctor>,
                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                             double, ops::ProdGradFunctor>,
                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                             int, ops::ProdGradFunctor>,
                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                             int64_t, ops::ProdGradFunctor>);
--- a/Show More
+++ b/Show More