All file pass pre-commit hook

9 years ago · 068bfbb817
parent 0cdfa8cd6b
commit 068bfbb817
261 changed files with 1524 additions and 1404 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,10 +8,13 @@ os:
 env:
  - JOB=DOCS
  - JOB=BUILD_AND_TEST
+  - JOB=PRE_COMMIT
 matrix:
  exclude:
    - os: osx
-      env: JOB=DOCS  # Only generate documentation in linux
+      env: JOB=DOCS  # Only generate documentation in linux.
+    - os: osx
+      env: JOB=PRE_COMMIT # Only check pre-commit hook in linux

 addons:
  apt:
@@ -39,6 +42,7 @@ addons:
      - lcov
      - graphviz
      - swig
+      - clang-format-3.8
 before_install:
  - |
    if [ ${JOB} == "BUILD_AND_TEST" ]; then
@@ -50,7 +54,8 @@ before_install:
    fi
  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
+  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
+  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme pre-commit
 script:
  - paddle/scripts/travis/main.sh
 notifications:
--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@@ -25,4 +25,3 @@ test 4 2 256 512
 test 4 2 512 128 
 test 4 2 512 256 
 test 4 2 512 512 
-
--- a/demo/gan/data/download_cifar.sh
+++ b/demo/gan/data/download_cifar.sh
@@ -15,4 +15,3 @@ set -e
 wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
 tar zxf cifar-10-python.tar.gz
 rm cifar-10-python.tar.gz
-
--- a/demo/gan/data/get_mnist_data.sh
+++ b/demo/gan/data/get_mnist_data.sh
@@ -15,5 +15,3 @@ do
        gunzip ${fname}.gz
    fi
 done
-
-
--- a/demo/gan/gan_conf.py
+++ b/demo/gan/gan_conf.py
@@ -14,10 +14,9 @@
 from paddle.trainer_config_helpers import *

 mode = get_config_arg("mode", str, "generator")
-assert mode in set(["generator",
-                    "discriminator",
-                    "generator_training",
-                    "discriminator_training"])
+assert mode in set([
+    "generator", "discriminator", "generator_training", "discriminator_training"
+])

 is_generator_training = mode == "generator_training"
 is_discriminator_training = mode == "discriminator_training"
@@ -38,8 +37,8 @@ sample_dim = 2
 settings(
    batch_size=128,
    learning_rate=1e-4,
-    learning_method=AdamOptimizer(beta1=0.5)
-)
+    learning_method=AdamOptimizer(beta1=0.5))
+

 def discriminator(sample):
    """
@@ -50,71 +49,88 @@ def discriminator(sample):
    of the sample is from real data.
    """
    param_attr = ParamAttr(is_static=is_generator_training)
-    bias_attr = ParamAttr(is_static=is_generator_training,
-                          initial_mean=1.0,
-                          initial_std=0)
+    bias_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=1.0, initial_std=0)

-    hidden = fc_layer(input=sample, name="dis_hidden", size=hidden_dim,
+    hidden = fc_layer(
+        input=sample,
+        name="dis_hidden",
+        size=hidden_dim,
        bias_attr=bias_attr,
        param_attr=param_attr,
        act=ReluActivation())

-    hidden2 = fc_layer(input=hidden, name="dis_hidden2", size=hidden_dim,
+    hidden2 = fc_layer(
+        input=hidden,
+        name="dis_hidden2",
+        size=hidden_dim,
        bias_attr=bias_attr,
        param_attr=param_attr,
        act=LinearActivation())

-    hidden_bn = batch_norm_layer(hidden2, 
+    hidden_bn = batch_norm_layer(
+        hidden2,
        act=ReluActivation(),
        name="dis_hidden_bn",
        bias_attr=bias_attr,
-                     param_attr=ParamAttr(is_static=is_generator_training,
-                           initial_mean=1.0,
+        param_attr=ParamAttr(
+            is_static=is_generator_training, initial_mean=1.0,
            initial_std=0.02),
        use_global_stats=False)

-    return fc_layer(input=hidden_bn, name="dis_prob", size=2,
+    return fc_layer(
+        input=hidden_bn,
+        name="dis_prob",
+        size=2,
        bias_attr=bias_attr,
        param_attr=param_attr,
        act=SoftmaxActivation())

+
 def generator(noise):
    """
    generator generates a sample given noise
    """
    param_attr = ParamAttr(is_static=is_discriminator_training)
-    bias_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=1.0,
-                           initial_std=0)
+    bias_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0)

-    hidden = fc_layer(input=noise,
+    hidden = fc_layer(
+        input=noise,
        name="gen_layer_hidden",
        size=hidden_dim,
        bias_attr=bias_attr,
        param_attr=param_attr,
        act=ReluActivation())

-    hidden2 = fc_layer(input=hidden, name="gen_hidden2", size=hidden_dim,
+    hidden2 = fc_layer(
+        input=hidden,
+        name="gen_hidden2",
+        size=hidden_dim,
        bias_attr=bias_attr,
        param_attr=param_attr,
        act=LinearActivation())

-    hidden_bn = batch_norm_layer(hidden2, 
+    hidden_bn = batch_norm_layer(
+        hidden2,
        act=ReluActivation(),
        name="gen_layer_hidden_bn",
        bias_attr=bias_attr,
-                     param_attr=ParamAttr(is_static=is_discriminator_training,
+        param_attr=ParamAttr(
+            is_static=is_discriminator_training,
            initial_mean=1.0,
            initial_std=0.02),
        use_global_stats=False)

-    return fc_layer(input=hidden_bn,
+    return fc_layer(
+        input=hidden_bn,
        name="gen_layer1",
        size=sample_dim,
        bias_attr=bias_attr,
        param_attr=param_attr,
        act=LinearActivation())

+
 if is_generator_training:
    noise = data_layer(name="noise", size=noise_dim)
    sample = generator(noise)
@@ -126,7 +142,8 @@ if is_generator_training or is_discriminator_training:
    label = data_layer(name="label", size=1)
    prob = discriminator(sample)
    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(input=prob, label=label, name=mode+'_error')
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
    outputs(cost)

 if is_generator:
--- a/demo/gan/gan_conf_image.py
+++ b/demo/gan/gan_conf_image.py
@@ -15,10 +15,9 @@ from paddle.trainer_config_helpers import *

 mode = get_config_arg("mode", str, "generator")
 dataSource = get_config_arg("data", str, "mnist")
-assert mode in set(["generator",
-                    "discriminator",
-                    "generator_training",
-                    "discriminator_training"])
+assert mode in set([
+    "generator", "discriminator", "generator_training", "discriminator_training"
+])

 is_generator_training = mode == "generator_training"
 is_discriminator_training = mode == "discriminator_training"
@@ -47,13 +46,22 @@ s8, s16 = int(sample_dim/8), int(sample_dim/16)
 settings(
    batch_size=128,
    learning_rate=2e-4,
-    learning_method=AdamOptimizer(beta1=0.5)
-)
-
-def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name, 
-                 param_attr, bias_attr, param_attr_bn, bn, trans=False, 
+    learning_method=AdamOptimizer(beta1=0.5))
+
+
+def conv_bn(input,
+            channels,
+            imgSize,
+            num_filters,
+            output_x,
+            stride,
+            name,
+            param_attr,
+            bias_attr,
+            param_attr_bn,
+            bn,
+            trans=False,
            act=ReluActivation()):
-    
    """
    conv_bn is a utility function that constructs a convolution/deconv layer 
    with an optional batch_norm layer
@@ -84,16 +92,27 @@ def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name,
        nameApx = "_convt"

    if bn:
-        conv = img_conv_layer(input, filter_size=filter_size, 
+        conv = img_conv_layer(
+            input,
+            filter_size=filter_size,
            num_filters=num_filters,
-                   name=name + nameApx, num_channels=channels,
-                   act=LinearActivation(), groups=1, stride=stride, 
-                   padding=padding, bias_attr=bias_attr,
-                   param_attr=param_attr, shared_biases=True, layer_attr=None,
-                   filter_size_y=None, stride_y=None, padding_y=None, 
+            name=name + nameApx,
+            num_channels=channels,
+            act=LinearActivation(),
+            groups=1,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr,
+            param_attr=param_attr,
+            shared_biases=True,
+            layer_attr=None,
+            filter_size_y=None,
+            stride_y=None,
+            padding_y=None,
            trans=trans)

-        conv_bn = batch_norm_layer(conv, 
+        conv_bn = batch_norm_layer(
+            conv,
            act=act,
            name=name + nameApx + "_bn",
            bias_attr=bias_attr,
@@ -102,46 +121,57 @@ def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name,

        return conv_bn
    else:
-        conv = img_conv_layer(input, filter_size=filter_size, 
+        conv = img_conv_layer(
+            input,
+            filter_size=filter_size,
            num_filters=num_filters,
-                   name=name + nameApx, num_channels=channels,
-                   act=act, groups=1, stride=stride, 
-                   padding=padding, bias_attr=bias_attr,
-                   param_attr=param_attr, shared_biases=True, layer_attr=None,
-                   filter_size_y=None, stride_y=None, padding_y=None,
+            name=name + nameApx,
+            num_channels=channels,
+            act=act,
+            groups=1,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr,
+            param_attr=param_attr,
+            shared_biases=True,
+            layer_attr=None,
+            filter_size_y=None,
+            stride_y=None,
+            padding_y=None,
            trans=trans)
        return conv

+
 def generator(noise):
    """
    generator generates a sample given noise
    """
-    param_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=0.0,
-                           initial_std=0.02)
-    bias_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=0.0,
-                           initial_std=0.0)
-    
-    param_attr_bn=ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02)
-    
-    h1 = fc_layer(input=noise,
+    param_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.02)
+    bias_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.0)
+
+    param_attr_bn = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0.02)
+
+    h1 = fc_layer(
+        input=noise,
        name="gen_layer_h1",
        size=s8 * s8 * gf_dim * 4,
        bias_attr=bias_attr,
        param_attr=param_attr,
        act=LinearActivation())

-    h1_bn = batch_norm_layer(h1, 
+    h1_bn = batch_norm_layer(
+        h1,
        act=ReluActivation(),
        name="gen_layer_h1_bn",
        bias_attr=bias_attr,
        param_attr=param_attr_bn,
        use_global_stats=False)

-    h2_bn = conv_bn(h1_bn, 
+    h2_bn = conv_bn(
+        h1_bn,
        channels=gf_dim * 4,
        output_x=s8,
        num_filters=gf_dim * 2,
@@ -154,7 +184,8 @@ def generator(noise):
        bn=True,
        trans=True)

-    h3_bn = conv_bn(h2_bn, 
+    h3_bn = conv_bn(
+        h2_bn,
        channels=gf_dim * 2,
        output_x=s4,
        num_filters=gf_dim,
@@ -167,8 +198,8 @@ def generator(noise):
        bn=True,
        trans=True)

-    
-    return conv_bn(h3_bn,
+    return conv_bn(
+        h3_bn,
        channels=gf_dim,
        output_x=s2,
        num_filters=c_dim,
@@ -191,18 +222,16 @@ def discriminator(sample):
    of the sample is from generator and dimension 1 is the probabblity
    of the sample is from real data.
    """
-    param_attr = ParamAttr(is_static=is_generator_training,
-                           initial_mean=0.0,
-                           initial_std=0.02)
-    bias_attr = ParamAttr(is_static=is_generator_training,
-                          initial_mean=0.0,
-                          initial_std=0.0)
-    
-    param_attr_bn=ParamAttr(is_static=is_generator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02)
-    
-    h0 = conv_bn(sample, 
+    param_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=0.0, initial_std=0.02)
+    bias_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=0.0, initial_std=0.0)
+
+    param_attr_bn = ParamAttr(
+        is_static=is_generator_training, initial_mean=1.0, initial_std=0.02)
+
+    h0 = conv_bn(
+        sample,
        channels=c_dim,
        imgSize=sample_dim,
        num_filters=df_dim,
@@ -214,7 +243,8 @@ def discriminator(sample):
        param_attr_bn=param_attr_bn,
        bn=False)

-    h1_bn = conv_bn(h0, 
+    h1_bn = conv_bn(
+        h0,
        channels=df_dim,
        imgSize=s2,
        num_filters=df_dim * 2,
@@ -226,7 +256,8 @@ def discriminator(sample):
        param_attr_bn=param_attr_bn,
        bn=True)

-    h2_bn = conv_bn(h1_bn, 
+    h2_bn = conv_bn(
+        h1_bn,
        channels=df_dim * 2,
        imgSize=s4,
        num_filters=df_dim * 4,
@@ -238,13 +269,15 @@ def discriminator(sample):
        param_attr_bn=param_attr_bn,
        bn=True)

-    return fc_layer(input=h2_bn, name="dis_prob", size=2,
+    return fc_layer(
+        input=h2_bn,
+        name="dis_prob",
+        size=2,
        bias_attr=bias_attr,
        param_attr=param_attr,
        act=SoftmaxActivation())


-
 if is_generator_training:
    noise = data_layer(name="noise", size=noise_dim)
    sample = generator(noise)
@@ -256,7 +289,8 @@ if is_generator_training or is_discriminator_training:
    label = data_layer(name="label", size=1)
    prob = discriminator(sample)
    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(input=prob, label=label, name=mode+'_error')
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
    outputs(cost)

 if is_generator:
--- a/demo/gan/gan_trainer.py
+++ b/demo/gan/gan_trainer.py
@@ -24,6 +24,7 @@ from paddle.trainer.config_parser import logger
 import py_paddle.swig_paddle as api
 import matplotlib.pyplot as plt

+
 def plot2DScatter(data, outputfile):
    '''
    Plot the data as a 2D scatter plot and save to outputfile
@@ -41,9 +42,11 @@ def plot2DScatter(data, outputfile):
    plt.scatter(x, y)
    plt.savefig(outputfile, bbox_inches='tight')

+
 def CHECK_EQ(a, b):
    assert a == b, "a=%s, b=%s" % (a, b)

+
 def copy_shared_parameters(src, dst):
    '''
    copy the parameters from src to dst
@@ -52,11 +55,9 @@ def copy_shared_parameters(src, dst):
    :param dst: the destination of the parameters
    :type dst: GradientMachine
    '''
-    src_params = [src.getParameter(i)
-               for i in xrange(src.getParameterSize())]
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
    src_params = dict([(p.getName(), p) for p in src_params])

-
    for i in xrange(dst.getParameterSize()):
        dst_param = dst.getParameter(i)
        src_param = src_params.get(dst_param.getName(), None)
@@ -68,14 +69,16 @@ def copy_shared_parameters(src, dst):
        dst_value.copyFrom(src_value)
        dst_param.setValueUpdated()

+
 def print_parameters(src):
-    src_params = [src.getParameter(i)
-               for i in xrange(src.getParameterSize())]
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]

    print "***************"
    for p in src_params:
        print "Name is %s" % p.getName()
-        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray()
+        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray(
+        )
+

 def load_mnist_data(imageFile):
    f = open(imageFile, "rb")
@@ -93,6 +96,7 @@ def load_mnist_data(imageFile):
    f.close()
    return data.astype('float32')

+
 def load_cifar_data(cifar_path):
    batch_size = 10000
    data = numpy.zeros((5 * batch_size, 32 * 32 * 3), dtype="float32")
@@ -106,11 +110,13 @@ def load_cifar_data(cifar_path):
    data = data / 255.0 * 2.0 - 1.0
    return data

+
 # synthesize 2-D uniform data
 def load_uniform_data():
    data = numpy.random.rand(1000000, 2).astype('float32')
    return data

+
 def merge(images, size):
    if images.shape[1] == 28 * 28:
        h, w, c = 28, 28, 1
@@ -124,6 +130,7 @@ def merge(images, size):
          ((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
    return img.astype('uint8')

+
 def save_images(images, path):
    merged_img = merge(images, [8, 8])
    if merged_img.shape[2] == 1:
@@ -132,13 +139,16 @@ def save_images(images, path):
        im = Image.fromarray(merged_img, mode="RGB")
    im.save(path)

+
 def get_real_samples(batch_size, data_np):
-    return data_np[numpy.random.choice(data_np.shape[0], batch_size, 
-                                       replace=False),:]
+    return data_np[numpy.random.choice(
+        data_np.shape[0], batch_size, replace=False), :]
+

 def get_noise(batch_size, noise_dim):
    return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')

+
 def get_fake_samples(generator_machine, batch_size, noise):
    gen_inputs = api.Arguments.createArguments(1)
    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
@@ -147,12 +157,14 @@ def get_fake_samples(generator_machine, batch_size, noise):
    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
    return fake_samples

+
 def get_training_loss(training_machine, inputs):
    outputs = api.Arguments.createArguments(0)
    training_machine.forward(inputs, outputs, api.PASS_TEST)
    loss = outputs.getSlotValue(0).copyToNumpyMat()
    return numpy.mean(loss)

+
 def prepare_discriminator_data_batch_pos(batch_size, data_np):
    real_samples = get_real_samples(batch_size, data_np)
    labels = numpy.ones(batch_size, dtype='int32')
@@ -161,6 +173,7 @@ def prepare_discriminator_data_batch_pos(batch_size, data_np):
    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
    return inputs

+
 def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
    fake_samples = get_fake_samples(generator_machine, batch_size, noise)
    labels = numpy.zeros(batch_size, dtype='int32')
@@ -169,6 +182,7 @@ def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
    return inputs

+
 def prepare_generator_data_batch(batch_size, noise):
    label = numpy.ones(batch_size, dtype='int32')
    inputs = api.Arguments.createArguments(2)
@@ -193,10 +207,9 @@ def get_layer_size(model_conf, layer_name):
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
-    parser.add_argument("--use_gpu", default="1", 
-                        help="1 means use gpu for training")
-    parser.add_argument("--gpu_id", default="0", 
-                        help="the gpu_id parameter")
+    parser.add_argument(
+        "--use_gpu", default="1", help="1 means use gpu for training")
+    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
    args = parser.parse_args()
    data_source = args.data_source
    use_gpu = args.use_gpu
@@ -209,8 +222,9 @@ def main():
    if not os.path.exists("./%s_params/" % data_source):
        os.makedirs("./%s_params/" % data_source)

-    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', '--log_period=100', 
-                   '--gpu_id=' + args.gpu_id, '--save_dir=' + "./%s_params/" % data_source)
+    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
+                   '--log_period=100', '--gpu_id=' + args.gpu_id,
+                   '--save_dir=' + "./%s_params/" % data_source)

    if data_source == "uniform":
        conf = "gan_conf.py"
@@ -220,7 +234,8 @@ def main():
        num_iter = 1000

    gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
-    dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source)
+    dis_conf = parse_config(conf,
+                            "mode=discriminator_training,data=" + data_source)
    generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
    batch_size = dis_conf.opt_config.batch_size
    noise_dim = get_layer_size(gen_conf.model_config, "noise")
@@ -245,11 +260,9 @@ def main():
    generator_machine = api.GradientMachine.createFromConfigProto(
        generator_conf.model_config)

-    dis_trainer = api.Trainer.create(
-        dis_conf, dis_training_machine)
+    dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)

-    gen_trainer = api.Trainer.create(
-        gen_conf, gen_training_machine)
+    gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)

    dis_trainer.startTrain()
    gen_trainer.startTrain()
@@ -272,21 +285,23 @@ def main():
            noise = get_noise(batch_size, noise_dim)
            data_batch_dis_pos = prepare_discriminator_data_batch_pos(
                batch_size, data_np)
-            dis_loss_pos = get_training_loss(dis_training_machine, data_batch_dis_pos)
+            dis_loss_pos = get_training_loss(dis_training_machine,
+                                             data_batch_dis_pos)

            data_batch_dis_neg = prepare_discriminator_data_batch_neg(
                generator_machine, batch_size, noise)
-            dis_loss_neg = get_training_loss(dis_training_machine, data_batch_dis_neg)            
+            dis_loss_neg = get_training_loss(dis_training_machine,
+                                             data_batch_dis_neg)

            dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0

            # Do forward pass in generator to get the gen_loss
-            data_batch_gen = prepare_generator_data_batch(
-                    batch_size, noise)
+            data_batch_gen = prepare_generator_data_batch(batch_size, noise)
            gen_loss = get_training_loss(gen_training_machine, data_batch_gen)

            if i % 100 == 0:
-                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos, dis_loss_neg) 
+                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos,
+                                                                 dis_loss_neg)
                print "d_loss is %s    g_loss is %s" % (dis_loss, gen_loss)

            # Decide which network to train based on the training history
@@ -300,7 +315,8 @@ def main():
                    curr_strike = 1
                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
-                copy_shared_parameters(dis_training_machine, gen_training_machine)
+                copy_shared_parameters(dis_training_machine,
+                                       gen_training_machine)

            else:
                if curr_train == "gen":
@@ -311,7 +327,8 @@ def main():
                gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
                # TODO: add API for paddle to allow true parameter sharing between different GradientMachines 
                # so that we do not need to copy shared parameters. 
-                copy_shared_parameters(gen_training_machine, dis_training_machine)
+                copy_shared_parameters(gen_training_machine,
+                                       dis_training_machine)
                copy_shared_parameters(gen_training_machine, generator_machine)

        dis_trainer.finishTrainPass()
@@ -319,11 +336,14 @@ def main():
        # At the end of each pass, save the generated samples/images
        fake_samples = get_fake_samples(generator_machine, batch_size, noise)
        if data_source == "uniform":
-            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
+            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" %
+                          (data_source, train_pass))
        else:
-            save_images(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
+            save_images(fake_samples, "./%s_samples/train_pass%s.png" %
+                        (data_source, train_pass))
    dis_trainer.finishTrain()
    gen_trainer.finishTrain()

+
 if __name__ == '__main__':
    main()
--- a/demo/quick_start/trainer_config.resnet-lstm.py
+++ b/demo/quick_start/trainer_config.resnet-lstm.py
@@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 This configuration is a demonstration of how to implement the stacked LSTM
 with residual connections, i.e. an LSTM layer takes the sum of the hidden states
@@ -46,7 +45,8 @@ is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None
 tst = 'data/test.list' if not is_predict else 'data/pred.list'
 process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(train_list=trn,
+define_py_data_sources2(
+    train_list=trn,
    test_list=tst,
    module="dataprovider_emb",
    obj=process,
@@ -58,8 +58,7 @@ settings(
    learning_rate=2e-3,
    learning_method=AdamOptimizer(),
    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25
-)
+    gradient_clipping_threshold=25)

 bias_attr = ParamAttr(initial_std=0., l2_rate=0.)

@@ -73,17 +72,15 @@ for i in range(3):
    # The input to the current layer is the sum of the hidden state
    # and input of the previous layer.
    current_input = addto_layer(input=[previous_input, previous_hidden_state])
-    hidden_state = simple_lstm(input=current_input, size=128,
-                               lstm_cell_attr=ExtraAttr(drop_rate=0.1))
+    hidden_state = simple_lstm(
+        input=current_input, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
    previous_input, previous_hidden_state = current_input, hidden_state

 lstm = previous_hidden_state

 lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(input=lstm_last, size=2,
-                  bias_attr=bias_attr,
-                  act=SoftmaxActivation())
-
+output = fc_layer(
+    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())

 if is_predict:
    maxid = maxid_layer(output)
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -69,7 +69,6 @@ def extract_dict_features(pair_file, feature_file):
            feature_out.write(feature_str + '\n')


-
 if __name__ == '__main__':

    usage = '-p pair_file -f feature_file'
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -30,8 +30,7 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(2),
+        integer_value_sequence(len(predicate_dict)), integer_value_sequence(2),
        integer_value_sequence(len(label_dict))
    ]

@@ -40,8 +39,12 @@ def get_batch_size(yeild_data):
    return len(yeild_data[0])


-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, 
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+@provider(
+    init_hook=hook,
+    should_shuffle=True,
+    calc_batch_size=get_batch_size,
+    can_over_batch_size=False,
+    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_name):
    with open(file_name, 'r') as fdata:
        for line in fdata:
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -47,7 +47,6 @@ if not is_predict:
            w = line.strip()
            predicate_dict[w] = i

-
    if is_test:
        train_list_file = None

@@ -57,9 +56,11 @@ if not is_predict:
        test_list=test_list_file,
        module='dataprovider',
        obj='process',
-        args={'word_dict': word_dict,
+        args={
+            'word_dict': word_dict,
            'label_dict': label_dict,
-              'predicate_dict': predicate_dict })
+            'predicate_dict': predicate_dict
+        })

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)
@@ -77,24 +78,16 @@ mark_dim = 5
 hidden_dim = 512
 depth = 8

-
-
 ########################### Optimizer #######################################

-
 settings(
    batch_size=150,
    learning_method=MomentumOptimizer(momentum=0),
    learning_rate=2e-2,
    regularization=L2Regularization(8e-4),
    is_async=False,
-    model_average=ModelAverage(average_window=0.5,
-                               max_average_window=10000),
-                               
-)
-
-
-
+    model_average=ModelAverage(
+        average_window=0.5, max_average_window=10000), )

 ####################################### network ##############################
 #8 features and 1 target
@@ -108,22 +101,28 @@ ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
 ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
 mark = data_layer(name='mark_data', size=mark_dict_len)

-
 if not is_predict:
    target = data_layer(name='target', size=label_dict_len)

-
 default_std = 1 / math.sqrt(hidden_dim) / 3.0

 emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
 std_0 = ParameterAttribute(initial_std=0.)
 std_default = ParameterAttribute(initial_std=default_std)

-predicate_embedding = embedding_layer(size=word_dim, input=predicate, param_attr=ParameterAttribute(name='vemb',initial_std=default_std))
-mark_embedding = embedding_layer(name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
+predicate_embedding = embedding_layer(
+    size=word_dim,
+    input=predicate,
+    param_attr=ParameterAttribute(
+        name='vemb', initial_std=default_std))
+mark_embedding = embedding_layer(
+    name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)

 word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para) for x in word_input]
+emb_layers = [
+    embedding_layer(
+        size=word_dim, input=x, param_attr=emb_para) for x in word_input
+]
 emb_layers.append(predicate_embedding)
 emb_layers.append(mark_embedding)

@@ -131,14 +130,18 @@ hidden_0 = mixed_layer(
    name='hidden0',
    size=hidden_dim,
    bias_attr=std_default,
-    input=[ full_matrix_projection(input=emb, param_attr=std_default ) for emb in emb_layers ])
-
+    input=[
+        full_matrix_projection(
+            input=emb, param_attr=std_default) for emb in emb_layers
+    ])

 mix_hidden_lr = 1e-3
 lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
-hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
+hidden_para_attr = ParameterAttribute(
+    initial_std=default_std, learning_rate=mix_hidden_lr)

-lstm_0 = lstmemory(name='lstm0',
+lstm_0 = lstmemory(
+    name='lstm0',
    input=hidden_0,
    act=ReluActivation(),
    gate_act=SigmoidActivation(),
@ -149,18 +152,21 @@ lstm_0 = lstmemory(name='lstm0',
 #stack L-LSTM and R-LSTM with direct edges
 input_tmp = [hidden_0, lstm_0]

-
 for i in range(1, depth):

-    mix_hidden = mixed_layer(name='hidden'+str(i),
+    mix_hidden = mixed_layer(
+        name='hidden' + str(i),
        size=hidden_dim,
        bias_attr=std_default,
-                             input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
-                                    full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
-                                   ]
-                             )
-
-    lstm = lstmemory(name='lstm'+str(i),
+        input=[
+            full_matrix_projection(
+                input=input_tmp[0], param_attr=hidden_para_attr),
+            full_matrix_projection(
+                input=input_tmp[1], param_attr=lstm_para_attr)
+        ])
+
+    lstm = lstmemory(
+        name='lstm' + str(i),
        input=mix_hidden,
        act=ReluActivation(),
        gate_act=SigmoidActivation(),
@ -171,44 +177,42 @@ for i in range(1, depth):

    input_tmp = [mix_hidden, lstm]

-feature_out = mixed_layer(name='output',
+feature_out = mixed_layer(
+    name='output',
    size=label_dict_len,
    bias_attr=std_default,
-                          input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
-                                 full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
-                                ],
-                          )
-
-
+    input=[
+        full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)
+    ], )

 if not is_predict:
-    crf_l = crf_layer( name = 'crf',
+    crf_l = crf_layer(
+        name='crf',
        size=label_dict_len,
        input=feature_out,
        label=target,
-                       param_attr=ParameterAttribute(name='crfw',initial_std=default_std, learning_rate=mix_hidden_lr)
-
-                      )
-
+        param_attr=ParameterAttribute(
+            name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr))

-    crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
+    crf_dec_l = crf_decoding_layer(
+        name='crf_dec_l',
        size=label_dict_len,
        input=feature_out,
        label=target,
-                                   param_attr=ParameterAttribute(name='crfw')
-                                       )
-
+        param_attr=ParameterAttribute(name='crfw'))

    eval = sum_evaluator(input=crf_dec_l)

    outputs(crf_l)

 else:
-    crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
+    crf_dec_l = crf_decoding_layer(
+        name='crf_dec_l',
        size=label_dict_len,
        input=feature_out,
-                                   param_attr=ParameterAttribute(name='crfw')
-                                       )
+        param_attr=ParameterAttribute(name='crfw'))

    outputs(crf_dec_l)
-
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@ -26,7 +26,8 @@ UNK_IDX = 0


 class Prediction():
-    def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
+    def __init__(self, train_conf, dict_file, model_dir, label_file,
+                 predicate_dict_file):
        """
        train_conf: trainer configure.
        dict_file: word dictionary file name.
@ -44,24 +45,17 @@ class Prediction():
        len_pred = len(self.predicate_dict)

        conf = parse_config(
-            train_conf,
-            'dict_len=' + str(len_dict) + 
-            ',label_len=' + str(len_label) +
-            ',pred_len=' + str(len_pred) +
-            ',is_predict=True')
+            train_conf, 'dict_len=' + str(len_dict) + ',label_len=' +
+            str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True')
        self.network = swig_paddle.GradientMachine.createFromConfigProto(
            conf.model_config)
        self.network.loadParameters(model_dir)

        slots = [
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), 
-            integer_value_sequence(len_pred),
-            integer_value_sequence(2)
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_pred), integer_value_sequence(2)
        ]
        self.converter = DataProviderConverter(slots)

@ -78,6 +72,7 @@ class Prediction():

        for line_count, line in enumerate(open(predicate_dict_file, 'r')):
            self.predicate_dict[line.strip()] = line_count
+
    def get_data(self, data_file):
        """
        Get input data of paddle format.
@ -90,7 +85,8 @@ class Prediction():
                sen_len = len(words)

                word_slot = [self.dict.get(w, UNK_IDX) for w in words]
-                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
+                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)
+                                  ] * sen_len
                ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
                ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
                ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
@ -123,7 +119,8 @@ class Prediction():


 def option_parser():
-    usage = ("python predict.py -c config -w model_dir " 
+    usage = (
+        "python predict.py -c config -w model_dir "
        "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
    parser = OptionParser(usage="usage: %s [options]" % usage)
    parser.add_option(
@ -187,7 +184,8 @@ def main():
    output_file = options.output_file

    swig_paddle.initPaddle("--use_gpu=0")
-    predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
+    predict = Prediction(train_conf, dict_file, model_path, label_file,
+                         predict_dict_file)
    predict.predict(data_file, output_file)


--- a/doc_cn/cluster/k8s/start_paddle.py
+++ b/doc_cn/cluster/k8s/start_paddle.py
@ -19,7 +19,6 @@ import socket
 import os
 import argparse

-
 # configuration for cluster
 API = "/api/v1/namespaces/"
 JOBSELECTOR = "labelSelector=job-name="
@ -145,8 +144,8 @@ def startPaddle(idMap={}, train_args_dict=None):


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                     description='simple tool for k8s')
+    parser = argparse.ArgumentParser(
+        prog="start_paddle.py", description='simple tool for k8s')
    args, train_args_list = parser.parse_known_args()
    train_args = refine_unknown_args(train_args_list)
    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@ -15,8 +15,8 @@ limitations under the License. */
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"

-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "Internal.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"

 std::vector<int> GradientMachine::defaultParamTypes = {
    PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@ -16,14 +16,13 @@ limitations under the License. */

 #include "PaddleAPI.h"

-#include <vector>
 #include <algorithm>
+#include <vector>

 template <typename T1, typename T2>
 void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
  dest->resize(src.size());
-  std::transform(src.begin(),
-                 src.end(),
-                 dest->begin(),
-                 [](T1 t) { return static_cast<T2>(t); });
+  std::transform(src.begin(), src.end(), dest->begin(), [](T1 t) {
+    return static_cast<T2>(t);
+  });
 }
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "PaddleAPI.h"
 #include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/math/CpuSparseMatrix.h"
-#include <iostream>
 #include <cstring>
+#include <iostream>
+#include "PaddleAPI.h"
+#include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/math/SparseMatrix.h"

 struct MatrixPrivate {
  std::shared_ptr<paddle::Matrix> mat;
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@ -16,8 +16,8 @@ limitations under the License. */

 #include <stddef.h>
 #include <stdint.h>
-#include <string>
 #include <stdexcept>
+#include <string>
 #include <vector>
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/TypeDefs.h"
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "PaddleAPI.h"
 #include "paddle/parameter/Parameter.h"
+#include "PaddleAPI.h"

 struct ParameterPrivate {
  std::shared_ptr<paddle::Parameter> sharedPtr;
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
 #include "paddle/parameter/ParameterOptimizer.h"
-#include "Internal.h"
 #include <algorithm>
+#include "Internal.h"
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"

 struct ParameterOptimizerPrivate {
  std::unique_ptr<paddle::ParameterOptimizer> optimizer;
@ -36,10 +36,7 @@ struct ParameterTraverseCallbackPrivate {
             size_t sparseId) {
    std::vector<paddle::VectorPtr> real_vecs;
    real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(),
-                   vecs.end(),
-                   real_vecs.begin(),
-                   [](Vector* v) {
+    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
      if (v) {
        return *(paddle::VectorPtr*)(v->getSharedPtr());
      } else {
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
+#include <iterator>
+#include <sstream>
+#include <vector>
 #include "PaddleAPI.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/Flags.h"
-#include <vector>
-#include <sstream>
-#include <algorithm>
-#include <iterator>

 // used to represent partial sequence
 struct Path {
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@ -16,12 +16,12 @@ limitations under the License. */
 #include "PaddleAPIPrivate.h"

 #include <stdlib.h>
-#include <memory>
 #include <atomic>
+#include <memory>

+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/trainer/ParamUtil.h"
 #include "paddle/trainer/Trainer.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/trainer/TrainerInternal.h"
 #include "paddle/utils/Flags.h"

--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@ -14,16 +14,16 @@ limitations under the License. */

 #include "PaddleAPI.h"

-#include "paddle/utils/Util.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Excepts.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"

 #include <fenv.h>
+#include <algorithm>
 #include <iostream>
 #include <iterator>
-#include <algorithm>

 void initPaddle(int argc, char** argv) {
  paddle::initMain(argc, argv);
--- a/paddle/api/test/testMatrix.py
+++ b/paddle/api/test/testMatrix.py
@ -104,7 +104,8 @@ class TestMatrix(unittest.TestCase):
    def test_numpy(self):
        numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
        m = swig_paddle.Matrix.createDenseFromNumpy(numpy_mat)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
+        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
+                         numpy_mat.shape)
        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
        for a, e in zip(m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
            self.assertAlmostEqual(a, e)
--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@ -104,7 +104,6 @@ class TestVector(unittest.TestCase):
        for i in xrange(len(v)):
            self.assertTrue(util.doubleEqual(vdata[i], i / 100.0))

-
    def testCpuNumpy(self):
        numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False)
@ -137,7 +136,6 @@ class TestVector(unittest.TestCase):
        for n, v in zip(numpy_arr, vecData):
            self.assertTrue(util.doubleEqual(n, v))

-
    def testCopyFromNumpy(self):
        vec = swig_paddle.Vector.createZero(1, False)
        arr = np.array([1.3, 3.2, 2.4], dtype="float32")
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@ -223,9 +223,9 @@ typedef struct {

 #ifdef __NVCC__

-#include "paddle/utils/Logging.h"
-#include "hl_cuda.h"
 #include "cuda_runtime.h"
+#include "hl_cuda.h"
+#include "paddle/utils/Logging.h"

 extern __thread bool g_sync_flag;
 extern __thread cudaStream_t default_stream;
--- a/Show More
+++ b/Show More