Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into prior_box

8 years ago · f7c0ad9d35
parent 142f632886 c80af6ffaa
commit f7c0ad9d35
28 changed files with 923 additions and 129 deletions
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/place.h"
 DECLARE_bool(do_memory_benchmark);
 DEFINE_bool(check_nan_inf, false,
            "Checking whether operator produce NAN/INF or not. It will be "
            "extremely slow so please use this flag wisely.");
@ -117,6 +118,10 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
    VLOG(3) << op->DebugStringEx(local_scope);
    op->Run(*local_scope, place_);
    if (FLAGS_do_memory_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
              << memory::memory_usage(place_);
    }
    if (FLAGS_check_nan_inf) {
      for (auto& vname : op->OutputVars(true)) {
        auto* var = local_scope->FindVar(vname);
@ -130,6 +135,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  if (create_vars && create_local_scope) {
    scope->DeleteScope(local_scope);
  }
  if (FLAGS_do_memory_benchmark) {
    VLOG(2) << "-------------------------------------------------------";
    VLOG(2) << "Memory used after deleting local scope: "
            << memory::memory_usage(place_);
    VLOG(2) << "-------------------------------------------------------";
  }
 }
 }  // namespace framework
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@ -20,6 +20,10 @@ limitations under the License. */
 #include "paddle/framework/threadpool.h"
 #include "paddle/string/printf.h"
 DEFINE_bool(do_memory_benchmark, false,
            "Doing memory benchmark. It will make deleting scope synchronized, "
            "and add some memory usage logs");
 namespace paddle {
 namespace framework {
@ -88,8 +92,12 @@ void Scope::DeleteScope(Scope* scope) {
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
-  // Make delete async.
+  // When making memory benchmark on Fluid, we have to delete scope sync.
-  Async([scope] { delete scope; });
+  if (FLAGS_do_memory_benchmark) {
    delete scope;
  } else {
    Async([scope] { delete scope; });
  }
 }
 void Scope::Rename(const std::string& origin_name,
--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@ -39,6 +39,11 @@ N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
 calculated by %s
 )DOC",
                               comment.type, comment.equation));
    AddAttr<int>("axis",
                 "(int, default -1). The start dimension index "
                 "for broadcasting Y onto X.")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
  }
 };
@ -95,11 +100,5 @@ REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
 REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
 REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
 REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
 REGISTER_LOGICAL_OP(greater_than, "Out = X > Y");
 REGISTER_LOGICAL_KERNEL(greater_than, CPU,
                        paddle::operators::GreaterThanFunctor);
 REGISTER_LOGICAL_OP(greater_equal, "Out = X >= Y");
 REGISTER_LOGICAL_KERNEL(greater_equal, CPU,
                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_OP(equal, "Out = X == Y");
 REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
@ -16,8 +16,4 @@ limitations under the License. */
 REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
 REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
 REGISTER_LOGICAL_KERNEL(greater_than, CUDA,
                        paddle::operators::GreaterThanFunctor);
 REGISTER_LOGICAL_KERNEL(greater_equal, CUDA,
                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@ -16,6 +16,7 @@ limitations under the License. */
 #include <math.h>
 #include <type_traits>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/elementwise_op_function.h"
 #include "paddle/platform/transform.h"
 namespace paddle {
@ -33,18 +34,6 @@ struct LessEqualFunctor {
  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
 };
 template <typename T>
 struct GreaterThanFunctor {
  using ELEM_TYPE = T;
  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; }
 };
 template <typename T>
 struct GreaterEqualFunctor {
  using ELEM_TYPE = T;
  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; }
 };
 template <typename T>
 struct EqualFunctor {
  using ELEM_TYPE = T;
@ -65,14 +54,7 @@ class CompareOpKernel
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    using T = typename Functor::ELEM_TYPE;
-    auto* x = context.Input<framework::Tensor>("X");
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context);
    auto* y = context.Input<framework::Tensor>("Y");
    auto* out = context.Output<framework::Tensor>("Out");
    Functor binary_func;
    platform::Transform<DeviceContext> trans;
    trans(context.template device_context<DeviceContext>(), x->data<T>(),
          x->data<T>() + x->numel(), y->data<T>(),
          out->mutable_data<bool>(context.GetPlace()), binary_func);
  }
 };
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@ -176,14 +176,15 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext>
 };
 #endif
-template <typename Functor, typename T, typename DeviceContext>
+template <typename Functor, typename T, typename DeviceContext,
          typename OutType = T>
 class TransformFunctor {
 public:
  TransformFunctor(const framework::Tensor* x, const framework::Tensor* y,
                   framework::Tensor* z, const DeviceContext& ctx, Functor func)
      : x_(x->data<T>()),
        y_(y->data<T>()),
-        z_(z->mutable_data<T>(ctx.GetPlace())),
+        z_(z->mutable_data<OutType>(ctx.GetPlace())),
        nx_(x->numel()),
        ctx_(ctx),
        func_(func) {}
@ -208,7 +209,7 @@ class TransformFunctor {
 private:
  const T* x_;
  const T* y_;
-  T* z_;
+  OutType* z_;
  int64_t nx_;
  const DeviceContext& ctx_;
  Functor func_;
@ -364,15 +365,16 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
  }
 }
-template <typename Functor, typename DeviceContext, typename T>
+template <typename Functor, typename DeviceContext, typename T,
          typename OutType = T>
 void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
  using Tensor = framework::Tensor;
  auto* x = ctx.Input<Tensor>("X");
  auto* y = ctx.Input<Tensor>("Y");
  auto* z = ctx.Output<Tensor>("Out");
-  z->mutable_data<T>(ctx.GetPlace());
+  z->mutable_data<OutType>(ctx.GetPlace());
-  TransformFunctor<Functor, T, DeviceContext> functor(
+  TransformFunctor<Functor, T, DeviceContext, OutType> functor(
      x, y, z, ctx.template device_context<DeviceContext>(), Functor());
  auto x_dims = x->dims();
--- a/python/paddle/v2/dataset/init.py
+++ b/python/paddle/v2/dataset/init.py
@ -24,11 +24,23 @@ import conll05
 import uci_housing
 import sentiment
 import wmt14
 import wmt16
 import mq2007
 import flowers
 import voc2012
 __all__ = [
-    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment'
+    'mnist',
-    'uci_housing', 'wmt14', 'mq2007', 'flowers', 'voc2012'
+    'imikolov',
    'imdb',
    'cifar',
    'movielens',
    'conll05',
    'sentiment'
    'uci_housing',
    'wmt14',
    'wmt16',
    'mq2007',
    'flowers',
    'voc2012',
 ]
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@ -25,8 +25,12 @@ import glob
 import cPickle as pickle
 __all__ = [
-    'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader',
+    'DATA_HOME',
-    'convert'
+    'download',
    'md5file',
    'split',
    'cluster_files_reader',
    'convert',
 ]
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
@ -58,12 +62,15 @@ def md5file(fname):
    return hash_md5.hexdigest()
-def download(url, module_name, md5sum):
+def download(url, module_name, md5sum, save_name=None):
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
-    filename = os.path.join(dirname, url.split('/')[-1])
+    filename = os.path.join(dirname,
                            url.split('/')[-1]
                            if save_name is None else save_name)
    retry = 0
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
@ -196,9 +203,11 @@ def convert(output_path, reader, line_count, name_prefix):
    Convert data from reader to recordio format files.
    :param output_path: directory in which output files will be saved.
-    :param reader: a data reader, from which the convert program will read data instances.
+    :param reader: a data reader, from which the convert program will read
                   data instances.
    :param name_prefix: the name prefix of generated files.
-    :param max_lines_to_shuffle: the max lines numbers to shuffle before writing.
+    :param max_lines_to_shuffle: the max lines numbers to shuffle before
                                 writing.
    """
    assert line_count >= 1
--- a/python/paddle/v2/dataset/tests/wmt16_test.py
+++ b/python/paddle/v2/dataset/tests/wmt16_test.py
@ -0,0 +1,66 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import paddle.v2.dataset.wmt16
 import unittest
 class TestWMT16(unittest.TestCase):
    def checkout_one_sample(self, sample):
        # train data has 3 field: source language word indices,
        # target language word indices, and target next word indices.
        self.assertEqual(len(sample), 3)
        # test start mark and end mark in source word indices.
        self.assertEqual(sample[0][0], 0)
        self.assertEqual(sample[0][-1], 1)
        # test start mask in target word indices
        self.assertEqual(sample[1][0], 0)
        # test en mask in target next word indices
        self.assertEqual(sample[2][-1], 1)
    def test_train(self):
        for idx, sample in enumerate(
                paddle.v2.dataset.wmt16.train(
                    src_dict_size=100000, trg_dict_size=100000)()):
            if idx >= 10: break
            self.checkout_one_sample(sample)
    def test_test(self):
        for idx, sample in enumerate(
                paddle.v2.dataset.wmt16.test(
                    src_dict_size=1000, trg_dict_size=1000)()):
            if idx >= 10: break
            self.checkout_one_sample(sample)
    def test_val(self):
        for idx, sample in enumerate(
                paddle.v2.dataset.wmt16.validation(
                    src_dict_size=1000, trg_dict_size=1000)()):
            if idx >= 10: break
            self.checkout_one_sample(sample)
    def test_get_dict(self):
        dict_size = 1000
        word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
        self.assertEqual(len(word_dict), dict_size)
        self.assertEqual(word_dict[0], "<s>")
        self.assertEqual(word_dict[1], "<e>")
        self.assertEqual(word_dict[2], "<unk>")
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@ -25,12 +25,20 @@ import gzip
 import paddle.v2.dataset.common
 from paddle.v2.parameters import Parameters
-__all__ = ['train', 'test', 'build_dict', 'convert']
+__all__ = [
-
+    'train',
-URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
+    'test',
    'get_dict',
    'convert',
 ]
 URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
                'cslm_joint_paper/data/dev+test.tgz')
 MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
-# this is a small set of data for test. The original data is too large and will be add later.
+# this is a small set of data for test. The original data is too large and
-URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
+# will be add later.
 URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
             'wmt_shrinked_data/wmt14.tgz')
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
 # BLEU of this trained model is 26.92
 URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
@ -42,8 +50,8 @@ UNK = "<unk>"
 UNK_IDX = 2
-def __read_to_dict__(tar_file, dict_size):
+def __read_to_dict(tar_file, dict_size):
-    def __to_dict__(fd, size):
+    def __to_dict(fd, size):
        out_dict = dict()
        for line_count, line in enumerate(fd):
            if line_count < size:
@ -58,19 +66,19 @@ def __read_to_dict__(tar_file, dict_size):
            if each_item.name.endswith("src.dict")
        ]
        assert len(names) == 1
-        src_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        src_dict = __to_dict(f.extractfile(names[0]), dict_size)
        names = [
            each_item.name for each_item in f
            if each_item.name.endswith("trg.dict")
        ]
        assert len(names) == 1
-        trg_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        trg_dict = __to_dict(f.extractfile(names[0]), dict_size)
        return src_dict, trg_dict
 def reader_creator(tar_file, file_name, dict_size):
    def reader():
-        src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
+        src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
        with tarfile.open(tar_file, mode='r') as f:
            names = [
                each_item.name for each_item in f
@ -152,7 +160,7 @@ def get_dict(dict_size, reverse=True):
    # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
    # else reverse = true, return dict = {'001':'a', '002':'b', ...}
    tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
+    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
    if reverse:
        src_dict = {v: k for k, v in src_dict.items()}
        trg_dict = {v: k for k, v in trg_dict.items()}
--- a/python/paddle/v2/dataset/wmt16.py
+++ b/python/paddle/v2/dataset/wmt16.py
--- a/python/paddle/v2/fluid/init.py
+++ b/python/paddle/v2/fluid/init.py
@ -86,7 +86,9 @@ def __bootstrap__():
    os.environ['OMP_NUM_THREADS'] = str(num_threads)
-    read_env_flags = ['use_pinned_memory', 'check_nan_inf']
+    read_env_flags = [
        'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark'
    ]
    if core.is_compile_gpu():
        read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
    core.init_gflags([sys.argv[0]] +
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
@ -14,6 +14,7 @@
 import functools
 import layers
 import framework
 from . import core
 __all__ = [
@ -66,7 +67,7 @@ def error_clip_callback(block, context):
 class BaseGradientClipAttr(object):
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
        raise NotImplementedError()
    def create_operators(self, param, grad):
@ -74,7 +75,7 @@ class BaseGradientClipAttr(object):
 class NullGradientClipAttr(BaseGradientClipAttr):
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
        pass
    def create_operators(self, param, grad):
@ -91,7 +92,7 @@ class GradientClipByValue(BaseGradientClipAttr):
        self.max = max
        self.min = min
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
        pass
    def create_operators(self, param, grad):
@ -99,19 +100,93 @@ class GradientClipByValue(BaseGradientClipAttr):
        return param, new_grad
 class GradientClipByNorm(BaseGradientClipAttr):
    def __init__(self, clip_norm):
        self.clip_norm = clip_norm
    def process_context(self, context, param, grad):
        pass
    def create_operators(self, param, grad):
        new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
        return param, new_grad
 class GradientClipByGlobalNorm(BaseGradientClipAttr):
    def __init__(self, clip_norm, group_name="default_group"):
        if not isinstance(group_name, basestring):
            raise TypeError("'group_name' must be a basestring.")
        self.clip_norm = clip_norm
        self.group_name = group_name
    def process_context(self, context, param, grad):
        if self.group_name not in context:
            context[self.group_name] = []
            context[self.group_name + "_clip_value"] = self.clip_norm
            context[self.group_name + "_clip"] = layers.fill_constant(
                shape=[1], dtype="float32", value=self.clip_norm)
        else:
            if not self.clip_norm == context[self.group_name + "_clip_value"]:
                raise ValueError(
                    "All parameters' 'clip_norm' of a same group should be the same"
                )
        local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0))
        context[self.group_name].append(local_norm_var)
        self.context = context
    def create_operators(self, param, grad):
        group_scale_name = self.group_name + "_scale"
        if group_scale_name not in self.context:
            group_norm_var = layers.sums(input=self.context[self.group_name])
            layers.sqrt(x=group_norm_var, out=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = layers.elementwise_div(
                x=clip_var,
                y=layers.elementwise_max(
                    x=clip_var, y=group_norm_var))
            assert group_scale_var.shape == (1L, )
            self.context[group_scale_name] = group_scale_var
        new_grad = layers.elementwise_mul(
            x=grad, y=self.context[group_scale_name])
        return param, new_grad
 def gradient_clip_by_global_norm(clip_norm,
                                 param_list=None,
                                 group_name="default_group",
                                 program=None):
    if program is None:
        program = framework.default_main_program()
    if param_list is None:
        param_list = program.block(0).all_parameters()
    if all(isinstance(elem, basestring) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )
    for param in param_list:
        param.gradient_clip_attr = GradientClipByGlobalNorm(clip_norm,
                                                            group_name)
 def append_gradient_clip_ops(param_grad):
    context = dict()
    create_op_callbacks = []
    for p, g in param_grad:
-        clip_attr = getattr(p, 'clip_attr', NullGradientClipAttr())
+        clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
        if clip_attr is None:
            clip_attr = NullGradientClipAttr()
        if not isinstance(clip_attr, BaseGradientClipAttr):
            raise TypeError(
-                "clip attribute should be an instance of BaseGradientClippingAttr"
+                "clip attribute should be an instance of BaseGradientClipAttr")
            )
-        clip_attr.process_context(context=context, p_g=param_grad)
+        clip_attr.process_context(context=context, param=p, grad=g)
        create_op_callbacks.append(
            functools.partial(
                clip_attr.create_operators, param=p, grad=g))
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@ -780,7 +780,7 @@ class Block(object):
                trainable=p.trainable,
                optimize_attr=p.optimize_attr,
                regularizer=p.regularizer,
-                clip_attr=p.clip_attr,
+                gradient_clip_attr=p.gradient_clip_attr,
                error_clip=p.error_clip,
                name=v.name)
            self.vars[new_p.name] = new_p
@ -948,7 +948,7 @@ class Parameter(Variable):
        self.regularizer = kwargs.get('regularizer', None)
-        self.clip_attr = kwargs.get('clip_attr', None)
+        self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 # program is a global instance.
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@ -11,22 +11,41 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
-from ..layer_helper import LayerHelper, unique_name
+from layer_function_generator import autodoc
 from ..framework import Program, Variable, Operator
 from .. import core
 from tensor import assign, fill_constant
-import contextlib
+from .. import core
-from ..registry import autodoc
+from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
 __all__ = [
-    'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard',
+    'split_lod_tensor',
-    'BlockGuardWithCompletion', 'StaticRNNMemoryLink', 'WhileGuard', 'While',
+    'merge_lod_tensor',
-    'lod_rank_table', 'max_sequence_len', 'topk', 'lod_tensor_to_array',
+    'BlockGuard',
-    'array_to_lod_tensor', 'increment', 'array_write', 'create_array',
+    'BlockGuardWithCompletion',
-    'less_than', 'array_read', 'shrink_memory', 'array_length', 'IfElse',
+    'StaticRNNMemoryLink',
-    'DynamicRNN', 'ConditionalBlock', 'StaticRNN', 'reorder_lod_tensor_by_rank',
+    'WhileGuard',
-    'ParallelDo', 'Print'
+    'While',
    'lod_rank_table',
    'max_sequence_len',
    'topk',
    'lod_tensor_to_array',
    'array_to_lod_tensor',
    'increment',
    'array_write',
    'create_array',
    'less_than',
    'array_read',
    'shrink_memory',
    'array_length',
    'IfElse',
    'DynamicRNN',
    'ConditionalBlock',
    'StaticRNN',
    'reorder_lod_tensor_by_rank',
    'ParallelDo',
    'Print',
 ]
@ -1458,7 +1477,7 @@ class DynamicRNN(object):
                method))
-@autodoc
+@autodoc()
 def reorder_lod_tensor_by_rank(x, rank_table):
    helper = LayerHelper('reorder_lod_tensor_by_rank', **locals())
    helper.is_instance('x', Variable)
--- a/python/paddle/v2/fluid/layers/device.py
+++ b/python/paddle/v2/fluid/layers/device.py
@ -15,14 +15,14 @@
 All util layers.
 """
-from ..layer_helper import LayerHelper
+from layer_function_generator import autodoc
 from ..framework import unique_name
-from ..registry import autodoc
+from ..layer_helper import LayerHelper
 __all__ = ['get_places']
-@autodoc
+@autodoc()
 def get_places(device_count=None, device_type=None):
    helper = LayerHelper('get_places', **locals())
    out_places = helper.create_variable(name=unique_name(helper.name + ".out"))
--- a/python/paddle/v2/fluid/layers/layer_function_generator.py
+++ b/python/paddle/v2/fluid/layers/layer_function_generator.py
@ -13,17 +13,19 @@
 # limitations under the License.
 import re
 import cStringIO
 import warnings
 import functools
-import inspect
+import warnings
 from .. import proto
-import proto.framework_pb2 as framework_pb2
+framework_pb2 = proto.framework_pb2
-from framework import OpProtoHolder, Variable, Program, Operator
+
-from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
+from ..framework import OpProtoHolder, Variable
 from ..layer_helper import LayerHelper
 __all__ = [
    'deprecated',
-    'register_layer',
+    'generate_layer_fn',
    'autodoc',
 ]
@ -96,7 +98,7 @@ def _generate_doc_string_(op_proto):
    return buf.getvalue()
-def register_layer(op_type):
+def generate_layer_fn(op_type):
    """Register the Python layer for an Operator.
    Args:
@ -207,7 +209,10 @@ def deprecated(func_or_class):
    return func_wrapper
-def autodoc(func):
+def autodoc(comment=""):
-    func.__doc__ = _generate_doc_string_(OpProtoHolder.instance().get_op_proto(
+    def __impl__(func):
-        func.__name__))
+        func.__doc__ = _generate_doc_string_(OpProtoHolder.instance(
-    return func
+        ).get_op_proto(func.__name__)) + comment
        return func
    return __impl__
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@ -11,8 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from layer_function_generator import generate_layer_fn
 from ..registry import register_layer
 __activations__ = [
    'sigmoid',
@ -46,21 +45,11 @@ __activations__ = [
 ]
 __all__ = [
-    'mean',
+    'mean', 'mul', 'reshape', 'scale', 'transpose',
-    'mul',
+    'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
-    'reshape',
+    'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min',
-    'scale',
+    'clip', 'clip_by_norm', 'sequence_softmax'
    'transpose',
    'sigmoid_cross_entropy_with_logits',
    'elementwise_add',
    'elementwise_div',
    'elementwise_sub',
    'elementwise_mul',
    'elementwise_max',
    'elementwise_min',
    'clip',
    'sequence_softmax',
 ] + __activations__
 for _OP in set(__all__):
-    globals()[_OP] = register_layer(_OP)
+    globals()[_OP] = generate_layer_fn(_OP)
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
@ -25,13 +25,13 @@ class ParamAttr(object):
                 learning_rate=1.0,
                 regularizer=None,
                 trainable=True,
-                 clip=None):
+                 gradient_clip=None):
        self.name = name
        self.initializer = initializer
        self.learning_rate = learning_rate
        self.regularizer = regularizer
        self.trainable = trainable
-        self.clip = clip
+        self.gradient_clip = gradient_clip
    def set_default_initializer(self, initializer):
        if initializer is None:
@ -77,7 +77,7 @@ class ParamAttr(object):
            },
            'regularizer': self.regularizer,
            'trainable': self.trainable,
-            'clip_attr': self.clip
+            'gradient_clip_attr': self.gradient_clip
        }
        if with_initializer:
            kwargs['initializer'] = self.initializer
--- a/python/paddle/v2/fluid/tests/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
@ -6,3 +6,4 @@ endforeach()
 add_subdirectory(book)
 add_subdirectory(book_distribute)
 add_subdirectory(book_memory_optimization)
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@ -27,7 +27,7 @@ hidden1 = fluid.layers.fc(input=image,
                          act='relu',
                          param_attr=fluid.ParamAttr(
                              regularizer=regularizer,
-                              clip=fluid.clip.ClipByValue(10)))
+                              gradient_clip=fluid.clip.ClipByValue(10)))
 hidden2 = fluid.layers.fc(input=hidden1,
                          size=64,
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt
@ -0,0 +1,11 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 list(REMOVE_ITEM TEST_OPS test_memopt_image_classification_train)
 py_test(test_memopt_image_classification_train_resnet SRCS test_memopt_image_classification_train.py ARGS resnet)
 py_test(test_memopt_image_classification_train_vgg SRCS test_memopt_image_classification_train.py ARGS vgg)
 # default test
 foreach(src ${TEST_OPS})
    py_test(${src} SRCS ${src}.py)
 endforeach()
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@ -0,0 +1,44 @@
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 y_predict = fluid.layers.fc(input=x, size=1, act=None)
 y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 cost = fluid.layers.square_error_cost(input=y_predict, label=y)
 avg_cost = fluid.layers.mean(x=cost)
 sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
 sgd_optimizer.minimize(avg_cost)
 # memopt_program = fluid.default_main_program()
 memopt_program = fluid.memory_optimize(fluid.default_main_program())
 BATCH_SIZE = 200
 train_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.uci_housing.train(), buf_size=500),
    batch_size=BATCH_SIZE)
 place = fluid.CPUPlace()
 feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
 exe = fluid.Executor(place)
 exe.run(fluid.default_startup_program())
 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
    fluid.io.save_persistables(exe, "./fit_a_line.model/")
    fluid.io.load_persistables(exe, "./fit_a_line.model/")
    for data in train_reader():
        avg_loss_value, = exe.run(memopt_program,
                                  feed=feeder.feed(data),
                                  fetch_list=[avg_cost])
        if avg_loss_value[0] < 10.0:
            exit(0)  # if avg cost less than 10.0, we think our code is good.
 exit(1)
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@ -0,0 +1,133 @@
 from __future__ import print_function
 import sys
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 def resnet_cifar10(input, depth=32):
    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
        tmp = fluid.layers.conv2d(
            input=input,
            filter_size=filter_size,
            num_filters=ch_out,
            stride=stride,
            padding=padding,
            act=None,
            bias_attr=False)
        return fluid.layers.batch_norm(input=tmp, act=act)
    def shortcut(input, ch_in, ch_out, stride):
        if ch_in != ch_out:
            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
        else:
            return input
    def basicblock(input, ch_in, ch_out, stride):
        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
        short = shortcut(input, ch_in, ch_out, stride)
        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
        tmp = block_func(input, ch_in, ch_out, stride)
        for i in range(1, count):
            tmp = block_func(tmp, ch_out, ch_out, 1)
        return tmp
    assert (depth - 2) % 6 == 0
    n = (depth - 2) / 6
    conv1 = conv_bn_layer(
        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
    pool = fluid.layers.pool2d(
        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
    return pool
 def vgg16_bn_drop(input):
    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')
    conv1 = conv_block(input, 64, 2, [0.3, 0])
    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2
 classdim = 10
 data_shape = [3, 32, 32]
 images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
 label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 net_type = "vgg"
 if len(sys.argv) >= 2:
    net_type = sys.argv[1]
 if net_type == "vgg":
    print("train vgg net")
    net = vgg16_bn_drop(images)
 elif net_type == "resnet":
    print("train resnet")
    net = resnet_cifar10(images, 32)
 else:
    raise ValueError("%s network is not supported" % net_type)
 predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
 cost = fluid.layers.cross_entropy(input=predict, label=label)
 avg_cost = fluid.layers.mean(x=cost)
 optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 opts = optimizer.minimize(avg_cost)
 accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
 # memopt_program = fluid.default_main_program()
 memopt_program = fluid.memory_optimize(fluid.default_main_program())
 BATCH_SIZE = 128
 PASS_NUM = 1
 train_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.cifar.train10(), buf_size=128 * 10),
    batch_size=BATCH_SIZE)
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
 feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
 exe.run(fluid.default_startup_program())
 for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
        loss, acc = exe.run(memopt_program,
                            feed=feeder.feed(data),
                            fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)
        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
            pass_acc))
        # this model is slow, so if we can train two mini batch, we think it works properly.
        exit(0)
 exit(1)
--- a/python/paddle/v2/fluid/tests/test_compare_op.py
+++ b/python/paddle/v2/fluid/tests/test_compare_op.py
@ -38,8 +38,6 @@ def create_test_class(op_type, typename, callback):
 for _type_name in {'float32', 'float64', 'int32', 'int64'}:
    create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
    create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
    create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b)
    create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b)
    create_test_class('equal', _type_name, lambda _a, _b: _a == _b)
 if __name__ == '__main__':
--- a/Show More
+++ b/Show More