Merge branch 'develop' of github.com:PaddlePaddle/Paddle into rpc_complete_interface

7 years ago · d4f51218ef
parent be772741cf 5ebbfaa8b9
commit d4f51218ef
184 changed files with 5219 additions and 1677 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -136,6 +136,12 @@ else()
    set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
 # MKL-only tuning switch. The MKL_SPLIT_GEMM option is declared only when
 # WITH_MKL is enabled and defaults to OFF.
 if(WITH_MKL)
  option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
  if (MKL_SPLIT_GEMM)
    # Directory-scoped definition: exposes PADDLE_MKL_SPLIT_GEMM to the sources
    # compiled from this directory and the subdirectories added after it.
    add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
  endif()
 endif()
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
    if (WITH_MKL AND AVX2_FOUND)
--- a/README.md
+++ b/README.md
@ -18,7 +18,21 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
+
 ### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
 pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
 pip install paddlepaddle-gpu==0.14.0.post87
 # Linux GPU cuda8cudnn5
 pip install paddlepaddle-gpu==0.14.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
 ## Features
--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function train() {
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function clock_to_seconds() {
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function train() {
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function clock_to_seconds() {
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function train() {
--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function train() {
--- a/benchmark/tensorflow/image/run.sh
+++ b/benchmark/tensorflow/image/run.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function test() {
--- a/benchmark/tensorflow/image/run_multi.sh
+++ b/benchmark/tensorflow/image/run_multi.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function test() {
--- a/benchmark/tensorflow/rnn/run.sh
+++ b/benchmark/tensorflow/rnn/run.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function test() {
--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function test() {
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@ -4,6 +4,14 @@ set(tmp_version "HEAD")
 set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
 set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
 while ("${PADDLE_VERSION}" STREQUAL "")
  # Check current branch name
  execute_process(
    COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
    OUTPUT_VARIABLE GIT_BRANCH_NAME
    RESULT_VARIABLE GIT_BRANCH_RESULT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
  if (NOT ${GIT_BRANCH_RESULT})
    execute_process(
      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
@ -11,6 +19,8 @@ while ("${PADDLE_VERSION}" STREQUAL "")
      RESULT_VARIABLE GIT_RESULT
      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
    if (NOT ${GIT_RESULT})
      # Check if current branch is release branch
      if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
        # Check the tag is a correct version
        if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
          # if no tag was found, set PADDLE_VERSION to latest
@ -20,10 +30,17 @@ while ("${PADDLE_VERSION}" STREQUAL "")
        else()  # otherwise, get the previous git tag name.
          set(tmp_version "${GIT_TAG_NAME}~1")
        endif()
      else() # otherwise, we always set PADDLE_VERSION to latest
        set(PADDLE_VERSION "latest")
      endif()
    else()
      set(PADDLE_VERSION "0.0.0")
      message(WARNING "Cannot add paddle version from git tag")
    endif()
  else()
    set(PADDLE_VERSION "0.0.0")
    message(WARNING "Cannot add paddle version for wrong git branch result")
  endif()
 endwhile()
 add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
--- a/doc/fluid/design/ir/draft.md
+++ b/doc/fluid/design/ir/draft.md
@ -0,0 +1,89 @@
 ## Motivation
 There is a ```gap``` between the ```Program``` defined by
 user and the ```Executable``` that can be scheduled
 efficiently on heterogeneous hardware, either locally
 or distributedly.
 Usually, the ```gap``` is bridged by
 * A series of transformations with defined order.
 * These transformations usually involve
 ```insert, delete, clustering, split, dependency analysis```.
 * Has a simple way to verify and debug each transformation.
 * Flexible to add, remove or customize transformations to fit
 the requirements of various algorithms (models) and hardware scenarios.
 Some other events also push us to a better unified pattern.
 * The deep learning framework is built around the concepts of graphs.
 To leverage tools such as compilation (e.g. TVM and nGraph) or
 cross-framework conversion (e.g. ONNX), we also need an intermediate
 representation that can be connected to the rest of the ecosystem.
 We need a unified pattern to naturally support the requirements
 described above. The pattern should fit both training, inference
 and other offline serialized model transformations.
 Learning from LLVM and other deep learning frameworks, we draft the
 design below.
 ## Design
 ### Major Concepts
 #### Node
 ```Node``` represents an operation that performs some computation or
 a variable that is input or output of operation.
 ```Node```s are connected to other ```Node```s via inputs and outputs.
 Other properties (maybe device placement information) can be added
 to ```Node``` in the future if it's a
 common requirement of many other ```Pass```es. Otherwise, it should live
 in a ```Node``` wrapper class that is private to some ```Pass``` or be
 a local member of a ```Pass```.
 #### Graph
 ```Graph``` contains a list of ```Node```s, which are connected to
 each other via inputs and outputs.
 TODO: Better definitions for the graph.
 ```Graph``` can also contain ```Attribute```s. ```Attribute```s
 can be ``any`` thing. For example, it can be a list of "wrapper"
 nodes. The ```wrapper``` nodes compose ```Node```s and provide
 helper method for execution or transformation. ```Attribute```
 can also contain other things that describe some properties of
 the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed
 across ```Pass```. However, it should be used with care.
 #### Pass
 ```Pass``` represents a transformation of ```Graph```. Its input
 is a ```Graph``` and its output is also a ```Graph```. For example,
 a ```Pass``` can simply print out the ```Graph```. A ```Pass```
 can also fuse some ```Graph```'s ```Node```s.
 #### Optimize
 ```Optimize``` contains a series of ```Pass``` with defined order.
 ```Optimize``` transforms a ```Graph``` that only contains raw
 modeling logic to a ```Graph``` that can be run efficiently while
 maintaining the original modeling logic.
 ### Optimize Process
 * Program is first converted to Graph.
 * Graph goes through a series of Pass
 * Graph is transformed from raw model logic to a
 form that is efficient to execute.
 Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
--- a/doc/v2/faq/parameter/index_en.rst
+++ b/doc/v2/faq/parameter/index_en.rst
@ -1,5 +1,198 @@
-#################
+##################
-Parameter Setting
+Parameter Settings
-#################
+##################
-TBD
+.. contents::
 1. How to Choose the Learning Rate of SGD Algorithm
 ------------------------------------------------------------
 An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time.
 Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. We examine the convergence of the training by estimating the minimum cost at a constant output of the model.
 If the cost of the training process is significantly higher than the cost of the output, then we judge that the training does not converge. For example, if we have a three-class problem and use multi-class-cross-entropy as the cost, the ratio of 0, 1, and 2 in the data will be :code:`0.2, 0.5, 0.3`. The minimum cost thus will be :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is greater than this number after training a pass (or even before), then the training may not be converged and the learning rate should be reduced.
 2. How to Implement Learning Rate Annealing
 ------------------------------------------------
 We use the Adam algorithm as an example. Set the parameters of :code:`learning_rate_schedule` in the corresponding optimization algorithm as follows:
 .. code-block:: python
     Optimizer = paddle.optimizer.Adam(
         Learning_rate=1e-3,
         Learning_rate_decay_a=0.5,
         Learning_rate_decay_b=0.75,
         Learning_rate_schedule="poly",)
 PaddlePaddle currently supports 8 learning rate schedules. The 8 learning rate schedules and their corresponding learning rates are calculated as follows:
 * "constant"
   Lr = learning_rate
 * "poly"
   Lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
   Variable :code:`num_samples_processed` is the number of trained samples.
 * "caffe_poly"
   Lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
 * "exp"
   Lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
 * "discexp"
   Lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
 * "linear"
   Lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
 * "manual"
   This is a learning rate annealing method that is segmented by the number of trained samples. When using this learning rate schedule, we modify the learning rate attenuation factor piecewise function by changing the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
   .. code-block:: python
       Optimizer = paddle.optimizer.Adam(
           Learning_rate=1e-3,
           Learning_rate_schedule="manual",
           Learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
   In this example, when the number of trained samples is less than or equal to 1000, the learning rate is :code:`1e-3*1.0`; when the number of trained samples is greater than 1000 and less than or equal to 2000, the learning rate is :code:`1e-3*0.9`; when the number of trained samples is greater than 2000, the learning rate is :code:`1e-3*0.8`.
 * "pass_manual"
   This is a learning rate annealing method that piecewisely pick values according to the number of trained passes. When using this learning rate schedule, we set the learning rate attenuation factor piecewise function by the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
   .. code-block:: python
       Optimizer = paddle.optimizer.Adam(
           Learning_rate=1e-3,
           Learning_rate_schedule="pass_manual",
           Learning_rate_args="1:1.0,2:0.9,3:0.8",)
   In this example, when the number of trained passes is less than or equal to 1, the learning rate is :code:`1e-3*1.0`; when the number of trained passes is greater than 1 and less than or equal to 2, the learning rate is :code:`1e-3*0.9`; when the number of trained passes is greater than 2, the learning rate is :code:`1e-3*0.8`.
 3. How to Initialize Parameters
 ----------------------------------------
 By default, PaddlePaddle initializes parameters with an average of 0 and a standard deviation of :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization method does not produce bad results under normal circumstances. If users want to customize the initialization method, PaddlePaddle provides two ways to initialize the parameters:
 * Gaussian distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
 * Uniform distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
 For example, to set a full connection layer parameter initialization mode and bias initialization mode, you can use the following code:
 .. code-block:: python
     Hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
                       Bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
 The above code initializes the bias to 1.0 and initializes the parameters to a uniform distribution over :code:`[-1.0, 1.0]`.
 4. How to Share Parameters
 ----------------------------------------
 PaddlePaddle's parameters use :code:`name` as the ID. Parameters with the same name will share parameters. We can set the name of the parameters using :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, we can make the parameters to be shared use the same :code:`ParamAttr` object.
 A simple fully connected network has its configuration of parameter sharing as follows \:
 .. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
 Here :code:`hidden_a` and :code:`hidden_b` have the same parameter and bias. The two inputs of the softmax layer also use the same parameter :code:`softmax_param`.
 5. How to Load Pre-training Parameters
 ----------------------------------------
 * For layers that load pre-training parameters, set :code:`is_static = True` so that the parameters of that layer remain unchanged during the training process. Take the embedding layer as an example, the code is as follows:
 .. code-block:: python
     Emb_para = paddle.attr.Param(name='emb', is_static=True)
     Paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
 * Load pre-training parameters from the model file into :code:`numpy.array`. After creating the parameters, load the pre-training parameters using :code:`parameters.set()`. The first 16 bytes of the model parameter file saved by PaddlePaddle are the header information, so the user must load the data into :code:`numpy.array` starting from the 17th byte. Take the embedding layer as an example, the code is as follows:
 .. code-block:: python
     Def load_parameter(file_name, h, w):
         With open(file_name, 'rb') as f:
             F.read(16) # skip header.
             Return np.fromfile(f, dtype=np.float32).reshape(h, w)
     Parameters = paddle.parameters.create(my_cost)
     Parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
 6. Format of the Stored Parameter and How to Convert the File to Plain Text
 --------------------------------------------------------------------------------
 The model parameter file saved by PaddlePaddle consists of 16 bytes of header information and network parameters. In the header information, the first four bytes show PaddlePaddle's version information. The user should fill in with 0s. The next four bytes represent the number of bytes occupied by each parameter. If the saved network parameter is a float type, the number is four; if it is a double, the number is eight. The remaining eight bytes represent the total number of saved parameters.
 When restoring the model parameters saved by PaddlePaddle back to plain text, we use the corresponding data type :code:`numpy.array` to load specific network parameters. At this time, you can skip the header information of the PaddlePaddle model parameter file. If not specified to compile with a precision for double in PaddlePaddle, then the parameter file will be calculated with a precision for float, and the argument will be stored as a float. In this case, when using :code:`numpy.array`, generally we set :code:`dtype=float32`. An example is as follows:
 .. code-block:: python
     Def read_parameter(fname, width):
         s = open(fname).read()
         # skip header
         Vec = np.fromstring(s[16:], dtype=np.float32)
         # width is the size of the corresponding layer
         Np.savetxt(fname + ".csv", vec.reshape(width, -1),
                 Fmt="%.6f", delimiter=",")
 When the plaintext parameters are converted into PaddlePaddle loadable model parameters, the header information is constructed first, then the network parameters are written. The following code converts the randomly generated matrix into model parameters that can be loaded by PaddlePaddle:
 .. code-block:: python
     Def gen_rand_param(param_file, width, height, need_trans):
         Np.random.seed()
         Header = struct.pack("iil", 0, 4, height * width)
         Param = np.float32(np.random.rand(height, width))
         With open(param_file, "w") as fparam:
             Fparam.write(header + param.tostring())
 7. A Protocol Message Rejected Because of its Large Size
 ------------------------------------------------------------
 If you are training NLP related models, and the following error occurs:
 .. code-block:: bash
     [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit( ) in google/protobuf/io/coded_stream.h.
     F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
 The possible reason is that one of the args passed to the dataprovider is too large, which is usually caused by directly passing a large dictionary. A wrongly defined :code:`define_py_data_sources2` call looks like this:
 .. code-block:: python
      Src_dict = dict()
      For line_count, line in enumerate(open(src_dict_path, "r")):
         Src_dict[line.strip()] = line_count
      Define_py_data_sources2(
         Train_list,
         Test_list,
         Module="dataprovider",
         Obj="process",
         Args={"src_dict": src_dict})
 The solution is to pass the address of the dictionary as args to the dataprovider, and then load the dictionary according to the address in the dataprovider. Change the :code:`define_py_data_sources2` call to:
 .. code-block:: python
      Define_py_data_sources2(
         Train_list,
         Test_list,
         Module="dataprovider",
         Obj="process",
         Args={"src_dict_path": src_dict_path})
 The full source code can be found in the `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ example.
--- a/doc/v2/howto/rnn/hierarchical_layer_en.rst
+++ b/doc/v2/howto/rnn/hierarchical_layer_en.rst
@ -1,4 +1,89 @@
-Layers supporting hierarchical sequence as input
+###################################################
-================================================
+Layers that Support Hierarchical Sequences as Input
-
+###################################################
-TBD
+ 
 .. contents::
 Overview 
 ========
 A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input or a 0-level sequence. A sentence made up of words is a single-level sequence; a number of sentences make up a paragraph, which is a double-level sequence.
 A double-level sequence is a nested sequence where each element is a single-level sequence. This is a very flexible way of organizing data that helps us construct some complex input information.
 We can define non-sequences, single-level sequences, and double-level sequences at the following levels.
 + 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle;
 + Single-level sequence: multiple elements arranged in a row; each element is a 0-level sequence. The order of elements is an important input information;
 + Double-level sequence: multiple elements arranged in a row; each element is a single-level sequence called a subseq of a double-level sequence, and each element of the subseq is a 0-level sequence.
 In PaddlePaddle, the following layers accept double-level sequences as input and perform corresponding calculations.
 `pooling`
 =========
 The use of pooling is as follows:
 .. code-block:: python
         Seq_pool = pooling(input=layer,
                            Pooling_type=pooling.Max(),
                            Agg_level=AggregateLevel.TO_SEQUENCE)
 - `pooling_type` currently supports two types: pooling.Max() and pooling.Avg().
 - When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
   - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence 
   - Input: a double-level sequence or a single-level sequence
   - Output: a 0-level sequence which is the average (or maximum) of the entire input sequence (single or double)
 - When `agg_level=AggregateLevel.TO_SEQUENCE`:
   - Effect: a double-level sequence will be transformed into a single-level sequence
   - Input: a double-level sequence
   - Output: a single-level sequence where each element of the sequence is the average (or maximum) value of each subseq element of the original double-level sequence.
 `last_seq` and `first_seq`
 ========================================
 An example of using `last_seq` is as follows (usage of `first_seq` is similar).
 .. code-block:: python
         Last = last_seq(input=layer,
                         Agg_level=AggregateLevel.TO_SEQUENCE)
 - When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
   - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence
   - Input: a double-level sequence or a single-level sequence
   - Output: a 0-level sequence, which is the last or the first element of the input sequence (double or single level).
 - When `agg_level=AggregateLevel.TO_SEQUENCE`:
   - Effect: a double-level sequence will be transformed into a single-level sequence
   - Input: a double-level sequence
   - Output: a single-level sequence in which each element is the last (or first) element of each subseq in a double-level sequence.
 `expand`
 ========
 The use of expand is as follows.
 .. code-block:: python
         Ex = expand(input=layer1,
                     Expand_as=layer2,
                     Expand_level=ExpandLevel.FROM_NO_SEQUENCE)
 - When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default):
   - Effect: a 0-level sequence is extended to a single-level sequence or a double-level sequence
   - Input: layer1 must be a 0-level sequence to be extended; layer2 can be a single-level sequence or a double-level sequence that provides the extended length information
   - Output: a single-level sequence or a double-level sequence; the type of the output sequence and the number of elements contained in the sequence are the same as layer2. If the output is a single-level sequence, each element of the single-level sequence will be a copy of the layer1 element. If the output is a double-level sequence, each element in the double-level sequence will be a copy of the layer1 element
 - When `expand_level=ExpandLevel.FROM_SEQUENCE`:
   - Effect: a single-level sequence is extended to a double-level sequence
   - Input: layer1 must be a single-level sequence to be extended; layer2 must be a double-level sequence providing extended length information
   - Output: a double-level sequence with the same number of elements as that of layer2. It is required that the number of elements in the single-level sequence be the same as the number of subseq in the double-level sequences. The i-th element of the single-level sequence (the 0-level sequence) is expanded into a single-level sequence that constitutes the i-th subseq of the output, the double-level sequence.
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -179,13 +179,13 @@ paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, default
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
-paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, None, 1, True))
+paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@ -208,9 +208,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.BlockGuard.__init__ ArgSpec(args=['self', 'main_program'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.BlockGuardWithCompletion.__init__ ArgSpec(args=['self', 'rnn'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.WhileGuard.__init__ ArgSpec(args=['self', 'while_op'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@ -340,6 +337,26 @@ paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps
 paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32'))
 paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
 paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -1,4 +1,5 @@
 add_subdirectory(details)
 add_subdirectory(ir)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
@ -93,7 +94,7 @@ else()
 endif()
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@ -1,12 +1,11 @@
-cc_library(var_handle SRCS var_handle.cc DEPS place)
+cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
@ -35,7 +34,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
 cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@ -23,10 +23,14 @@ namespace framework {
 namespace details {
 #ifdef PADDLE_WITH_CUDA
-AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places,
                                     const platform::NCCLContextMap *ctxs)
-    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+    : OpHandleBase(node),
      local_scopes_(local_scopes),
      places_(places),
      nccl_ctxs_(ctxs) {
  if (nccl_ctxs_) {
    for (auto &p : places_) {
      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
@ -34,9 +38,10 @@ AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
  }
 }
 #else
-AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif
 void AllReduceOpHandle::RunImpl() {
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@ -30,11 +30,11 @@ namespace details {
 struct AllReduceOpHandle : public OpHandleBase {
 #ifdef PADDLE_WITH_CUDA
-  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *ctxs);
 #else
-  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places);
 #endif
  std::string Name() const override;
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@ -35,10 +35,13 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
 public:
 #ifdef PADDLE_WITH_CUDA
-  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *nccl_ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+      : OpHandleBase(node),
        local_scopes_(local_scopes),
        places_(places),
        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
@ -46,9 +49,9 @@ struct BroadcastOpHandle : public OpHandleBase {
    }
  }
 #else
-  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places)
-      : local_scopes_(local_scopes), places_(places) {}
+      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif
  std::string Name() const override;
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@ -96,48 +96,61 @@ struct TestBroadcastOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("input");
    std::unique_ptr<ir::Node> n(
        new ir::Node("node0", ir::Node::Type::kOperation));
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
-          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+                                             nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
-          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+                                             nccl_ctxs_.get()));
 #else
-      op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+      op_handle_.reset(
          new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
 #endif
    }
-    auto* in_var_handle =
+    std::unique_ptr<ir::Node> v(
-        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+        new ir::Node("node1", ir::Node::Type::kVariable));
    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
                                        gpu_list_[input_scope_idx]);
    vars_.emplace_back(in_var_handle);
    op_handle_->AddInput(in_var_handle);
    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+
    std::unique_ptr<ir::Node> v2(
        new ir::Node("node2", ir::Node::Type::kVariable));
    vars_.emplace_back(new DummyVarHandle(v2.get()));
    DummyVarHandle* dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
-    dummy_var_handle->generated_op_ = nullptr;
+    dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(dummy_var_handle);
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      if (!use_gpu_) {
        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      }
-      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+      std::unique_ptr<ir::Node> v3(
          new ir::Node("node3", ir::Node::Type::kVariable));
      VarHandle* out_var_handle =
          new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
    }
    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    std::unique_ptr<ir::Node> v4(
        new ir::Node("node4", ir::Node::Type::kVariable));
    vars_.emplace_back(new DummyVarHandle(v4.get()));
    DummyVarHandle* out_dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
-    out_dummy_var_handle->generated_op_ = nullptr;
+    out_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddOutput(out_dummy_var_handle);
  }
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@ -19,9 +19,10 @@
 namespace paddle {
 namespace framework {
 namespace details {
-ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
                                         platform::Place place)
-    : op_(framework::OpRegistry::CreateOp(op_desc)),
+    : OpHandleBase(node),
      op_(framework::OpRegistry::CreateOp(*node->Op())),
      scope_(scope),
      place_(place) {}
@ -35,8 +36,8 @@ void ComputationOpHandle::RunImpl() {
 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
  bool need_wait =
-      in_var && in_var->generated_op_ &&
+      in_var && in_var->GeneratedOp() &&
-      in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_];
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
  return need_wait;
 }
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@ -28,8 +28,7 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
 public:
-  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
                      platform::Place place);
  std::string Name() const override;
--- a/Show More
+++ b/Show More