Merge remote-tracking branch 'upstream/develop' into windows/build

7 years ago · 30849d1f20
parent 36cd18b549 6224e61fd9
commit 30849d1f20
19 changed files with 773 additions and 65 deletions
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@ -109,7 +109,8 @@ function(op_library TARGET)

    # Define operators that don't need pybind here.
    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
-"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
+"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
+"fusion_transpose_flatten_concat_op")
        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
            set(pybind_flag 1)
        endif()
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -116,7 +116,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)

-cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto)
+cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
    shape_inference data_transform lod_tensor profiler transfer_scope_cache)

--- a/paddle/fluid/framework/transfer_scope_cache.cc
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
@ -17,16 +17,28 @@
 namespace paddle {
 namespace framework {

+// Holds all the transfer scope across the process.
 std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
-  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
+  typedef std::unordered_map<size_t, Scope*> map_t;
+  thread_local std::unique_ptr<map_t> x(new map_t);
  return *x;
 }

+// Holds all the transfer scope for this thread.
 std::unordered_set<Scope*>& global_transfer_scope_cache() {
-  thread_local auto* x = new std::unordered_set<Scope*>;
+  typedef std::unordered_set<Scope*> set_t;
+  thread_local std::unique_ptr<set_t> x(new set_t);
  return *x;
 }

+// Try to create a transfer scope. If one cached scope has match the
+// requirement, just return that one.
+// Inputs:
+// @type0: the source kernel type.
+// @type1: the target kernel type.
+// @scope: the execution scope of this op.
+// Returns: A scope used to hold the transfer data across the different kernel
+// type.
 Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
                              const Scope* scope) {
  Scope* new_scope{nullptr};
@ -46,27 +58,5 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
  return new_scope;
 }

-void RemoveKidsFromTransferScopeCache(Scope* scope) {
-  auto it = global_transfer_scope_cache().find(scope);
-  if (it != global_transfer_scope_cache().end()) {
-    global_transfer_scope_cache().erase(it);
-  }
-  for (auto* s : scope->kids()) {
-    auto it = global_transfer_scope_cache().find(s);
-    if (it != global_transfer_scope_cache().end()) {
-      global_transfer_scope_cache().erase(it);
-    }
-  }
-
-  // remove global transfer data cache
-  auto& cache = global_transfer_data_cache();
-  for (auto it = cache.begin(); it != cache.end();) {
-    if (it->second == scope)
-      it = cache.erase(it);
-    else
-      it++;
-  }
-}
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) {

  size_t thread_num = 32;
  size_t sleep_time = 40;
-  size_t extra_time = 2;
+  size_t extra_time = 10;

  // Reserve to perform more tests in the future
  std::vector<std::shared_ptr<Allocator>> allocators;
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@ -1,2 +1,6 @@
 include(operators)
-register_operators()
+register_operators(EXCLUDES fusion_transpose_flatten_concat_op)
+if (WITH_GPU)
+  op_library(fusion_transpose_flatten_concat_op)
+  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
+endif()
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
+                      "Inputs(X) of ConcatOp should be empty.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ConcatOp should not be null.");
+
+    auto ins = ctx->GetInputsDim("X");
+    const size_t n = ins.size();
+    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
+
+    std::vector<int> trans_axis =
+        ctx->Attrs().Get<std::vector<int>>("trans_axis");
+    int flatten_axis = ctx->Attrs().Get<int>("flatten_axis");
+    int concat_axis = ctx->Attrs().Get<int>("concat_axis");
+
+    size_t x_rank = ins[0].size();
+    size_t trans_axis_size = trans_axis.size();
+    PADDLE_ENFORCE_EQ(x_rank, trans_axis_size,
+                      "The input tensor's rank(%d) "
+                      "should be equal to the permutation axis's size(%d)",
+                      x_rank, trans_axis_size);
+
+    auto dims0 =
+        GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0]));
+    std::vector<int> out_dims(dims0);
+    for (size_t i = 1; i < n; i++) {
+      auto dimsi =
+          GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i]));
+      for (int j = 0; j < static_cast<int>(dims0.size()); j++) {
+        if (j == concat_axis) {
+          out_dims[concat_axis] += dimsi[j];
+        } else {
+          PADDLE_ENFORCE_EQ(out_dims[j], dimsi[j],
+                            "After flatting, the %d-th dim should be save "
+                            "except the specify axis.",
+                            j);
+        }
+      }
+    }
+    if (out_dims[concat_axis] < 0) {
+      out_dims[concat_axis] = -1;
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+  }
+};
+
+class TransposeFlattenConcatFusionOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor, tensors with rank up to 6 are supported.")
+        .AsDuplicable();
+    AddOutput("Out", "(Tensor)The output tensor.");
+    AddAttr<std::vector<int>>(
+        "trans_axis",
+        "(vector<int>) A list of values, and the size of the list should be "
+        "the same with the input tensor rank. This operator permutes the input "
+        "tensor's axes according to the values given.");
+    AddAttr<int>("flatten_axis",
+                 "(int)"
+                 "Indicate up to which input dimensions (exclusive) should be"
+                 "flattened to the outer dimension of the output. The value"
+                 "for axis must be in the range [0, R], where R is the rank of"
+                 "the input tensor. When axis = 0, the shape of the output"
+                 "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
+                 "input tensor is (d_0, d_1, ... d_n).");
+    AddAttr<int>("concat_axis",
+                 "The axis along which the input tensors will be concatenated. "
+                 "It should be 0 or 1, since the tensor is 2D after flatting.");
+    AddComment(R"DOC(
+
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fusion_transpose_flatten_concat,
+                  ops::TransposeFlattenConcatFusionOp,
+                  ops::TransposeFlattenConcatFusionOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+
+template <typename T>
+class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    auto odims = out->dims();
+
+    std::vector<int> trans_axis = ctx.Attr<std::vector<int>>("trans_axis");
+    int flatten_axis = ctx.Attr<int>("flatten_axis");
+    int concat_axis = ctx.Attr<int>("concat_axis");
+
+    int rank = ins[0]->dims().size();
+    // use at least 4D in cudnnTransformTensor
+    int max_dim = rank < 4 ? 4 : rank;
+    std::vector<int> stride_x(max_dim, 0);
+    std::vector<int> stride_y(max_dim, 0);
+    std::vector<int> dims_y(max_dim, 0);
+
+    cudnnTensorDescriptor_t in_desc;
+    cudnnTensorDescriptor_t out_desc;
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
+    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+
+    T* odata = out->data<T>();
+    for (size_t k = 0; k < ins.size(); ++k) {
+      auto perm_shape = GetPermuteShape(trans_axis, ins[k]->dims());
+      int osize = 1;
+      auto idims = ins[k]->dims();
+      for (int i = 0; i < rank; i++) {
+        stride_x[i] = 1;
+        for (int j = trans_axis[i] + 1; j < rank; j++) {
+          stride_x[i] *= idims[j];
+        }
+        dims_y[i] = perm_shape[i];
+        osize *= perm_shape[i];
+      }
+      stride_y[rank - 1] = 1;
+      for (int i = rank - 2; i >= 0; i--) {
+        if (((i + 1) == flatten_axis) && (concat_axis == 1)) {
+          stride_y[i] = odims[1];
+        } else {
+          stride_y[i] = stride_y[i + 1] * perm_shape[i + 1];
+        }
+      }
+
+      // Since concat is aftern flatten, the output is 2D tensor.
+      // If concat_axis is 0, each input's permutated tensor is continuous.
+      // If concat_axis is 1, the stride of 0-th dim of each input's
+      // permutated tensor is odims()[1].
+
+      for (int i = rank; i < max_dim; i++) {
+        stride_x[i] = 1;
+        stride_y[i] = 1;
+        dims_y[i] = 1;
+      }
+
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
+
+      CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
+          handle, CudnnDataType<T>::kOne(), in_desc,
+          static_cast<const void*>(ins[k]->data<T>()),
+          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
+      if (concat_axis == 0) {
+        odata += osize;
+      } else {
+        auto flat_shape = GetFlattenShape(flatten_axis, perm_shape);
+        odata += flat_shape[1];
+      }
+    }
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(fusion_transpose_flatten_concat,
+                        ops::TransposeFlattenConcatFusionKernel<float>,
+                        ops::TransposeFlattenConcatFusionKernel<double>);
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+inline std::vector<int32_t> GetPermuteShape(const std::vector<int>& axis,
+                                            const framework::DDim& in_dims) {
+  std::vector<int32_t> out_dims(in_dims.size());
+  for (size_t i = 0; i < axis.size(); i++) {
+    out_dims[i] = in_dims[axis[i]];
+  }
+  return out_dims;
+}
+
+inline std::vector<int32_t> GetFlattenShape(const int axis,
+                                            const std::vector<int>& in_dims) {
+  int64_t outer = 1, inner = 1;
+  for (int i = 0; i < static_cast<int>(in_dims.size()); ++i) {
+    if (i < axis) {
+      outer *= in_dims[i];
+    } else {
+      inner *= in_dims[i];
+    }
+  }
+  std::vector<int32_t> out_shape(2);
+  out_shape[0] = outer;
+  out_shape[1] = inner;
+  return out_shape;
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@ -67,6 +67,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
                      framework::proto::VarType::FP32,
                      "The sparse table only support FP32");
    w_t->Get(ids_t, out_t, true, is_test);
+    out_t->set_lod(ids_t.lod());
  }
 };

--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@ -127,6 +127,9 @@ class SumKernel : public framework::OpKernel<T> {
        math::scatter::MergeAdd<DeviceContext, T> merge_add;
        merge_add(context.template device_context<DeviceContext>(), inputs,
                  out);
+
+        out->SyncIndex();
+
      } else {
        // no data, just set a empty out tensor.
        out->mutable_value()->mutable_data<T>(framework::make_ddim({0}),
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@ -31,6 +31,11 @@ int main(int argc, char** argv) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  new_argv.push_back(
      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
+#elif __clang__
+  new_argv.push_back(
+      strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
+             "mb,allocator_strategy"));
+  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
 #else
  new_argv.push_back(
      strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
--- a/python/paddle/fluid/init.py
+++ b/python/paddle/fluid/init.py
@ -91,6 +91,7 @@ def __bootstrap__():
    """
    import sys
    import os
+    import platform
    from . import core

    in_test = 'unittest' in sys.modules
@ -110,14 +111,17 @@ def __bootstrap__():
        print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)

    os.environ['OMP_NUM_THREADS'] = str(num_threads)
-
+    sysstr = platform.system()
    read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
-        'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy',
+        'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn',
+        'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
+        'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
+        'eager_delete_tensor_gb', 'allocator_strategy',
        'reader_queue_speed_test_mode', 'print_sub_graph_dir'
    ]
+    if 'Darwin' not in sysstr:
+        read_env_flags.append('use_pinned_memory')
+
    if os.name != 'nt':
        read_env_flags.append('warpctc_dir')
        read_env_flags.append('cpu_deterministic')
--- a/python/paddle/fluid/contrib/utils/init.py
+++ b/python/paddle/fluid/contrib/utils/init.py
@ -13,8 +13,10 @@
 # limitations under the License.

 from __future__ import print_function
-
+from . import lookup_table_utils
+from .lookup_table_utils import *
 from . import hdfs_utils
 from .hdfs_utils import *

+__all__ = lookup_table_utils.__all__
 __all__ = hdfs_utils.__all__
--- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py
+++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@ -1698,6 +1698,7 @@ class Program(object):

        p._copy_param_info_from(self)
        p._copy_data_info_from(self)
+        p._copy_dist_param_info_from(self)
        return p

    def _prune(self, targets):
@ -1938,6 +1939,25 @@ class Program(object):
                             "program, with represent the same topology")
        self.global_block()._copy_param_info_from(other.global_block())

+    def _copy_dist_param_info_from(self, other):
+        """
+        Copy the information of distributed information from other program.
+
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("_copy_dist_param_info_from should be invoked with "
+                            "Program")
+        self._is_distributed = other._is_distributed
+        self._is_chief = other._is_chief
+        self._slice_vars_and_attrs = other._slice_vars_and_attrs
+        self._endpoints = other._endpoints
+        self._distributed_lookup_table = other._distributed_lookup_table
+
    def _copy_data_info_from(self, other):
        """
        Copy the information of data variables from other program.
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@ -165,6 +165,7 @@ def save_vars(executor,

        save_vars(
            executor,
+            main_program=main_program,
            dirname=dirname,
            vars=list(filter(predicate, main_program.list_vars())),
            filename=filename)
@ -172,11 +173,18 @@ def save_vars(executor,
        save_program = Program()
        save_block = save_program.global_block()

+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
+            raise TypeError("program should be as Program type or None")
+
        save_var_map = {}
        for each_var in vars:
            # NOTE: don't save the variable which type is RAW
            if each_var.type == core.VarDesc.VarType.RAW:
                continue
+            if each_var.name == main_program._distributed_lookup_table:
+                continue
            new_var = _clone_var_in_block_(save_block, each_var)
            if filename is None:
                save_block.append_op(
@ -198,6 +206,16 @@ def save_vars(executor,
                outputs={},
                attrs={'file_path': os.path.join(dirname, filename)})

+        # if there is lookup table, the trainer 0 will notify all pserver to save.
+        if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
+            lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+            attrs = {}
+            attrs['epmap'] = main_program._endpoints
+            attrs['dir'] = lookup_table_filename
+            attrs['lookup_table'] = main_program._distributed_lookup_table
+            save_block.append_op(
+                type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+
        executor.run(save_program)


@ -379,11 +397,22 @@ def load_vars(executor,
        load_prog = Program()
        load_block = load_prog.global_block()

+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
+            raise TypeError("program should be as Program type or None")
+
+        load_slice_vars = []
+        for each_var in main_program._slice_vars_and_attrs:
+            load_slice_vars.append(each_var[2].name)
+
        load_var_map = {}
        for each_var in vars:
            assert isinstance(each_var, Variable)
            if each_var.type == core.VarDesc.VarType.RAW:
                continue
+            if each_var.name in load_slice_vars:
+                continue
            new_var = _clone_var_in_block_(load_block, each_var)
            if filename is None:
                load_block.append_op(
@ -406,9 +435,6 @@ def load_vars(executor,
                attrs={'file_path': os.path.join(dirname, filename)})
        executor.run(load_prog)

-        if main_program is None:
-            main_program = default_main_program()
-
        # load slice vars on pserver, if have it.
        _load_slice_up_vars(executor, dirname,
                            main_program._slice_vars_and_attrs)
@ -618,13 +644,6 @@ def save_inference_model(dirname,
    if main_program is None:
        main_program = default_main_program()

-    # if there is lookup table, the trainer 0 will notify all pserver to save.
-    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-        _save_lookup_tables_by_notify(executor, lookup_table_filename,
-                                      main_program._distributed_lookup_table,
-                                      main_program._endpoints)
-
    # when a pserver and a trainer running on the same machine, mkdir may conflict
    try:
        os.makedirs(dirname)
@ -642,6 +661,9 @@ def save_inference_model(dirname,
    # it can only be loaded for inference directly. If it's false, the whole
    # original program and related meta are saved so that future usage can be
    # more flexible.
+
+    origin_program = main_program.clone()
+
    if export_for_deployment:
        main_program = main_program.clone()
        global_block = main_program.global_block()
@ -666,8 +688,11 @@ def save_inference_model(dirname,
        with open(model_basename + ".main_program", "wb") as f:
            f.write(main_program.desc.serialize_to_string())

+    main_program._copy_dist_param_info_from(origin_program)
+
    if params_filename is not None:
        params_filename = os.path.basename(params_filename)
+
    save_persistables(executor, dirname, main_program, params_filename)


@ -897,6 +922,9 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
        slice_var = var_tuple[2]
        end = start + slice_var.shape[0]

+        orig_var_name = orig_var.name
+        orig_var.name = "{}.origin".format(orig_var_name)
+
        clone_orig_var = load_block.create_var(
            name=orig_var.name,
            type=orig_var.type,
@ -915,7 +943,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
            type='load',
            inputs={},
            outputs={'Out': [clone_orig_var]},
-            attrs={'file_path': os.path.join(dirname, clone_orig_var.name)})
+            attrs={'file_path': os.path.join(dirname, orig_var_name)})
        load_block.append_op(
            type="slice",
            inputs={'Input': clone_orig_var},
@ -924,6 +952,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
                   'starts': [start],
                   'ends': [end]})
        need_delete_vars.append(clone_orig_var)
+
    load_block.append_op(
        type='delete_var',
        inputs={'X': need_delete_vars}, )
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@ -6972,18 +6972,18 @@ def prelu(x, mode, param_attr=None, name=None):
    """
    Equation:

-        y = \max(0, x) + alpha \min(0, x)
+        y = \max(0, x) + alpha * \min(0, x)

    Args:
        x (Variable): The input tensor.
-	  param_attr(ParamAttr|None): The parameter attribute for the learnable
-                                    weight (alpha).
-        mode (string): The mode for weight sharing
-		       all: all elements share same weight
- 		       channel:elements in a channel share same weight
- 		       element:each element has a weight
-	name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                       weight (alpha).
+        mode (string): The mode for weight sharing. It supports all, channel
+                       and element. all: all elements share same weight
+                       channel:elements in a channel share same weight
+                       element:each element has a weight
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.

    Returns:
        Variable: The output tensor with the same shape as input.
@ -6992,7 +6992,7 @@ def prelu(x, mode, param_attr=None, name=None):

        .. code-block:: python

-         x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
+            x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
            mode = 'channel'
            output = fluid.layers.prelu(x,mode)
    """
--- a/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
@ -0,0 +1,105 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+class TestFusionTransposeFlattenConcationOp(OpTest):
+    def setUp(self):
+        self.init_test_case()
+        self.op_type = "fusion_transpose_flatten_concat"
+
+        ins = []
+        flats = []
+        for i in range(len(self.shapes)):
+            in_shape = self.shapes[i]
+            a = np.random.random(in_shape).astype("float32")
+            ins.append(("x%d" % i, a))
+
+            b = a.transpose(self.trans_axis)
+            flat_shape = (np.prod(b.shape[:self.flatten_axis]),
+                          np.prod(b.shape[self.flatten_axis:]))
+            c = b.reshape(flat_shape)
+            flats.append(c)
+        out = np.concatenate(flats, axis=self.concat_axis)
+
+        self.inputs = {'X': ins}
+        self.attrs = {
+            'trans_axis': list(self.trans_axis),
+            'flatten_axis': self.flatten_axis,
+            'concat_axis': self.concat_axis
+        }
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, 1e-6)
+        else:
+            pass
+
+    def init_test_case(self):
+        self.shapes = [(3, 4, 17, 17), (3, 8, 7, 7), (3, 12, 5, 5)]
+        self.trans_axis = (0, 2, 3, 1)
+        self.flatten_axis = 1
+        self.concat_axis = 1
+
+
+class TestCase1(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 4, 18, 17), (3, 8, 18, 7), (6, 12, 9, 5)]
+        self.trans_axis = (0, 2, 3, 1)
+        self.flatten_axis = 2
+        self.concat_axis = 1
+
+
+class TestCase2(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
+        self.trans_axis = (0, 2, 3, 1)
+        self.flatten_axis = 2
+        self.concat_axis = 0
+
+
+class TestCase3(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
+        self.trans_axis = (0, 3, 2, 1)
+        self.flatten_axis = 1
+        self.concat_axis = 1
+
+
+class TestCase4(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 8, 9, 17), (8, 3, 9, 17), (4, 6, 9, 17)]
+        self.trans_axis = (0, 2, 1, 3)
+        self.flatten_axis = 3
+        self.concat_axis = 1
+
+
+class TestCase5(TestFusionTransposeFlattenConcationOp):
+    def init_test_case(self):
+        self.shapes = [(3, 8, 9, 17, 2), (3, 8, 2, 17, 9), (3, 17, 9, 8, 2)]
+        self.trans_axis = (0, 2, 1, 4, 3)
+        self.flatten_axis = 1
+        self.concat_axis = 1
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@ -644,6 +644,9 @@ in a single call.")
            else:
                recv_inputs.append(single_trainer_var)

+        self._slice_params_and_optimizes = self._get_slice_vars_and_attrs(
+            endpoint)
+
        # step 3
        # Create a union-find data structure from optimize ops,
        # If two ops are connected, we could add these two ops
@ -766,7 +769,7 @@ in a single call.")
                                               grad_to_block_id, merged_var,
                                               lr_ops)

-# dedup grad to ids list
+        # dedup grad to ids list
        grad_to_block_id = list(set(grad_to_block_id))
        # append global ops
        if global_ops:
@ -827,8 +830,8 @@ in a single call.")
            attrs=attrs)

        # add distributed attrs
-        pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs(
-            endpoint)
+        pserver_program._slice_vars_and_attrs = list(
+            self._slice_params_and_optimizes.values())

        pserver_program._sync_with_cpp()
        # save pserver program to generate pserver side startup relatively.
@ -941,12 +944,12 @@ to transpile() call.")
                    outputs={"Out": startup_tmpvar})

        # add slice vars
-        s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint)
+        s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs

        return s_prog

    def _get_slice_vars_and_attrs(self, endpoint):
-        slice_vars_and_attrs = []
+        slice_vars_and_attrs = {}
        block_suffix = "block"
        for param in self.param_grad_ep_mapping[endpoint]["params"]:
            orig_var_name, block_name, _ = self._get_varname_parts(param.name)
@ -960,8 +963,7 @@ to transpile() call.")
            slice_vars = self.param_var_mapping[orig_var_name]
            for slice_var in slice_vars[:block_idx]:
                skip_dim0 += slice_var.shape[0]
-            slice_vars_and_attrs.append([orig_var, skip_dim0, param])
-
+            slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param]
        return slice_vars_and_attrs

    # ====================== private transpiler functions =====================
@ -1662,10 +1664,10 @@ to transpile() call.")
            if key in ["Param", "Grad", "LearningRate"]:
                continue
            var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
+            param_var = new_inputs["Param"]
            # update accumulator variable shape
-            param_shape = new_inputs["Param"].shape
-            new_shape = self._get_optimizer_input_shape(opt_op.type, key,
-                                                        var.shape, param_shape)
+            new_shape = self._get_optimizer_input_shape(
+                opt_op.type, key, var.shape, param_var.shape)
            tmpvar = pserver_block.create_var(
                name=var.name,
                persistable=var.persistable,
@ -1673,6 +1675,13 @@ to transpile() call.")
                shape=new_shape)
            new_inputs[key] = tmpvar

+            # var shape been changed
+            if new_shape != var.shape:
+                slice_var_args = self._slice_params_and_optimizes[
+                    param_var.name]
+                self._slice_params_and_optimizes[
+                    var.name] = [var, slice_var_args[1], tmpvar]
+
        # change output's ParamOut variable
        outputs = self._get_output_map_from_op(
            self.origin_program.global_block().vars, opt_op)