Merge branch 'develop' of github.com:baidu/Paddle into feature/refine_get_places_op

add_depthwiseConv_op_gpu
Yang Yu
commit e5e206e2b6

@ -0,0 +1,114 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import argparse

import matplotlib.pyplot as plt


def parse_args():
    parser = argparse.ArgumentParser('Parse Log')
    parser.add_argument(
        '--file_path', '-f', type=str, help='the path of the log file')
    parser.add_argument(
        '--sample_rate',
        '-s',
        type=float,
        default=1.0,
        help='the rate to take samples from log')
    parser.add_argument(
        '--log_period', '-p', type=int, default=1, help='the period of log')
    args = parser.parse_args()
    return args


def parse_file(file_name):
    loss = []
    error = []
    with open(file_name) as f:
        for line in f:
            line = line.strip()
            if not line.startswith('pass'):
                continue
            line_split = line.split(' ')
            if len(line_split) != 5:
                continue
            loss_str = line_split[2][:-1]
            cur_loss = float(loss_str.split('=')[-1])
            loss.append(cur_loss)
            err_str = line_split[3][:-1]
            cur_err = float(err_str.split('=')[-1])
            error.append(cur_err)
    accuracy = [1.0 - err for err in error]
    return loss, accuracy


def sample(metric, sample_rate):
    # Down-sample by keeping every interval-th point, where
    # interval = int(1 / sample_rate); a rate of 1.0 keeps everything.
    interval = int(1.0 / sample_rate)
    if interval > len(metric):
        return metric[:1]
    num = len(metric) // interval
    idx = [interval * i for i in range(num)]
    metric_sample = [metric[i] for i in idx]
    return metric_sample


def plot_metric(metric,
                batch_id,
                graph_title,
                line_style='b-',
                line_label='y',
                line_num=1):
    plt.figure()
    plt.title(graph_title)
    if line_num == 1:
        plt.plot(batch_id, metric, line_style, label=line_label)
    else:
        for i in range(line_num):
            plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
    plt.xlabel('batch')
    plt.ylabel(graph_title)
    plt.legend()
    plt.savefig(graph_title + '.jpg')
    plt.close()


def main():
    args = parse_args()
    assert 0. < args.sample_rate <= 1.0, \
        "The sample rate should be in the range (0, 1]."
    loss, accuracy = parse_file(args.file_path)
    batch = [args.log_period * i for i in range(len(loss))]

    batch_sample = sample(batch, args.sample_rate)
    loss_sample = sample(loss, args.sample_rate)
    accuracy_sample = sample(accuracy, args.sample_rate)

    plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
    plot_metric(
        accuracy_sample,
        batch_sample,
        'accuracy',
        line_style='g-',
        line_label='accuracy')


if __name__ == '__main__':
    main()
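For reference, parse_file above assumes each metric line splits into exactly five space-separated fields, with the loss and error carried as key=value, pairs in fields 2 and 3. The log format itself is not shown in this diff, so the line below is a hypothetical example that merely satisfies those expectations:

# Hypothetical log line; only the field layout matters to parse_file.
sample_line = "pass-0, batch=40, loss=2.345, error=0.123, time=1.2"

fields = sample_line.split(' ')
assert sample_line.startswith('pass') and len(fields) == 5
print(float(fields[2][:-1].split('=')[-1]))  # 2.345 (the loss)
print(float(fields[3][:-1].split('=')[-1]))  # 0.123 (the error)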

@ -63,7 +63,7 @@ ExternalProject_Add(
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)

@ -105,8 +105,7 @@ static void BuildVar(const std::string& param_name,
TEST(Operator, CPUtoGPU) {
using namespace paddle::framework;
using namespace paddle::platform;
ASSERT_EQ(InitDevices({"CPU", "GPU:0"}), true);
InitDevices();
paddle::framework::Scope scope;
paddle::platform::CPUPlace cpu_place;

@ -35,7 +35,7 @@ const std::string kFetchOpType = "fetch";
Executor::Executor(const platform::Place& place) : place_(place) {}
void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
if (var_type == proto::VarDesc::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarDesc::SELECTED_ROWS) {

@ -45,7 +45,5 @@ class Executor {
const platform::Place place_;
};
void CreateTensor(Variable* var, proto::VarDesc::VarType var_type);
} // namespace framework
} // namespace paddle

@ -87,7 +87,11 @@ class GradOpDescMakerBase {
auto onames = this->Output(name);
ret_val.reserve(onames.size());
std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
GradVarName);
[this](const std::string& fwd_var_name) -> std::string {
auto g_name = GradVarName(fwd_var_name);
(*this->grad_to_var_)[g_name] = fwd_var_name;
return g_name;
});
return ret_val;
}
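The change above replaces the bare GradVarName transform with a lambda that also records the gradient-to-forward name mapping in grad_to_var_ as a side effect. A minimal Python sketch of that bookkeeping, assuming Paddle's usual convention of appending an "@GRAD" suffix to the forward variable name:

# Sketch only: mirrors the lambda's bookkeeping, not the C++ API.
grad_to_var = {}

def grad_var_name(fwd_var_name):
    g_name = fwd_var_name + "@GRAD"     # assumed suffix convention
    grad_to_var[g_name] = fwd_var_name  # recorded as a side effect
    return g_name

print([grad_var_name(n) for n in ["Out"]])  # ['Out@GRAD']
print(grad_to_var)                          # {'Out@GRAD': 'Out'}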

@ -40,40 +40,23 @@ void InitGflags(std::vector<std::string> &argv) {
});
}
bool InitDevices(const std::vector<std::string> &devices) {
// device format
// CPU
// GPU:1
// TODO(dzhwinter) : add device format annotation for users.
void InitDevices() {
/* Init all available devices by default */
std::vector<platform::Place> places;
for (auto &device : devices) {
auto p = string::Piece(device);
if (string::HasPrefix(p, "CPU")) {
places.emplace_back(platform::CPUPlace());
} else if (string::HasPrefix(p, "GPU")) {
places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
auto pos = string::RFind(p, ':', string::Piece::npos);
auto number = device.substr(pos + 1);
places.emplace_back(platform::CUDAPlace(std::stoi(number)));
int count = platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
#else
LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option";
#endif
} else {
return false;
}
}
if (std::find_if(places.begin(), places.end(),
[&](const platform::Place &place) {
return platform::is_cpu_place(place);
}) == places.end()) {
places.emplace_back(platform::CPUPlace());
LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
}
platform::DeviceContextPool::Init(places);
// framework::UseALL();
return true;
}
void InitGLOG(const std::string &prog_name) {
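Reading the old and new lines of this hunk together: InitDevices no longer parses device strings such as "GPU:1"; it registers a CPUPlace unconditionally and, in CUDA builds, one CUDAPlace per device reported by GetCUDADeviceCount. A Python sketch of the new control flow (names are illustrative, not the C++ API):

# Illustrative sketch of the new InitDevices() behavior.
def init_devices(with_cuda, cuda_device_count):
    places = [("CPU",)]  # a CPU place is always created
    if with_cuda:
        # Enumerate every visible GPU instead of parsing "GPU:n" strings.
        places += [("GPU", i) for i in range(cuda_device_count)]
    return places  # the real code hands these to DeviceContextPool::Init

print(init_devices(True, 2))  # [('CPU',), ('GPU', 0), ('GPU', 1)]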

@ -24,7 +24,7 @@ void InitGflags(std::vector<std::string> &argv);
void InitGLOG(const std::string &prog_name);
bool InitDevices(const std::vector<std::string> &devices);
void InitDevices();
} // namespace framework
} // namespace paddle

@ -14,18 +14,13 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/platform/device_context.h"
TEST(Init, InitDevices) {
TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
std::vector<std::string> ds1 = {"CPU"};
ASSERT_EQ(InitDevices(ds1), true);
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_CUDA
std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
ASSERT_EQ(InitDevices(ds2), true);
// test re-init
std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
ASSERT_EQ(InitDevices(ds3), true);
#endif
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_GE(pool.size(), 1U);
}

@ -44,9 +44,19 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
}
std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
PADDLE_ENFORCE(platform::is_cpu_place(t.place()));
PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
if (!platform::is_cpu_place(t.place())) {
LoDTensor tt;
framework::Copy(t, platform::CPUPlace(), &tt);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(t.place());
dev_ctx.Wait();
os << tt;
return os;
}
os << "dim: " << t.dims() << "\n";
os << "lod: " << t.lod() << "\n";
@ -211,38 +221,23 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
}
// TODO(tonyyang-svail): make this function support LoD
std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
const std::vector<platform::Place> places) const {
check_memory_size();
// PADDLE_ENFORCE(lod().empty() || (lod().size() == 1 && lod()[0].empty())
// , "Disable parallel lod for now");
PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
PADDLE_ENFORCE(dims()[0] % places.size() == 0,
"Batch size should be divisible by the number of places");
std::vector<LoDTensor> lods;
for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
size_t begin = place_idx * dims()[0] / places.size();
size_t end = (place_idx + 1) * dims()[0] / places.size();
auto src = Slice(static_cast<int>(begin), static_cast<int>(end));
int begin = place_idx * dims()[0] / places.size();
int end = (place_idx + 1) * dims()[0] / places.size();
LoDTensor dst;
dst.Resize(src.dims());
auto src = Slice(begin, end);
auto &dst_place = places[place_idx];
auto dst_ptr = dst.mutable_data(dst_place, src.type());
// TODO(tonyyang-svail):
// change the following to framework::Copy
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) &&
platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} else {
PADDLE_THROW("Not Implemented");
}
LoDTensor dst;
framework::Copy(src, dst_place, &dst);
lods.emplace_back(dst);
}
@ -250,28 +245,30 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
return lods;
}
// TODO(tonyyang-svail): make this function support LoD
void LoDTensor::MergeLoDTensor(
const std::vector<const LoDTensor *> &lod_tensors, platform::Place place) {
PADDLE_ENFORCE(platform::is_cpu_place(place));
const std::vector<const LoDTensor *> &lod_tensors,
platform::Place dst_place) {
PADDLE_ENFORCE(!lod_tensors.empty());
framework::DDim new_dim = lod_tensors[0]->dims();
std::type_index new_type = lod_tensors[0]->type();
auto new_layout = lod_tensors[0]->layout();
for (auto *lod : lod_tensors) {
PADDLE_ENFORCE(new_dim == lod->dims());
PADDLE_ENFORCE(new_type == lod->type());
PADDLE_ENFORCE(platform::is_cpu_place(lod->place()));
PADDLE_ENFORCE(new_layout == lod->layout());
}
new_dim[0] *= lod_tensors.size();
Resize(new_dim);
set_layout(new_layout);
auto *dst_ptr = reinterpret_cast<uint8_t *>(mutable_data(place, new_type));
mutable_data(dst_place, new_type);
int begin = 0;
for (auto *src : lod_tensors) {
auto size = src->numel() * SizeOfType(src->type());
memory::Copy(boost::get<platform::CPUPlace>(place), dst_ptr,
boost::get<platform::CPUPlace>(src->place()),
src->data<void>(), size);
dst_ptr += size;
int end = begin + src->dims()[0];
auto dst = Slice(begin, end);
framework::Copy(*src, dst_place, &dst);
begin = end;
}
}
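To make the round trip concrete: SplitLoDTensor hands each place an equal contiguous slice along dimension 0, and the rewritten MergeLoDTensor copies each part back into a slice of the merged tensor via framework::Copy. A numpy sketch of that data movement (an illustration, not the Paddle API):

import numpy as np

# A batch of 4 rows split across 2 places, as SplitLoDTensor does.
t = np.arange(8, dtype=np.float32).reshape(4, 2)
places = 2
rows = t.shape[0]
parts = [t[p * rows // places:(p + 1) * rows // places] for p in range(places)]

# MergeLoDTensor's counterpart: copy each part into a slice of the result.
merged = np.concatenate(parts, axis=0)
assert (merged == t).all()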

@ -115,5 +115,21 @@ TEST(LoD, AppendLoD) {
EXPECT_EQ(origin, expected);
}
TEST(LoD, ToAbsOffset) {
LoD relative_lod;
relative_lod.push_back(std::vector<size_t>({0, 2}));
relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod);
LoD expected;
expected.push_back(std::vector<size_t>({0, 5}));
expected.push_back(std::vector<size_t>({0, 2, 5}));
expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
EXPECT_EQ(abs_lod, expected);
}
} // namespace framework
} // namespace paddle
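The new test pins down ToAbsOffset: each upper LoD level is rewritten so its offsets index directly into the bottom level. A small Python sketch reproducing the numbers in the test (an illustration of the conversion, not Paddle's implementation):

def to_abs_offset(lod):
    # Walk from the second-lowest level upward, mapping each offset
    # through the (already absolute) level below it.
    result = [list(level) for level in lod]
    for lvl in range(len(result) - 2, -1, -1):
        result[lvl] = [result[lvl + 1][i] for i in result[lvl]]
    return result

relative = [[0, 2], [0, 1, 3], [0, 2, 4, 5]]
assert to_abs_offset(relative) == [[0, 5], [0, 2, 5], [0, 2, 4, 5]]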

@ -129,7 +129,7 @@ class OpDesc {
}
proto::OpDesc desc_;
// input arg name => output variable names
// input arg name => input variable names
VariableNameMap inputs_;
// output arg name => output variable names
VariableNameMap outputs_;

@ -69,7 +69,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
paddle::framework::OpWithoutKernelCheckerMaker);
TEST(OperatorBase, all) {
paddle::framework::InitDevices({"CPU"});
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("test_operator");
BuildVar("input", {"IN1"}, op_desc.add_inputs());
@ -195,7 +195,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
// test with single input
TEST(OpKernel, all) {
paddle::framework::InitDevices({"CPU"});
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_with_kernel");
BuildVar("x", {"IN1"}, op_desc.add_inputs());
@ -225,7 +225,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
TEST(OpKernel, multi_inputs) {
using namespace paddle::framework;
paddle::framework::InitDevices({"CPU"});
paddle::framework::InitDevices();
proto::OpDesc op_desc;
op_desc.set_type("op_multi_inputs_with_kernel");
@ -264,7 +264,7 @@ class OperatorClone : public paddle::framework::OperatorBase {
};
TEST(Operator, Clone) {
paddle::framework::InitDevices({"CPU"});
paddle::framework::InitDevices();
OperatorClone a("ABC", paddle::framework::VariableNameMap{},
paddle::framework::VariableNameMap{},
paddle::framework::AttributeMap{});

@ -31,9 +31,10 @@ namespace framework {
*
* @note Copy supports CPU <-> GPU, GPU <-> GPU.
*/
inline void Copy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
dst->Resize(src.dims());
@ -88,26 +89,25 @@ inline void Copy(const Tensor& src, const platform::Place& dst_place,
}
/**
* @brief Copy supports CPU <-> CPU
* @brief Wrapper around
* Copy(const Tensor& src, const platform::Place& dst_place,
* const platform::DeviceContext& ctx, Tensor* dst);
*
* @param[in] src The external tensor.
* @param[in] dst_place The dst place.
*
* @note Copy supports CPU <-> GPU, GPU <-> GPU.
*/
inline void Copy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
PADDLE_ENFORCE(platform::is_cpu_place(src_place) &&
platform::is_cpu_place(dst_place));
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(src.place())) {
dev_ctx = pool.Get(src.place());
} else {
dev_ctx = pool.Get(dst_place);
}
Copy(src, dst_place, *dev_ctx, dst);
}
/**

@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().tensor();
default:
PADDLE_THROW("The type of var '", this->Name(), "' is unsupported.");
PADDLE_THROW("The type of var %s is unsupported.", this->Name());
}
}

@ -169,7 +169,7 @@ void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
}
auto* place = new platform::CPUPlace();
framework::InitDevices({"CPU"});
framework::InitDevices();
framework::Executor* executor = new framework::Executor(*place);
framework::Scope* scope = new framework::Scope();

@ -114,5 +114,21 @@ void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
#endif
size_t Usage::operator()(const platform::CPUPlace& cpu) const {
return Used(cpu);
}
size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
return Used(gpu);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
size_t memory_usage(const platform::Place& p) {
return boost::apply_visitor(Usage(), p);
}
} // namespace memory
} // namespace paddle

@ -54,6 +54,13 @@ void Free(Place place, void* ptr);
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
};
size_t memory_usage(const platform::Place& p);
/**
* \brief Free memory block in one place.
*

@ -44,6 +44,9 @@ TEST(BuddyAllocator, CPUAllocation) {
EXPECT_NE(p, nullptr);
paddle::platform::Place place = cpu;
EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
paddle::memory::Free(cpu, p);
}
@ -99,6 +102,9 @@ TEST(BuddyAllocator, GPUAllocation) {
EXPECT_NE(p, nullptr);
paddle::platform::Place place = gpu;
EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
paddle::memory::Free(gpu, p);
}

@ -151,6 +151,7 @@ op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding math_function)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions

@ -39,7 +39,7 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
std::map<size_t /*offset*/, std::vector<Item>> hash;
framework::LoD new_lod;
auto *ids_data = selected_ids->mutable_data<int>(platform::CPUPlace());
auto *ids_data = selected_ids->mutable_data<int64_t>(platform::CPUPlace());
auto *scores_data =
selected_scores->mutable_data<float>(platform::CPUPlace());
@ -66,7 +66,7 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
void BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
std::vector<std::vector<Item>> *items) {
auto *pre_ids_data = pre_ids.data<int>();
auto *pre_ids_data = pre_ids.data<int64_t>();
for (size_t offset = 0; offset < items->size(); offset++) {
auto prefix_id = pre_ids_data[offset];
@ -127,7 +127,7 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
auto abs_lod = framework::ToAbsOffset(ids.lod());
PADDLE_ENFORCE_GE(source_abs_two_level_lod.size(), 2UL);
auto *ids_data = ids.data<int>();
auto *ids_data = ids.data<int64_t>();
auto *scores_data = scores.data<float>();
size_t instance_dim = 1;

@ -230,7 +230,6 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad);
namespace ops = paddle::operators;
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);

@ -12,6 +12,7 @@ if(WITH_GPU)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
@ -27,6 +28,7 @@ else()
cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context)
@ -38,3 +40,4 @@ cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)

@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/operators/math/im2col.h"
#include <gtest/gtest.h>
#include <iostream>
template <typename DeviceContext, typename Place>
void testIm2col() {
@ -102,6 +101,7 @@ void testIm2col() {
Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
}
@ -154,6 +154,9 @@ void testIm2col() {
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
delete place;
delete context;
}
TEST(math, im2col) {

@ -0,0 +1,144 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/math/sequence_padding.h"

namespace paddle {
namespace operators {
namespace math {

template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor& seq, framework::Tensor& padding,
                  bool norm_by_times) {
    auto lod = seq.lod();
    PADDLE_ENFORCE_GT(lod.size(), 0UL,
                      "The LoD of LoDTensor seq should not be null.");

    const size_t level = 0;
    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);

    auto seq_dims = seq.dims();
    PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
                      "The first dimension of LoDTensor seq should be "
                      "equal to the sum of all sequences' lengths.");

    auto padding_dims = padding.dims();
    PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
                      "The input padding should be a 3-D Tensor of shape "
                      "[max_sequence_length, num_sequences, sequence_width].");

    const size_t max_sequence_length = MaximumSequenceLength(lod, level);
    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
                      "The first dimension of Tensor padding should be the "
                      "maximum length of all sequences in LoDTensor seq.");

    const size_t num_sequences = abs_offset_lod[level].size() - 1;
    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
                      "The second dimension of Tensor padding should be the "
                      "number of sequences in LoDTensor seq.");

    const size_t sequence_width = seq.numel() / seq_dims[0];
    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                      "The third dimension of Tensor padding should be the "
                      "width of sequence in LoDTensor seq.");

    const T* seq_data = seq.data<T>();
    T* padding_data = padding.data<T>();
    for (size_t i = 0; i < max_sequence_length; ++i) {
      for (size_t j = 0; j < num_sequences; ++j) {
        size_t start_pos = abs_offset_lod[level][j];
        size_t sequence_length = abs_offset_lod[level][j + 1] - start_pos;
        if (i < sequence_length) {
          // i > 0 => sequence_length > 0
          T scale =
              norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
          for (size_t k = 0; k < sequence_width; ++k) {
            padding_data[(i * num_sequences + j) * sequence_width + k] =
                seq_data[(start_pos + i) * sequence_width + k] * scale;
          }
        } else {
          memset(padding_data + (i * num_sequences + j) * sequence_width, 0,
                 sequence_width * sizeof(T));
        }
      }
    }
  }
};

template <typename T>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  framework::LoDTensor& seq, const framework::Tensor& padding,
                  bool norm_by_times) {
    auto lod = seq.lod();
    PADDLE_ENFORCE_GT(lod.size(), 0UL,
                      "The LoD of LoDTensor seq should not be null.");

    const size_t level = 0;
    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);

    auto seq_dims = seq.dims();
    PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
                      "The first dimension of LoDTensor seq should be "
                      "equal to the sum of all sequences' lengths.");

    auto padding_dims = padding.dims();
    PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
                      "The input padding should be a 3-D Tensor of shape "
                      "[max_sequence_length, num_sequences, sequence_width].");

    const size_t max_sequence_length = MaximumSequenceLength(lod, level);
    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
                      "The first dimension of Tensor padding should be "
                      "the maximum length of all sequences in LoDTensor seq.");

    const size_t num_sequences = abs_offset_lod[level].size() - 1;
    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
                      "The second dimension of Tensor padding should be "
                      "the number of sequences in LoDTensor seq.");

    const size_t sequence_width = seq.numel() / seq_dims[0];
    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                      "The third dimension of Tensor padding should be the "
                      "width of sequence in LoDTensor seq.");

    const T* padding_data = padding.data<T>();
    T* seq_data = seq.data<T>();
    for (size_t i = 0; i < num_sequences; ++i) {
      size_t start_pos = abs_offset_lod[level][i];
      size_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
      for (size_t j = 0; j < sequence_length; ++j) {
        // sequence_width > j > 0
        T scale =
            norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
        for (size_t k = 0; k < sequence_width; ++k) {
          seq_data[(start_pos + j) * sequence_width + k] =
              padding_data[(j * num_sequences + i) * sequence_width + k] *
              scale;
        }
      }
    }
  }
};

template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
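As a concrete picture of the layout the functor enforces: two sequences of lengths 2 and 3 (absolute offsets [0, 2, 5]) are scattered into a [max_sequence_length, num_sequences, sequence_width] tensor, with the shorter sequence zero-filled. A numpy sketch of the CPU padding loop (illustration only, omitting the norm_by_times scaling):

import numpy as np

seq = np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32)
abs_offset = [0, 2, 5]          # two sequences: lengths 2 and 3
num_sequences = len(abs_offset) - 1
max_sequence_length = max(abs_offset[j + 1] - abs_offset[j]
                          for j in range(num_sequences))
sequence_width = seq.shape[1]

padding = np.zeros((max_sequence_length, num_sequences, sequence_width),
                   dtype=np.float32)
for j in range(num_sequences):
    length = abs_offset[j + 1] - abs_offset[j]
    # Time step i of sequence j lands at padding[i, j, :].
    padding[:length, j, :] = seq[abs_offset[j]:abs_offset[j] + length]

print(padding[:, 0, 0])  # [1. 2. 0.] -- zero-padded tail
print(padding[:, 1, 0])  # [3. 4. 5.]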
