Merge branch 'develop' into add_reorg_op

7 years ago · 9f65b616b2
parent 45565784bf 08d22cf7e1
commit 9f65b616b2
40 changed files with 1680 additions and 206 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -62,7 +62,6 @@ option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -175,6 +175,7 @@ paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dim
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/op_proto_maker.h"
 namespace paddle {
 namespace framework {
@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetInput("X", inputs);
  }
  op->SetOutput("Out", outputs);
  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
              static_cast<int>(OpRole::kForward));
 }
 // a->OP0->b
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/op_proto_maker.h"
 namespace paddle {
 namespace framework {
@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
    op->SetInput("X", inputs);
  }
  op->SetOutput("Out", outputs);
  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
              static_cast<int>(OpRole::kForward));
 }
 // a->OP0->b
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@ -23,8 +23,62 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
 namespace {
 // Validates the order of operator roles in `program`.
 //
 // Walks every op of every block in order and records each role value seen so
 // far in `visit`. For each op that carries a role attribute, it enforces that
 // certain other roles have (or have not) already appeared. A violated rule
 // fails through PADDLE_ENFORCE; an unrecognized role value hits LOG(FATAL).
 void CheckProgram(const ProgramDesc &program) {
  // Role values encountered so far; only key presence is ever queried.
  std::map<int, bool> visit;
 #define _INT(role) static_cast<int>(role)
  for (size_t i = 0; i < program.Size(); ++i) {
    for (OpDesc *op : program.Block(i).AllOps()) {
      // For backward compatibility: some programs have ops without the role
      // attribute, and those ops are skipped entirely.
      if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
      int role_id = boost::get<int>(
          op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
      // The current op's role is recorded before the checks below run.
      visit[role_id] = true;
      switch (role_id) {
        case _INT(OpRole::kForward):
          // Fails if a plain kBackward op has already been seen. Only the
          // exact kBackward value is looked up here, not kBackward|kLoss.
          PADDLE_ENFORCE(
              visit.find(_INT(OpRole::kBackward)) == visit.end(),
              "Cannot add forward operator before backward operator.");
          break;
        case _INT(OpRole::kBackward):
        case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
          // Fails if a plain kOptimize op has already been seen.
          PADDLE_ENFORCE(
              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
              "Cannot add backward operator before optimize operator.");
          break;
        case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
          // Fails if a kBackward|kLoss op has already been seen.
          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
                                    _INT(OpRole::kLoss)) == visit.end(),
                         "Cannot add backward|loss operator before "
                         "forward|loss operator.");
          // Fails if a plain kOptimize op has already been seen.
          // NOTE(review): the message below mentions a backward operator, but
          // this branch handles forward|loss ops -- confirm the wording.
          PADDLE_ENFORCE(
              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
              "Cannot add backward operator before optimize operator.");
          break;
        case _INT(OpRole::kOptimize):
        case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
          // Requires that a plain kBackward op has already been seen.
          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
                         "Optimize operators must follow backward operator.");
          break;
        case _INT(OpRole::kLRSched):
        case _INT(OpRole::kDist):
        case _INT(OpRole::kRPC):
        case _INT(OpRole::kNotSpecified):
          // These roles carry no ordering constraint.
          break;
        default:
          // Any other role value (including unlisted combinations) is fatal.
          LOG(FATAL) << "Unknown operator role. Don't add new role because "
                        "you don't know what you are doing.";
      }
    }
  }
 #undef _INT
 }
 }  // namespace
 Graph::Graph(const ProgramDesc &program) : program_(program) {
  CheckProgram(program_);
  // Make the nodes id start from 0.
  Node::ResetId();
  auto var_nodes = InitFromProgram(program_);
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() {
  return result;
 }
 // Orders (pattern node, graph node) pairs by the pattern-node pointer first
 // and by the graph-node pointer when the pattern-node pointers are equal.
 bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
                   const std::pair<PDNode *, Node *> &b) {
  // The second member only breaks ties between identical first members.
  const bool same_pattern_node = (a.first == b.first);
  return same_pattern_node ? (a.second < b.second) : (a.first < b.first);
 }
 // TODO(Superjomn) enhance the function as it marks unique patterns as duplicates
 // see https://github.com/PaddlePaddle/Paddle/issues/13550
 void GraphPatternDetector::UniquePatterns(
@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns(
  std::vector<GraphPatternDetector::subgraph_t> result;
  std::unordered_set<size_t> set;
  std::hash<std::string> hasher;
  for (auto &g : *subgraphs) {
-    size_t key = 0;
+    // Sort the items in the sub-graph, and transform to a string key.
-    for (auto &item : g) {
+    std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
-      key ^= std::hash<void *>{}(item.first);
+    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
-      key ^= std::hash<void *>{}(item.second);
+    std::stringstream ss;
-    }
+    for (auto &item : sorted_keys) {
      ss << item.first << ":" << item.second;
    }
    auto key = hasher(ss.str());
    if (!set.count(key)) {
      result.emplace_back(g);
      set.insert(key);
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
    PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
    for (size_t j = 0; j < lod.size(); ++j) {
      auto &sub_lod = new_lod[j];
-      auto &offset = sub_lod.back();
+      size_t offset = sub_lod.back();
      for (size_t k = 1; k < lod[j].size(); ++k) {
        sub_lod.push_back(lod[j][k] + offset);
      }
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() {
  }
 }
-static bool VarIsTensor(const Variable* var) {
+static bool VarIsTensor(const Variable& var) {
-  return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
+  return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
 }
-const Tensor* GetTensorFromVar(Variable* var) {
+const Tensor* GetTensorFromVar(const Variable& var) {
-  if (var->IsType<LoDTensor>()) {
+  if (var.IsType<LoDTensor>()) {
-    return var->GetMutable<LoDTensor>();
+    return static_cast<const Tensor*>(&(var.Get<LoDTensor>()));
-  } else if (var->IsType<SelectedRows>()) {
+  } else if (var.IsType<SelectedRows>()) {
-    return var->GetMutable<SelectedRows>()->mutable_value();
+    return &(var.Get<SelectedRows>().value());
  } else {
    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 var->Type().name());
+                 var.Type().name());
  }
 }
@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  auto* var = InputVar(name);
-  return var == nullptr ? nullptr
+  return var == nullptr ? nullptr : GetTensorFromVar(*var);
                        : GetTensorFromVar(const_cast<Variable*>(var));
 }
 template <>
@ -428,7 +427,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
  std::transform(names.begin(), names.end(), std::back_inserter(res),
                 [&](const std::string& sub_name) {
                   auto var = scope_.FindVar(sub_name);
-                   return var == nullptr ? nullptr : GetTensorFromVar(var);
+                   return var == nullptr ? nullptr : GetTensorFromVar(*var);
                 });
  return res;
 }
@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack(
  for (auto& var_name : inplace_vars) {
    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
    auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
-    auto* transformed_tensor =
+    auto* var = transfer_scope.FindVar(var_name);
-        GetTensorFromVar(transfer_scope.FindVar(var_name));
+    PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
                   var_name);
    auto* transformed_tensor = GetTensorFromVar(*var);
    original_tensor->ShareDataWith(*transformed_tensor);
  }
 }
@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData(
    for (auto& var_name : var_name_item.second) {
      auto* var = scope.FindVar(var_name);
      // Only tensors can be transferred to another device.
-      if (var == nullptr || !VarIsTensor(var)) {
+      if (var == nullptr || !VarIsTensor(*var)) {
        continue;
      }
-      auto* tensor_in = GetTensorFromVar(var);
+      auto* tensor_in = GetTensorFromVar(*var);
      if (!tensor_in->IsInitialized()) {
        continue;
      }
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@ -63,7 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
-const Tensor* GetTensorFromVar(Variable* var);
+const Tensor* GetTensorFromVar(const Variable& var);
 class OperatorBase;
 class ExecutionContext;
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@ -75,6 +75,19 @@ TEST(Tensor, MutableData) {
                                        platform::CPUPlace());
    EXPECT_EQ(p1, p2);
  }
  // Not sure if it's desired, but currently, Tensor type can be changed.
  {
    framework::Tensor src_tensor;
    int8_t* p1 = src_tensor.mutable_data<int8_t>(framework::make_ddim({1}),
                                                 platform::CPUPlace());
    EXPECT_NE(p1, nullptr);
    *p1 = 1;
    uint8_t* p2 = src_tensor.mutable_data<uint8_t>(framework::make_ddim({1}),
                                                   platform::CPUPlace());
    EXPECT_NE(p2, nullptr);
    EXPECT_EQ(static_cast<int>(p2[0]), 1);
  }
 #ifdef PADDLE_WITH_CUDA
  {
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  } else if (platform::is_cuda_pinned_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
    auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
                 nullptr);
  }
 #endif
 }
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -1,3 +1,6 @@
 if(WITH_TESTING)
  include(test.cmake) # some generic cmake functions for inference
 endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
 add_subdirectory(analysis)
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@ -26,16 +26,11 @@ function (inference_analysis_test TARGET)
     set(oneValueArgs "")
     set(multiValueArgs SRCS ARGS EXTRA_DEPS)
     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-        set(mem_opt "")
+     inference_base_test(${TARGET}
-        if(WITH_GPU)
+             SRCS ${analysis_test_SRCS}
            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
        endif()
        cc_test(${TARGET}
                SRCS "${analysis_test_SRCS}"
             DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
+             ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS})
-        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
+  endif()
    endif(WITH_TESTING)
 endfunction(inference_analysis_test)
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type,
  op->SetType(type);
  op->SetInput("Xs", inputs);
  op->SetOutput("Xs", outputs);
  op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
              static_cast<int>(framework::OpRole::kForward));
 }
 TEST(DataFlowGraph, Build_IR_Graph) {
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@ -17,39 +17,12 @@ if(APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
-
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
 set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}
        )
 if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
 endif()
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
        set(oneValueArgs SRC)
        set(multiValueArgs ARGS)
        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 	if (WITH_GPU)
 		cc_test(${TARGET_NAME}
 			SRCS ${inference_test_SRC}
 			DEPS "${inference_deps}"
 			ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15)
        else()
 		cc_test(${TARGET_NAME}
 			SRCS ${inference_test_SRC}
 			DEPS "${inference_deps}"
 			ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
 	endif()
        if(inference_test_ARGS)
            set_tests_properties(${TARGET_NAME}
                    PROPERTIES DEPENDS "${inference_test_ARGS}")
        endif()
    endif(WITH_TESTING)
 endfunction(inference_api_test)
 cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
@ -59,8 +32,11 @@ cc_test(test_paddle_inference_api
        SRCS api_tester.cc
        DEPS paddle_inference_api)
-inference_api_test(test_api_impl SRC api_impl_tester.cc
+if(WITH_TESTING)
-                    ARGS test_word2vec test_image_classification)
+  inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
                      ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
  set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
 endif()
 cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
        ARGS --dirname=${PYTHON_TESTS_DIR}/book)
@ -68,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
        SRCS api_tensorrt_subgraph_engine.cc
        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
-
+  if(WITH_TESTING)
-inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
+    inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps}
                      ARGS --dirname=${WORD2VEC_MODEL_DIR})
  endif()
 endif()
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@ -22,12 +22,14 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 #ifdef __clang__
-#define ACC_DIFF 4e-2
+#define ACC_DIFF 4e-3
 #else
-#define ACC_DIFF 1e-2
+#define ACC_DIFF 1e-3
 #endif
-DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_string(word2vec_dirname, "",
              "Directory of the word2vec inference model.");
 DEFINE_string(book_dirname, "", "Directory of the book inference model.");
 namespace paddle {
@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
 NativeConfig GetConfig() {
  NativeConfig config;
-  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
+  config.model_dir = FLAGS_word2vec_dirname;
  LOG(INFO) << "dirname  " << config.model_dir;
  config.fraction_of_gpu_memory = 0.15;
 #ifdef PADDLE_WITH_CUDA
@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) {
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  config.model_dir =
-      FLAGS_dirname + "/image_classification_resnet.inference.model";
+      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
  const bool is_combined = false;
  std::vector<std::vector<int64_t>> feed_target_shapes =
@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) {
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  config.model_dir =
-      FLAGS_dirname + "/image_classification_resnet.inference.model";
+      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
  std::vector<framework::LoDTensor> jobs(num_jobs);
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
  //# 1. Create PaddlePredictor with a config.
  NativeConfig config0;
-  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.model_dir = FLAGS_dirname;
  config0.use_gpu = true;
  config0.fraction_of_gpu_memory = 0.3;
  config0.device = 0;
  MixedRTConfig config1;
-  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.model_dir = FLAGS_dirname;
  config1.use_gpu = true;
  config1.fraction_of_gpu_memory = 0.3;
  config1.device = 0;
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do
    -DWITH_GPU=$TEST_GPU_CPU \
    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
  make -j
-  word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
+  word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
  if [ -d $word2vec_model ]; then
    for use_gpu in $use_gpu_list; do
      ./simple_on_word2vec \
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@ -70,12 +70,8 @@ void Main(bool use_gpu) {
    // The outputs' buffers are in CPU memory.
    for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
         i++) {
-      // Here will result random fail, for that the model is trained by CI, the
+      CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
-      // train phase is not stable, so the result will be random.
+                 0.001);
      // TODO(Superjomn) will restore after the model is upload.
      // CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
      // result[i],
      // 0.001);
    }
  }
 }
--- a/paddle/fluid/inference/test.cmake
+++ b/paddle/fluid/inference/test.cmake
@ -0,0 +1,31 @@
 set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
 set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
    "A path setting inference demo download directories.")
 # Downloads ${url}/${filename} into ${install_dir} at configure time.
 #   install_dir - directory to create (mkdir -p) and download into
 #   url         - base URL of the file
 #   filename    - name of the file, appended to the URL
 # The commands run through execute_process and their results are not checked,
 # so a failed mkdir or wget does not stop the configure step.
 function (inference_download install_dir url filename)
    message(STATUS "Download inference test stuff from ${url}/${filename}")
    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
    # -q keeps wget quiet; the file lands in ${install_dir}.
    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
    message(STATUS "finish downloading ${filename}")
 endfunction()
 # Downloads ${url}/${filename} into ${install_dir} via inference_download and
 # then unpacks it in place with `tar xzf` (so the file is expected to be a
 # gzip-compressed tarball). The tar result is not checked.
 function (inference_download_and_uncompress install_dir url filename)
    inference_download(${install_dir} ${url} ${filename})
    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
 endfunction()
 set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
 if (NOT EXISTS ${WORD2VEC_INSTALL_DIR})
    inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
 endif()
 set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
 # Declares a cc_test named ${TARGET} for inference tests.
 #   SRCS - source files passed to cc_test
 #   DEPS - dependencies passed to cc_test
 #   ARGS - command-line arguments passed to the test
 # When WITH_GPU is on, --fraction_of_gpu_memory_to_use=0.5 is placed before
 # the caller-supplied ARGS.
 function (inference_base_test TARGET)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS ARGS DEPS)
   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   # NOTE(review): mem_opt is only set when WITH_GPU is on and is never reset
   # here, so a value defined in the calling scope would be picked up -- confirm.
   if(WITH_GPU)
       set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
   endif()
   cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS})
 endfunction()
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@ -1,18 +1,4 @@
 set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
 set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
    "A path setting inference demo download directories.")
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
 function (inference_download install_dir url filename)
    message(STATUS "Download inference test stuff from ${url}/${filename}")
    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
    message(STATUS "finish downloading ${filename}")
 endfunction()
 function (inference_download_and_uncompress install_dir url filename)
    inference_download(${install_dir} ${url} ${filename})
    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
 endfunction()
 function(download_model_and_data install_dir model_name data_name)
    if (NOT EXISTS ${install_dir})
--- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
@ -0,0 +1,112 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 using ScopedSpatialTransformerDescriptor =
    platform::ScopedSpatialTransformerDescriptor;
 // Forward kernel of the affine_grid op, implemented with cuDNN.
 //
 // Reads input "Theta" and writes output "Output" by calling
 // cudnnSpatialTfGridGeneratorForward. The target size comes from the
 // "output_shape" attribute when it is non-empty, otherwise from the
 // "OutputShape" input tensor.
 template <typename T>
 class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace.");
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
    auto* theta = ctx.Input<Tensor>("Theta");
    auto* output = ctx.Output<Tensor>("Output");
    const T* theta_data = theta->data<T>();
    // The leading dimension of Theta is used as the batch size.
    int n = theta->dims()[0];
    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
    // Host-side buffer holding the four size values handed to the descriptor.
    Tensor h_sizes;
    int* h_size_data;
    if (size_attr.size() == 0) {
      // No attribute given: take the sizes from the OutputShape input, copied
      // to CPU memory.
      // NOTE(review): h_size_data is read right after TensorCopy -- confirm
      // that the copy has completed by then.
      auto* output_shape = ctx.Input<Tensor>("OutputShape");
      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
      h_size_data = h_sizes.data<int>();
    } else {
      // Attribute given: entry 0 is taken from Theta's batch size rather than
      // from the attribute; entries 1..3 are copied from the attribute.
      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
      h_size_data[0] = n;
      h_size_data[1] = size_attr[1];
      h_size_data[2] = size_attr[2];
      h_size_data[3] = size_attr[3];
    }
    // Output is allocated as {n, sizes[2], sizes[3], 2} on the kernel's place.
    T* output_data = output->mutable_data<T>(
        {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace());
    ScopedSpatialTransformerDescriptor st_desc;
    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
        st_desc.descriptor<T>(4, h_size_data);
    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
        handle, cudnn_st_desc, theta_data, output_data));
  }
 };
 // Backward kernel of the affine_grid op, implemented with cuDNN.
 //
 // Reads the gradient of "Output" and writes the gradient of "Theta" by
 // calling cudnnSpatialTfGridGeneratorBackward. The target size is resolved
 // the same way as in the forward kernel: from the "output_shape" attribute
 // when non-empty, otherwise from the "OutputShape" input tensor.
 template <typename T>
 class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace.");
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
    // The leading dimension of the output gradient is used as the batch size.
    int n = output_grad->dims()[0];
    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
    // Host-side buffer holding the four size values handed to the descriptor.
    Tensor h_sizes;
    int* h_size_data;
    if (size_attr.size() == 0) {
      // No attribute given: take the sizes from the OutputShape input, copied
      // to CPU memory.
      // NOTE(review): h_size_data is read right after TensorCopy -- confirm
      // that the copy has completed by then.
      auto* output_shape = ctx.Input<Tensor>("OutputShape");
      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
      h_size_data = h_sizes.data<int>();
    } else {
      // Attribute given: entry 0 is taken from the gradient's batch size;
      // entries 1..3 are copied from the attribute.
      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
      h_size_data[0] = n;
      h_size_data[1] = size_attr[1];
      h_size_data[2] = size_attr[2];
      h_size_data[3] = size_attr[3];
    }
    ScopedSpatialTransformerDescriptor st_desc;
    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
        st_desc.descriptor<T>(4, h_size_data);
    const T* output_grad_data = output_grad->data<T>();
    // Theta's gradient keeps its existing dims; only the storage is allocated.
    T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());
    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
        handle, cudnn_st_desc, output_grad_data, theta_grad_data));
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace plat = paddle::platform;
 // Register the cuDNN kernels of affine_grid and affine_grid_grad for
 // CUDAPlace, each instantiated for float and double.
 REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNAffineGridOpKernel<float>,
                   paddle::operators::CUDNNAffineGridOpKernel<double>);
 REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNAffineGridGradOpKernel<float>,
                   paddle::operators::CUDNNAffineGridGradOpKernel<double>);
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
@ -0,0 +1,233 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/affine_grid_op.h"
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 // CPU specialization of Linspace: returns a 1-D CPU tensor holding `count`
 // evenly spaced values that start at `start` and, for count > 1, finish at
 // `end`. For count == 1 the single value is `start`. `ctx` is not used by
 // this specialization.
 template <typename T>
 struct Linspace<paddle::platform::CPUDeviceContext, T> {
  framework::Tensor operator()(T start, T end, int count,
                               const framework::ExecutionContext& ctx) {
    Tensor numbers;
    T* number_data = numbers.mutable_data<T>({count}, platform::CPUPlace());
    // A single point has no spacing. Dividing by (count - 1) == 0 would give
    // an infinite step and turn the generated value into NaN (0 * inf), so a
    // zero step is used in that case.
    const T slice = count > 1 ? (end - start) / static_cast<T>(count - 1)
                              : static_cast<T>(0);
    for (int i = 0; i < count; ++i) {
      number_data[i] = start + static_cast<T>(i) * slice;
    }
    return numbers;
  }
 };
 // Operator definition of affine_grid: validates the inputs and attributes,
 // infers the output dims and picks the kernel type.
 class AffineGridOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that Theta is a 3-D tensor whose last two dims are 2 and 3, and
  // that the target shape is supplied either through attr(output_shape) (4
  // entries) or through the 1-D Input(OutputShape). Output dims are set to
  // [N, -1, -1, 2] with N taken from Theta.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Theta"),
                   "Input(Theta) of AffineGridOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Output"),
                   "Output(Output) of AffineGridOp should not be null.");

    auto theta_shape = ctx->GetInputDim("Theta");
    PADDLE_ENFORCE(theta_shape.size() == 3,
                   "AffineGrid's Input(Theta) should be 3-D tensor.");

    const std::vector<int> shape_attr =
        ctx->Attrs().Get<std::vector<int>>("output_shape");
    if (!shape_attr.empty()) {
      PADDLE_ENFORCE(shape_attr.size() == 4,
                     "The size of attr(output_shape) should be 4.");
    } else {
      // Without the attribute the shape has to arrive as a tensor input.
      PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
                     "Input(OutputShape) of AffineGridOp should not be null if "
                     "attr(output_shape) is not configured.");
      auto shape_input_dims = ctx->GetInputDim("OutputShape");
      PADDLE_ENFORCE(shape_input_dims.size() == 1,
                     "AffineGrid's Input(OutputShape) should be 1-D tensor.");
    }

    PADDLE_ENFORCE(theta_shape[1] == 2, "Input(theta) dims[1] should be 2.");
    PADDLE_ENFORCE(theta_shape[2] == 3, "Input(theta) dims[2] should be 3.");

    // N * H * W * 2, with H and W left as -1 here.
    ctx->SetOutputDim("Output",
                      framework::make_ddim({theta_shape[0], -1, -1, 2}));
    ctx->ShareLoD("Theta", "Output");
  }

  protected:
  // Kernel type follows Theta's data type and the current place; the cuDNN
  // library is selected when built with CUDA and CanCUDNNBeUsed(ctx) holds,
  // otherwise the plain library is used.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto kernel_lib = framework::LibraryType::kPlain;
 #ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      kernel_lib = framework::LibraryType::kCUDNN;
    }
 #endif
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Theta")->type()),
        ctx.GetPlace(), framework::DataLayout::kAnyLayout, kernel_lib);
  }
 };
 // Declares the inputs, outputs, attributes and user-facing documentation of
 // the affine_grid operator.
 class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
  void Make() override {
    AddInput(
        "Theta",
        "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
        "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
        "y_1).");
    // Dispensable: the target shape may be given by attr(output_shape)
    // instead.
    AddInput("OutputShape",
             "(Tensor) The shape of target image with format [N, C, H, W].")
        .AsDispensable();
    AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
    // The documented default has to agree with SetDefault(true).
    AddAttr<bool>(
        "use_cudnn",
        "(bool, default true) Only used in cudnn kernel, need install cudnn")
        .SetDefault(true);
    // An empty vector (the default) means the attribute is not configured.
    AddAttr<std::vector<int>>(
        "output_shape",
        "The target output image shape with format [N, C, H, W].")
        .SetDefault(std::vector<int>());
    AddComment(R"DOC(
    It generates a grid of (x,y) coordinates using the parameters of the
    affine transformation that correspond to a set of points where the input
    feature map should be sampled to produce the transformed output feature map.
    Given:
        Theta = [[[x_11, x_12, x_13]
                  [x_14, x_15, x_16]]
                 [[x_21, x_22, x_23]
                  [x_24, x_25, x_26]]]
        OutputShape = [2, 3, 5, 5]
    Step 1:
        Generate relative coordinates according to OutputShape.
        The values of relative coordinates are in the interval between -1 and 1.
        The shape of the relative coordinates is [2, H, W] as below:
        C = [[[-1.  -1.  -1.  -1.  -1. ]
              [-0.5 -0.5 -0.5 -0.5 -0.5]
              [ 0.   0.   0.   0.   0. ]
              [ 0.5  0.5  0.5  0.5  0.5]
              [ 1.   1.   1.   1.   1. ]] 
             [[-1.  -0.5  0.   0.5  1. ]
              [-1.  -0.5  0.   0.5  1. ]
              [-1.  -0.5  0.   0.5  1. ]
              [-1.  -0.5  0.   0.5  1. ]
              [-1.  -0.5  0.   0.5  1. ]]]
        C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
    Step2:
        Transpose and reshape C to shape [H * W, 2] and append ones to last dimension. Then we get:
        C_ = [[-1.  -1.   1. ]
              [-0.5 -1.   1. ]
              [ 0.  -1.   1. ]
              [ 0.5 -1.   1. ]
              [ 1.  -1.   1. ]
              [-1.  -0.5  1. ]
              [-0.5 -0.5  1. ]
              [ 0.  -0.5  1. ]
              [ 0.5 -0.5  1. ]
              [ 1.  -0.5  1. ]
              [-1.   0.   1. ]
              [-0.5  0.   1. ]
              [ 0.   0.   1. ]
              [ 0.5  0.   1. ]
              [ 1.   0.   1. ]
              [-1.   0.5  1. ]
              [-0.5  0.5  1. ]
              [ 0.   0.5  1. ]
              [ 0.5  0.5  1. ]
              [ 1.   0.5  1. ]
              [-1.   1.   1. ]
              [-0.5  1.   1. ]
              [ 0.   1.   1. ]
              [ 0.5  1.   1. ]
              [ 1.   1.   1. ]]
    Step3:
        Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
    )DOC");
  }
 };
 // Operator definition of affine_grid_grad: infers the dims of Theta's
 // gradient and picks the kernel type.
 class AffineGridOpGrad : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // When the gradient of Theta is requested as an output, it gets the same
  // dims as Input(Theta).
  void InferShape(framework::InferShapeContext* ctx) const override {
    auto theta_shape = ctx->GetInputDim("Theta");
    const auto theta_grad_name = framework::GradVarName("Theta");
    if (!ctx->HasOutput(theta_grad_name)) {
      return;
    }
    ctx->SetOutputDim(theta_grad_name, theta_shape);
  }

  protected:
  // Kernel type follows Theta's data type and the current place; the cuDNN
  // library is selected when built with CUDA and CanCUDNNBeUsed(ctx) holds.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto kernel_lib = framework::LibraryType::kPlain;
 #ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      kernel_lib = framework::LibraryType::kCUDNN;
    }
 #endif
    auto theta_dtype =
        framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
    return framework::OpKernelType(theta_dtype, ctx.GetPlace(),
                                   framework::DataLayout::kAnyLayout,
                                   kernel_lib);
  }
 };
 // Builds the description of the affine_grid_grad op: it forwards Theta,
 // OutputShape and the forward op's attributes, takes the gradient of Output
 // as input and emits the gradient of Theta as output.
 class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
  public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

  protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    // Hold the descriptor in a unique_ptr from the start so it is released
    // if any of the setters below throws.
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("affine_grid_grad");
    op->SetInput("Theta", Input("Theta"));
    op->SetInput("OutputShape", Input("OutputShape"));
    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
    op->SetAttrMap(Attrs());
    op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
    return op;
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 // Forward op: operator class, proto maker and gradient-description maker.
 REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
                  ops::AffineGridGradMaker);
 // Gradient op: operator class only.
 REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
 // CPU kernels of the forward op for float and double.
 REGISTER_OP_CPU_KERNEL(
    affine_grid,
    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
 // CPU kernels of the gradient op for float and double.
 REGISTER_OP_CPU_KERNEL(
    affine_grid_grad,
    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/affine_grid_op.h
+++ b/paddle/fluid/operators/affine_grid_op.h
@ -0,0 +1,190 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 // Shorthand for framework::EigenTensor with row-major layout and
 // Eigen::DenseIndex as the defaults.
 template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 // Fixed-rank index tuples; the kernels in this header pass them to Eigen's
 // reshape / broadcast / shuffle.
 using Array1 = Eigen::DSizes<int64_t, 1>;
 using Array2 = Eigen::DSizes<int64_t, 2>;
 using Array3 = Eigen::DSizes<int64_t, 3>;
 using Array4 = Eigen::DSizes<int64_t, 4>;
 /**
  * Functor that returns a tensor with `count` evenly spaced numbers over the
  * interval given by `start` and `end`.
  *
  * Only the call operator is declared here; the definition is expected to
  * come from a specialization for the concrete DeviceContext.
  *
  * @param start  first value of the interval
  * @param end    last value of the interval
  * @param count  number of values to generate
  * @param ctx    execution context of the calling kernel
  * @return       tensor holding the generated values
  */
 template <typename DeviceContext, typename T>
 struct Linspace {
  framework::Tensor operator()(T start, T end, int count,
                               const framework::ExecutionContext& ctx);
 };
 // Forward kernel of affine_grid. For every batch item i it computes
 //   Output[i] = G * Theta[i]^T
 // where G is an [h * w, 3] matrix whose rows are (x, y, 1); x runs over the
 // width axis and y over the height axis, both generated by
 // linspace(-1, 1, ...).
 template <typename DeviceContext, typename T>
 class AffineGridOpKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    auto* theta = ctx.Input<Tensor>("Theta");
    int n = theta->dims()[0];
    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
    int h = 0;
    int w = 0;
    // Target height/width come from Input(OutputShape) when the attribute is
    // empty, otherwise from attr(output_shape); entries 2 and 3 are H and W.
    if (size_attr.size() == 0) {
      auto* output_shape = ctx.Input<Tensor>("OutputShape");
      Tensor h_sizes;
      // Copy the shape tensor to CPU memory so it can be read on the host.
      // NOTE(review): the data is read as int — presumably OutputShape is an
      // int32 tensor; confirm against the callers.
      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
      const int* h_size_data = h_sizes.data<int>();
      h = h_size_data[2];
      w = h_size_data[3];
    } else {
      h = size_attr[2];
      w = size_attr[3];
    }
    auto* output = ctx.Output<Tensor>("Output");
    output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
    // Zero-fill the freshly allocated output.
    math::SetConstant<DeviceContext, T>()(
        ctx.template device_context<DeviceContext>(), output,
        static_cast<T>(0));
    Linspace<DeviceContext, T> linspace;
    // 1-D vector of h coordinates along the height axis, from -1 to 1. It is
    // expanded to [height, width, 1] inside the grid expression below.
    auto h_idx = linspace((T)-1, (T)1, h, ctx);
    auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
    // 1-D vector of w coordinates along the width axis, from -1 to 1. It is
    // expanded to [height, width, 1] inside the grid expression below.
    auto w_idx = linspace((T)-1, (T)1, w, ctx);
    auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
    // Get constant ones tensor with shape [height, width, 1]
    Tensor ones;
    ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
    auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
    // Get grid tensor with shape [n, h, w, 3] by concatenating w_idx, h_idx and
    // ones along the last axis (in that order):
    //   - w_idx: [w] -> [1, w] -> repeated h times -> [h, w, 1]
    //   - h_idx: [h] -> [1, h] -> repeated w times -> transposed -> [h, w, 1]
    // The [h, w, 3] result is then repeated n times along a new leading axis.
    Tensor grid;
    grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
    auto grid_t = EigenTensor<T, 4>::From(grid);
    grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
                               .broadcast(Array2(h, 1))
                               .reshape(Array3(h, w, 1))
                               .concatenate(h_idx_t.reshape(Array2(1, h))
                                                .broadcast(Array2(w, 1))
                                                .shuffle(Array2(1, 0))
                                                .reshape(Array3(h, w, 1)),
                                            2)
                               .eval()
                               .concatenate(ones_t, 2)
                               .reshape(Array4(1, h, w, 3))
                               .broadcast(Array4(n, 1, 1, 1));
    // output = grid * theta.T
    // TODO(wanghaoshuang): Refine batched matrix multiply
    auto blas = math::GetBlas<DeviceContext, T>(ctx);
    for (int i = 0; i < n; ++i) {
      // Per-item views: grid as [h * w, 3], theta as [2, 3], out as [h * w, 2].
      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
      Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
      Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2});
      // The `true` flag transposes sliced_theta; T(1) / T(0) are presumably
      // the alpha / beta scaling factors of the product — confirm in blas.h.
      blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
                  T(0));
    }
  }
 };
 // Backward kernel of affine_grid. It rebuilds the same [n, h, w, 3] grid of
 // (x, y, 1) rows as the forward kernel and, for every batch item i, computes
 //   Theta@GRAD[i] = Output@GRAD[i]^T * G
 // with Output@GRAD[i] viewed as [h * w, 2] and G as [h * w, 3].
 template <typename DeviceContext, typename T>
 class AffineGridGradOpKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
    int n = output_grad->dims()[0];
    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
    int h = 0;
    int w = 0;
    // Target height/width come from Input(OutputShape) when the attribute is
    // empty, otherwise from attr(output_shape); entries 2 and 3 are H and W.
    if (size_attr.size() == 0) {
      auto* output_shape = ctx.Input<Tensor>("OutputShape");
      Tensor h_sizes;
      // Copy the shape tensor to CPU memory so it can be read on the host.
      // NOTE(review): the data is read as int — presumably OutputShape is an
      // int32 tensor; confirm against the callers.
      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
      const int* h_size_data = h_sizes.data<int>();
      h = h_size_data[2];
      w = h_size_data[3];
    } else {
      h = size_attr[2];
      w = size_attr[3];
    }
    theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
    // Zero-fill the freshly allocated gradient.
    math::SetConstant<DeviceContext, T>()(
        ctx.template device_context<DeviceContext>(), theta_grad,
        static_cast<T>(0));
    Linspace<DeviceContext, T> linspace;
    // 1-D vector of h coordinates along the height axis, from -1 to 1. It is
    // expanded to [height, width, 1] inside the grid expression below.
    auto h_idx = linspace((T)-1, (T)1, h, ctx);
    auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
    // 1-D vector of w coordinates along the width axis, from -1 to 1. It is
    // expanded to [height, width, 1] inside the grid expression below.
    auto w_idx = linspace((T)-1, (T)1, w, ctx);
    auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
    // Get constant ones tensor with shape [height, width, 1]
    Tensor ones;
    ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
    auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
    // Get grid tensor with shape [n, h, w, 3] by concatenating w_idx, h_idx and
    // ones along the last axis (in that order):
    //   - w_idx: [w] -> [1, w] -> repeated h times -> [h, w, 1]
    //   - h_idx: [h] -> [1, h] -> repeated w times -> transposed -> [h, w, 1]
    // The [h, w, 3] result is then repeated n times along a new leading axis.
    Tensor grid;
    grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
    auto grid_t = EigenTensor<T, 4>::From(grid);
    grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
                               .broadcast(Array2(h, 1))
                               .reshape(Array3(h, w, 1))
                               .concatenate(h_idx_t.reshape(Array2(1, h))
                                                .broadcast(Array2(w, 1))
                                                .shuffle(Array2(1, 0))
                                                .reshape(Array3(h, w, 1)),
                                            2)
                               .eval()
                               .concatenate(ones_t, 2)
                               .reshape(Array4(1, h, w, 3))
                               .broadcast(Array4(n, 1, 1, 1));
    // theta_grad = output_grad.T * grid
    // TODO(wanghaoshuang): Refine batched matrix multiply
    auto blas = math::GetBlas<DeviceContext, T>(ctx);
    for (int i = 0; i < n; ++i) {
      // Per-item views: grid as [h * w, 3], out_grad as [h * w, 2] and
      // theta_grad as [2, 3].
      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
      Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2});
      Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
      // The `true` flag transposes sliced_out_grad; T(1) / T(0) are presumably
      // the alpha / beta scaling factors of the product — confirm in blas.h.
      blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
                  &sliced_theta_grad, T(0));
    }
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase {
  }
 };
 // Shape-inference functor for the delete_var operator. The call operator has
 // an empty body, so no output shapes are set.
 class DeleteVarOpShapeInference : public framework::InferShapeBase {
  public:
  void operator()(framework::InferShapeContext *ctx) const override {}
 };
 class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
@ -48,4 +53,5 @@ It should not be configured by users directly.
 REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::DeleteVarOpInfoMaker);
+                  paddle::operators::DeleteVarOpInfoMaker,
                  paddle::operators::DeleteVarOpShapeInference);
--- a/Show More
+++ b/Show More