From 0ab0d2d04b8876f38a395adb2d9061dffb77733c Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 11 Sep 2018 15:39:54 +0800
Subject: [PATCH 1/8] add versioning doc

---
 doc/fluid/dev/versioning_en.md | 66 ++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 doc/fluid/dev/versioning_en.md

diff --git a/doc/fluid/dev/versioning_en.md b/doc/fluid/dev/versioning_en.md
new file mode 100644
index 0000000000..f3187df265
--- /dev/null
+++ b/doc/fluid/dev/versioning_en.md
@@ -0,0 +1,66 @@
+# Versioning (Work In Progress)
+
+
+PaddlePaddle framework follows Semantic Versioning 2.0 (semver).
+Each release has version of the following format: MAJOR.MINOR.PATCH
+(e.g. 1.2.0). Some key points:
+
+
+ * A major version number change can introduce backward-incompatible changes. Code that works in the old version does not necessarily work in the new version. In addition, data, such as program models and checkpointed parameters, generated by the previous major version might not work in the new version. We will attempt to build tools to help with the migration between releases.
+
+ * Minor version number change always maintain backward compatibility. It normally contains compatible improvements and bug fixes.
+
+ * Patch number change is for bug fixes.
+g
+ * Violation of the policy are considered as bugs and should be fixed.
+
+### What is Covered
+
+* All public, documented Python APIs, excluding those that live in the contrib namespace.
+
+### What is Not Covered
+
+* If an API’s implementation has bugs, we reserve the right to fix the bugs and change the behavior.
+
+* The Python APIs in contrib namespace.
+
+* The Python functions and classes whose names start with ‘_’.
+
+* The offline tools.
+
+* The data generated by the framework, such as serialized Program model files and checkpointed variables, is subject to a different versioning scheme, described below.
+
+* C++ Inference APIs. (To be covered)
+
+
+## Data
+
+
+Data refers to the artifacts generated by the framework. Here, we specifically mean the model Program file and the checkpointed variables.
+
+
+
+* Backward Compatibility: A user sometimes generates Data with PaddlePaddle version 1.1 and expects it to be consumed by PaddlePaddle version 1.2.
+  This can happen when a new online system wants to serve an old model that was trained previously.
+
+
+
+* Forward Compatibility: A user sometimes generates Data with PaddlePaddle version 1.2 and expects it to be consumed by PaddlePaddle version 1.1.
+  This can happen when a new, successful research model needs to be served by an old online system that is not frequently upgraded.
+
+
+
+### Versioning
+
+Data version: Data is assigned an integer version number. The version is increased when an incompatible change is introduced.
+
+The PaddlePaddle framework supports an interval of Data versions. PaddlePaddle releases within the same major version (semver) cannot drop support for a lower Data version. Hence, a minor version change cannot drop support for any Data version.
+
+
+For example, suppose PaddlePaddle version 1.1 supports Program versions 3 to 5. Later, the Program version is increased from 5 to 6 due to the addition of an attribute. As a result, PaddlePaddle version 1.1 won’t be able to consume version 6 Programs. PaddlePaddle 1.2 should support Program versions 3 to 6. PaddlePaddle cannot drop support for Program version 3 until PaddlePaddle version 2.0.
+
+
+
+### Known Issues
+
+Currently, forward compatibility for new Data versions is best-effort.

From 46808d9c1eec22c4deac27e16e9981984a9aca76 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 11 Sep 2018 15:40:28 +0800
Subject: [PATCH 2/8] clean

---
 doc/fluid/dev/versioning_en.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/fluid/dev/versioning_en.md b/doc/fluid/dev/versioning_en.md
index f3187df265..f15fd029dc 100644
--- a/doc/fluid/dev/versioning_en.md
+++ b/doc/fluid/dev/versioning_en.md
@@ -11,7 +11,7 @@ Each release has version of the following format: MAJOR.MINOR.PATCH
  * Minor version number change always maintain backward compatibility. It normally contains compatible improvements and bug fixes.
 
  * Patch number change is for bug fixes.
-g
+
  * Violation of the policy are considered as bugs and should be fixed.
 
 ### What is Covered

From 68e2125e5998c984871de4669093e7e3fc6ac309 Mon Sep 17 00:00:00 2001
From: xzl <zlx_hg@163.com>
Date: Mon, 17 Sep 2018 14:49:33 +0800
Subject: [PATCH 3/8] fix dso cmake typo

---
 cmake/tensorrt.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index 8f65a737c4..fa0e834a1d 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -18,7 +18,7 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
   if(WITH_DSO)
     set(TENSORRT_FOUND ON)
-  endif(WITH DSO)
+  endif(WITH_DSO)
 else()
     set(TENSORRT_FOUND OFF)
 endif()

From ec6ee0a2939c53f3d520c9ec51492e39bd1c33ee Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 17 Sep 2018 17:39:29 +0800
Subject: [PATCH 4/8] simplify and hide bcast_params

---
 paddle/fluid/framework/parallel_executor.cc | 45 +++------------------
 paddle/fluid/framework/parallel_executor.h  |  2 +-
 2 files changed, 7 insertions(+), 40 deletions(-)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 5b8c75a93d..48e440bda6 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -209,30 +209,9 @@ ParallelExecutor::ParallelExecutor(
 
 void ParallelExecutor::BCastParamsToDevices(
     const std::unordered_set<std::string> &vars) const {
-  // the initializing bcast, all vars would be bcast from device(0),
-  // otherwise
-  // bcast from the specified device.
-  bool initializing = member_->executor_ ? false : true;
+  // the initializing bcast, all vars would be bcast from device(0).
   for (auto &var : vars) {
-    int var_dev_id = -1;
-    if (member_->executor_) {
-      auto &sharded_var_device =
-          member_->executor_->Graph().Get<details::ShardedVarDevice>(
-              details::kShardedVarDevice);
-      if (sharded_var_device.find(var) != sharded_var_device.end()) {
-        var_dev_id = sharded_var_device.at(var);
-      }
-    }
-
-    if (!initializing && var_dev_id == -1) continue;
-
-    framework::Variable *main_var = nullptr;
-    if (initializing) {
-      main_var = member_->local_scopes_[0]->FindVar(var);
-    } else {
-      main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
-    }
-
+    framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
     if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
       continue;
     }
@@ -248,8 +227,7 @@ void ParallelExecutor::BCastParamsToDevices(
         auto place = member_->places_[i];
         void *buffer;
 
-        if ((initializing && i == 0) ||
-            (!initializing && static_cast<int>(i) == var_dev_id)) {
+        if (i == 0) {
           buffer = const_cast<void *>(main_tensor.data<void>());
         } else {
           auto local_scope = member_->local_scopes_[i];
@@ -266,29 +244,18 @@ void ParallelExecutor::BCastParamsToDevices(
         platform::NCCLGroupGuard guard;
         for (size_t i = 0; i < member_->places_.size(); ++i) {
           auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
-          if (initializing) {
-            platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
-                                         nccl_ctx.comm_, nccl_ctx.stream());
-          } else {
-            if (var_dev_id >= 0) {
-              platform::dynload::ncclBcast(buffers[i], numel, data_type,
-                                           var_dev_id, nccl_ctx.comm_,
-                                           nccl_ctx.stream());
-            }
-          }
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
         }
         member_->nccl_ctxs_->WaitAll();
       }
-
 #else
       PADDLE_THROW("Not compiled with CUDA");
 #endif
     } else {
       platform::CPUPlace cpu;
       for (size_t i = 0; i < member_->places_.size(); ++i) {
-        if ((initializing && i == 0) ||
-            (!initializing && static_cast<int>(i) == var_dev_id))
-          continue;
+        if (i == 0) continue;
 
         auto local_scope = member_->local_scopes_[i];
         auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 5fb748fa20..557d8be227 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -66,9 +66,9 @@ class ParallelExecutor {
   void Run(const std::vector<std::string> &fetch_tensors,
            const std::string &fetched_var_name);
 
+ private:
   void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
 
- private:
   ParallelExecutorPrivate *member_;
 };
 

From e5b322051b13811747bc5244a093cdb22caceeb6 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 17 Sep 2018 18:53:04 +0800
Subject: [PATCH 5/8] clean

---
 .../details/multi_devices_graph_pass.cc        |  3 +++
 .../details/multi_devices_graph_pass.h         | 18 ++++++++----------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 250e093a5f..8f319116ab 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -127,6 +127,9 @@ static const char kLocalScopes[] = "local_scopes";
 static const char kStrategy[] = "strategy";
 
 void MultiDevSSAGraphBuilder::Init() const {
+  all_vars_.clear();
+  balance_vars_.clear();
+
   loss_var_name_ = Get<const std::string>(kLossVarName);
   places_ = Get<const std::vector<platform::Place>>(kPlaces);
   local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 1ca8c4b855..47aaa80f4d 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -40,12 +40,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                          size_t device_id) const;
   void Init() const;
 
- private:
-  mutable std::string loss_var_name_;
-  mutable std::vector<platform::Place> places_;
-  mutable std::vector<Scope *> local_scopes_;
-  mutable std::unordered_set<std::string> grad_names_;
-
 #ifdef PADDLE_WITH_CUDA
   mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
@@ -95,13 +89,17 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   size_t GetAppropriateDeviceID(
       const std::vector<std::string> &var_names) const;
 
- private:
+  void SetCommunicationContext(OpHandleBase *op_handle,
+                               const platform::Place &p) const;
+
+  mutable std::string loss_var_name_;
+  mutable std::vector<platform::Place> places_;
+  mutable std::vector<Scope *> local_scopes_;
+  mutable std::unordered_set<std::string> grad_names_;
+
   mutable BuildStrategy strategy_;
   mutable std::unordered_map<std::string, VarDesc *> all_vars_;
   mutable std::vector<int64_t> balance_vars_;
-
-  void SetCommunicationContext(OpHandleBase *op_handle,
-                               const platform::Place &p) const;
 };
 }  // namespace details
 }  // namespace framework

From 6eeb063e8c266e605fc6c7d286d38d6558063847 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Mon, 17 Sep 2018 12:36:39 +0000
Subject: [PATCH 6/8] fix random failed

---
 python/paddle/fluid/tests/unittests/test_dist_transformer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
index e55f8707a9..47083ca7e9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -61,9 +61,7 @@ class TestDistTransformer2x2Sync(TestDistBase):
 
     def test_transformer(self):
         download_files()
-        #Note: loss on test dataset of the first 5 batch are:
-        # 10.518872, 10.518871, 10.518868, 10.518862, 10.518855
-        self.check_with_place("dist_transformer.py", delta=1e-7)
+        self.check_with_place("dist_transformer.py", delta=1e-5)
 
 
 class TestDistTransformer2x2Async(TestDistBase):

From 2d8984912561b6928f53351297b2af5b0f475379 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Tue, 18 Sep 2018 11:00:29 +0800
Subject: [PATCH 7/8] add WITH_INFERENCE_API_TEST option (#13425)

---
 CMakeLists.txt                                |   1 +
 paddle/fluid/inference/CMakeLists.txt         |   8 +-
 paddle/fluid/inference/api/CMakeLists.txt     |  21 ----
 .../fluid/inference/tests/api/CMakeLists.txt  | 102 ++++++++++--------
 .../api/anakin_mobilenet_tester.cc}           |   0
 .../api/anakin_rnn1_tester.cc}                |   0
 paddle/scripts/paddle_build.sh                |   4 +
 .../fluid/tests/unittests/CMakeLists.txt      |   2 +-
 8 files changed, 66 insertions(+), 72 deletions(-)
 rename paddle/fluid/inference/{api/api_anakin_engine_tester.cc => tests/api/anakin_mobilenet_tester.cc} (100%)
 rename paddle/fluid/inference/{api/api_anakin_engine_rnn_tester.cc => tests/api/anakin_rnn1_tester.cc} (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2fa5420e9..d43df124bd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,6 +69,7 @@ option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
+option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index efb91bcf75..6698efd1fa 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -17,9 +17,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 # paddle_fluid_origin exclude inference api interface
 cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 
-#if(APPLE)
-  add_subdirectory(api)
-#endif()
+add_subdirectory(api)
 
 # Create static library
 cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
@@ -57,5 +55,7 @@ endif()
 if(WITH_TESTING)
   # tests/book depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
-  add_subdirectory(tests/api)
+  if(WITH_INFERENCE_API_TEST)
+    add_subdirectory(tests/api)
+  endif()  
 endif()
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 5df486f345..e569df94c5 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -69,25 +69,4 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
     endfunction()
     anakin_target(inference_anakin_api)
     anakin_target(inference_anakin_api_shared)
-    if (WITH_TESTING)
-        # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-        set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-        set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-        set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-        execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
-        if(WITH_GPU)
-            set(anakin_test_extra_deps dynload_cuda)
-            set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-            execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-            cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
-                    ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
-                    DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
-        endif()
-        cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc 
-                ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
-                     --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
-                DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
-    endif(WITH_TESTING)
 endif()
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 2d89fa89e7..508ef1ce40 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -1,77 +1,87 @@
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo")
+set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
+set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
+    "A path setting inference demo download directories.")
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
-function (inference_download_and_uncompress install_dir filename)
-    message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}")
+function (inference_download install_dir url filename)
+    message(STATUS "Download inference test stuff from ${url}/${filename}")
     execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
     message(STATUS "finish downloading ${filename}")
-endfunction(inference_download_and_uncompress)
+endfunction()
+
+function (inference_download_and_uncompress install_dir url filename)
+    inference_download(${install_dir} ${url} ${filename})
+    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+endfunction()
 
 function(download_model_and_data install_dir model_name data_name)
-    if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
-        inference_download_and_uncompress(${install_dir} ${model_name})
-        inference_download_and_uncompress(${install_dir} ${data_name})
+    if (NOT EXISTS ${install_dir})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_name})
     endif()
 endfunction()
 
+function(inference_analysis_api_test target install_dir filename)
+    inference_analysis_test(${target} SRCS ${filename}
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
+endfunction()
+
 # RNN1
-# TODO: fix this test on MACOS
-message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
 if(NOT APPLE)
     set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
     download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
-    inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc 
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${RNN1_INSTALL_DIR}/model
-            --infer_data=${RNN1_INSTALL_DIR}/data.txt)
-endif(NOT APPLE)
+    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
+else()
+    # TODO: fix this test on MACOS, the reason is that
+    # fusion_seqexpand_concat_fc_op is not supported on MACOS
+    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
+endif()
 
 # RNN2
 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc
-    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-    ARGS --infer_model=${RNN2_INSTALL_DIR}/model
-         --infer_data=${RNN2_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
 
 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
 download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
-inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
-    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-    ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
-        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc)
 
 # lac
 set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
 download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-    ARGS --infer_model=${LAC_INSTALL_DIR}/model
-        --infer_data=${LAC_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
 
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
-    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/model
-         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
 
 # ocr
-set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")
-set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr")
-if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE)
-    get_filename_component(filename ${OCR_MODEL_URL} NAME)
-    message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}")
-    execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}")
-    execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}")
-    execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}")
-    message(STATUS "finish downloading ${filename}")
+set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
+if (NOT EXISTS ${OCR_INSTALL_DIR})
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+endif()
+inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+
+# anakin
+if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
+   # anakin rnn1
+   set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
+   set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
+   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
+   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
+   cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc 
+           ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
+                --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
+           DEPS inference_anakin_api_shared SERIAL)
+   # anakin mobilenet
+   if(WITH_GPU)
+       set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
+       inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
+       cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc 
+               ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
+               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+   endif()
 endif()
-inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc
-    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-    ARGS --infer_model=${OCR_INSTALL_DIR}/model
-        --infer_data=${OCR_INSTALL_DIR}/data.txt)
diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc
similarity index 100%
rename from paddle/fluid/inference/api/api_anakin_engine_tester.cc
rename to paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc
diff --git a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
similarity index 100%
rename from paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
rename to paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 2822d2d420..77b9b36e68 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -135,6 +135,8 @@ function cmake_gen() {
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
         -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
+        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
+        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
         -DPY_VERSION=${PY_VERSION:-2.7}
     ========================================
@@ -165,6 +167,8 @@ EOF
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
         -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
+        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
+        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} \
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
         -DPY_VERSION=${PY_VERSION:-2.7}
 }
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 35c4e996c5..958e72ce27 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -29,8 +29,8 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
 list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
 
-message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext")
 if(APPLE)
+    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext")
     # this op is not support on mac
     list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
     # TODO: add the unitest back when it fixed

From 72dd6b37d9a9a10a00e9c70436f68d7e19618770 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Tue, 18 Sep 2018 12:36:43 +0800
Subject: [PATCH 8/8] Add sequence_expand_as_op (#13420)

* Add sequence_expand_as_op

* follow comment
---
 paddle/fluid/API.spec                         |   1 +
 .../fluid/operators/sequence_expand_as_op.cc  | 168 ++++++++++++++++++
 .../fluid/operators/sequence_expand_as_op.cu  | 134 ++++++++++++++
 .../fluid/operators/sequence_expand_as_op.h   | 148 +++++++++++++++
 python/paddle/fluid/layers/nn.py              |  66 +++++++
 .../unittests/test_sequence_expand_as.py      |  77 ++++++++
 6 files changed, 594 insertions(+)
 create mode 100644 paddle/fluid/operators/sequence_expand_as_op.cc
 create mode 100644 paddle/fluid/operators/sequence_expand_as_op.cu
 create mode 100644 paddle/fluid/operators/sequence_expand_as_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_expand_as.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index e362d34864..fff03ffa67 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -116,6 +116,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
+paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
 paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
diff --git a/paddle/fluid/operators/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_expand_as_op.cc
new file mode 100644
index 0000000000..33c1e1c973
--- /dev/null
+++ b/paddle/fluid/operators/sequence_expand_as_op.cc
@@ -0,0 +1,168 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sequence_expand_as_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::LoDTensor;
+
+class SequenceExpandAsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceExpandAsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of SequenceExpandAsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceExpandAsOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = x_dims;
+
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Dimension number of Input(X) should be at least 2.");
+
+    if (ctx->IsRuntime()) {
+      framework::Variable* x_var =
+          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
+      framework::Variable* y_var =
+          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
+
+      auto& x_dim = x_var->Get<LoDTensor>().dims();
+      auto& y_lod = y_var->Get<LoDTensor>().lod();
+
+      PADDLE_ENFORCE_EQ(y_lod.size(), 1,
+                        "Level number of Input(Y)'s lod should be 1.");
+
+      PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dim[0]), y_lod[0].size() - 1,
+                        "The first dimension of Input(X) should be equal "
+                        "to the size of Input(Y)'s 0 level lod.");
+
+      int64_t out_first_dim = 0;
+      if (y_lod[0].size() <= 1) {
+        out_first_dim = x_dims[0];
+      } else {
+        for (size_t i = 1; i < y_lod[0].size(); ++i) {
+          out_first_dim += (y_lod[0][i] - y_lod[0][i - 1]);
+        }
+      }
+      out_dims[0] = out_first_dim;
+    } else {
+      out_dims[0] = -1;
+    }
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("Y", /*->*/ "Out");
+  }
+};
+
+class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) A LoDTensor with at least "
+             "2 dimensions. Its lod is not considered.");
+    AddInput("Y",
+             "(LoDTensor, default LoDTensor<float>) Reference LoDTensor whose "
+             "1-level lod decides how Input(X) is expanded.");
+    AddOutput("Out",
+              "(LoDTensor, default LoDTensor<float>) Output LoDTensor which is "
+              "generated from Input(X) by referring lod of Input(Y).");
+    AddComment(R"DOC(
+Sequence Expand As Operator.
+
+This operator expands `X` according to the zeroth level lod of `Y`. Current
+implementation requires the level number of Input(Y)'s lod to be 1, and the
+first dimension of Input(X) to be equal to the number of sequences in
+Input(Y)'s zeroth level lod. The lod of Input(X) is not considered.
+
+Following are cases to better explain how this works:
+
+Case 1:
+
+Given a 1-level LoDTensor input(X)
+    X.data = [[a], [b], [c], [d]]
+    X.dims = [4, 1]
+and input(Y)
+    Y.lod = [[0, 3, 6, 7, 8]]
+which holds 4 sequences of lengths 3, 3, 1 and 1,
+then we get 1-level LoDTensor
+    Out.lod =  [[0,            3,              6,  7,  8]]
+    Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
+    Out.dims = [8, 1]
+
+Case 2:
+
+Given a common Tensor input(X)
+    X.data = [[a, b], [c, d], [e, f]]
+    X.dims = [3, 2]
+and input(Y)
+    Y.lod = [[0, 2, 3, 6]]
+which holds 3 sequences of lengths 2, 1 and 3,
+then we get a common LoDTensor
+    Out.lod =  [[0,             2,      3,                    6]]
+    Out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
+    Out.dims = [6, 2]
+
+)DOC");
+  }
+};
+
+class SequenceExpandAsOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    // The grad kernel reads the lod of Y, so Y must be present as well.
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+      ctx->ShareLoD("X", x_grad_name);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp,
+                  ops::SequenceExpandAsOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    sequence_expand_as,
+    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_expand_as_grad,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext,
+                                    int64_t>);
diff --git a/paddle/fluid/operators/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_expand_as_op.cu
new file mode 100644
index 0000000000..7357f5ae6e
--- /dev/null
+++ b/paddle/fluid/operators/sequence_expand_as_op.cu
@@ -0,0 +1,134 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/operators/sequence_expand_as_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+static __global__ void sequence_expand_as_kernel(const T *in_data,
+                                                 const size_t *expand_offset,
+                                                 const size_t src_height,
+                                                 const size_t src_width,
+                                                 T *out_data) {
+  // Row h_id of the input fills output rows [offset[h_id], offset[h_id + 1]).
+  for (size_t h_id = blockIdx.x; h_id < src_height; h_id += gridDim.x) {
+    const size_t span = expand_offset[h_id + 1] - expand_offset[h_id];
+    if (span == 0) continue;
+    const T *src = in_data + h_id * src_width;
+    // size_t keeps the flat element offset from overflowing a 32-bit int.
+    T *dst = out_data + expand_offset[h_id] * src_width;
+    for (size_t w_id = threadIdx.x; w_id < src_width; w_id += blockDim.x) {
+      const T ele = src[w_id];
+      for (size_t k = 0; k < span; ++k) dst[k * src_width + w_id] = ele;
+    }
+  }
+}
+
+template <typename T>
+static __global__ void sequence_expand_as_grad_kernel(
+    const T *dout_data, const size_t *expand_offset, const size_t dst_height,
+    const size_t dst_width, T *dx_data) {
+  // dx row h_id is the sum of dout rows [offset[h_id], offset[h_id + 1]).
+  for (size_t h_id = blockIdx.x; h_id < dst_height; h_id += gridDim.x) {
+    T *dst = dx_data + h_id * dst_width;
+    const size_t span = expand_offset[h_id + 1] - expand_offset[h_id];
+    // size_t keeps the flat element offset from overflowing a 32-bit int.
+    const T *src = dout_data + expand_offset[h_id] * dst_width;
+    for (size_t w_id = threadIdx.x; w_id < dst_width; w_id += blockDim.x) {
+      T result = 0;
+      for (size_t k = 0; k < span; ++k) {
+        result += src[k * dst_width + w_id];
+      }
+      dst[w_id] = result;
+    }
+  }
+}
+
+template <typename T>
+struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+  void operator()(
+      const platform::CUDADeviceContext &context, const LoDTensor &x,
+      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+      LoDTensor *out) {
+    int hight = x.dims()[0];
+    int width = framework::product(x.dims()) / hight;
+
+    const int kThreadsPerBlock = 1024;
+    int thread_x = kThreadsPerBlock;
+    if (width < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      thread_x = ((width + 31) >> 5) << 5;
+    }
+
+    int max_threads = context.GetMaxPhysicalThreadCount();
+    int block_x = std::max(max_threads / thread_x, 1);
+
+    dim3 block_size(thread_x);
+    dim3 grid_size(block_x);
+    sequence_expand_as_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+        x.data<T>(), ref_lod.CUDAData(context.GetPlace()), hight, width,
+        out->mutable_data<T>(context.GetPlace()));
+  }
+};
+
+template <typename T>
+struct SequenceExpandAsGradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext &context,
+                  const LoDTensor &dout,
+                  const framework::Vector<size_t> &ref_lod, /*expand based lod*/
+                  LoDTensor *dx) {
+    int hight = dx->dims()[0];
+    int width = framework::product(dx->dims()) / hight;
+
+    const int kThreadsPerBlock = 1024;
+    int thread_x = kThreadsPerBlock;
+    if (width < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      thread_x = ((width + 31) >> 5) << 5;
+    }
+
+    int max_threads = context.GetMaxPhysicalThreadCount();
+    int block_x = std::max(max_threads / thread_x, 1);
+
+    dim3 block_size(thread_x);
+    dim3 grid_size(block_x);
+    sequence_expand_as_grad_kernel<<<grid_size, block_size, 0,
+                                     context.stream()>>>(
+        dout.data<T>(), ref_lod.CUDAData(context.GetPlace()), hight, width,
+        dx->mutable_data<T>(context.GetPlace()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand_as,
+    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand_as_grad,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext,
+                                    int64_t>);
diff --git a/paddle/fluid/operators/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_expand_as_op.h
new file mode 100644
index 0000000000..42c90d01c0
--- /dev/null
+++ b/paddle/fluid/operators/sequence_expand_as_op.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <numeric>  // std::iota
+#include <sstream>
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+struct SequenceExpandFunctor {
+  void operator()(
+      const DeviceContext &ctx, const framework::LoDTensor &x,
+      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+      framework::LoDTensor *out);
+};
+
+template <typename DeviceContext, typename T>
+struct SequenceExpandAsGradFunctor {
+  void operator()(
+      const DeviceContext &ctx, const framework::LoDTensor &dout,
+      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+      framework::LoDTensor *dx);
+};
+
+template <typename T>
+struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
+  void operator()(
+      const platform::CPUDeviceContext &context, const framework::LoDTensor &x,
+      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+      framework::LoDTensor *out) {
+    int64_t height = x.dims()[0];
+    int64_t width = framework::product(x.dims()) / height;
+
+    const T *in_data = x.data<T>();
+    T *out_data = out->mutable_data<T>(context.GetPlace());
+    // Row h_id of x is copied to out rows [ref_lod[h_id], ref_lod[h_id + 1]).
+    for (int64_t h_id = 0; h_id < height; ++h_id) {
+      size_t span = ref_lod[h_id + 1] - ref_lod[h_id];
+      if (span == 0) continue;
+      const T *src = in_data + h_id * width;
+      T *dst = out_data + ref_lod[h_id] * width;
+      for (int64_t w_id = 0; w_id < width; ++w_id) {
+        T ele = src[w_id];
+        for (size_t k = 0; k < span; ++k) {
+          dst[k * width + w_id] = ele;
+        }
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceExpandAsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<framework::LoDTensor>("X");
+    auto *y = context.Input<framework::LoDTensor>("Y");
+    auto *out = context.Output<framework::LoDTensor>("Out");
+    // Only the level-0 lod of Y is passed on to the expand functor.
+    auto &y_lod = y->lod();
+    PADDLE_ENFORCE_EQ(y_lod.size(), 1, "Input(Y)'s lod level should be 1.");
+    PADDLE_ENFORCE_GT(y_lod[0].size(), 1, "Input(Y)'s lod has no sequence.");
+
+    out->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    SequenceExpandFunctor<DeviceContext, T> seq_expand_functor;
+    seq_expand_functor(dev_ctx, *x, y_lod[0], out);
+  }
+};
+
+/*
+ * Each row of Grad(X) is the sum of the Grad(Out) rows it was expanded to.
+ * Given Grad(Out)
+ *    Grad(Out).lod = [[0,              3,            6]]
+ *    Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
+ * Then
+ *    Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)]
+ *                 = [0.6, 1.5]
+ *    Grad(X).lod = Input(X).lod
+ *
+ */
+template <typename T>
+struct SequenceExpandAsGradFunctor<platform::CPUDeviceContext, T> {
+  void operator()(
+      const platform::CPUDeviceContext &context,
+      const framework::LoDTensor &dout,
+      const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+      framework::LoDTensor *dx) {
+    int64_t hight = dx->dims()[0];
+    int64_t width = framework::product(dx->dims()) / hight;
+
+    const T *dout_data = dout.data<T>();
+    T *dx_data = dx->mutable_data<T>(context.GetPlace());
+
+    for (int64_t h_id = 0; h_id < hight; ++h_id) {
+      T *dst = dx_data + h_id * width;
+      size_t span = ref_lod[h_id + 1] - ref_lod[h_id];
+      for (int64_t w_id = 0; w_id < width; ++w_id) {
+        T result = 0;
+        for (size_t k = 0; k < span; ++k) {
+          size_t offset = (ref_lod[h_id] + k) * width;
+          result += dout_data[offset + w_id];
+        }
+        dst[w_id] = result;
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceExpandAsGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *g_out =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto *y = context.Input<framework::LoDTensor>("Y");
+    auto *g_x =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+
+    g_x->mutable_data<T>(context.GetPlace());
+
+    SequenceExpandAsGradFunctor<DeviceContext, T> functor;
+    functor(context.template device_context<DeviceContext>(), *g_out,
+            y->lod()[0], g_x);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 3ae0fac4be..3bc3acabee 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -54,6 +54,7 @@ __all__ = [
     'conv2d_transpose',
     'conv3d_transpose',
     'sequence_expand',
+    'sequence_expand_as',
     'sequence_pad',
     'lstm_unit',
     'reduce_sum',
@@ -2666,6 +2667,71 @@ def sequence_expand(x, y, ref_level=-1, name=None):
     return tmp
 
 
+def sequence_expand_as(x, y, name=None):
+    """Sequence Expand As Layer. This layer will expand the input variable **x**
+    according to the zeroth level lod of **y**. Current implementation requires
+    the level number of Input(Y)'s lod to be 1, and the first dimension of
+    Input(X) to be equal to the number of sequences in Input(Y)'s zeroth level
+    lod. The lod of Input(X) is not considered.
+
+    Following examples will explain how sequence_expand_as works:
+
+    .. code-block:: text
+
+        * Case 1:
+
+            Given a 1-level LoDTensor input(X)
+                X.data = [[a], [b], [c], [d]]
+                X.dims = [4, 1]
+            and input(Y)
+                Y.lod = [[0, 3, 6, 7, 8]]
+            which holds 4 sequences of lengths 3, 3, 1 and 1,
+            then we get 1-level LoDTensor
+                Out.lod =  [[0,            3,              6,  7,  8]]
+                Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
+                Out.dims = [8, 1]
+
+        * Case 2:
+
+            Given a common Tensor input(X)
+                X.data = [[a, b], [c, d], [e, f]]
+                X.dims = [3, 2]
+            and input(Y)
+                Y.lod = [[0, 2, 3, 6]]
+            which holds 3 sequences of lengths 2, 1 and 3,
+            then we get a common LoDTensor
+                Out.lod =  [[0,             2,      3,                    6]]
+                Out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
+                Out.dims = [6, 2]
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a LoDTensor.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The expanded variable which is a LoDTensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[10, 20],
+                             dtype='float32', lod_level=1)
+            out = fluid.layers.sequence_expand_as(x=x, y=y)
+    """
+    helper = LayerHelper('sequence_expand_as', input=x, **locals())
+    dtype = helper.input_dtype()
+    tmp = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sequence_expand_as',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': tmp})
+    return tmp
+
+
 @templatedoc()
 def sequence_pad(x, pad_value, maxlen=None):
     """
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py b/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py
new file mode 100644
index 0000000000..4ac97f7ed4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py
@@ -0,0 +1,77 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSequenceExpandAs(OpTest):
+    def setUp(self):
+        self.op_type = 'sequence_expand_as'
+        self.set_data()
+        self.compute()
+
+    def set_data(self):
+        x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
+        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
+        y_lod = [[1, 3, 4]]
+        self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
+
+    def compute(self):
+        x = self.inputs['X']
+        x_data, x_lod = x if type(x) == tuple else (x, None)
+        y_data, y_lod = self.inputs['Y']
+
+        assert len(y_lod) == 1 and len(y_lod[0]) == x_data.shape[0]
+
+        repeats = []
+        for i in range(len(y_lod[0])):
+            repeat_num = y_lod[0][i]
+            if repeat_num == 0:
+                continue
+            repeats.extend([i for _ in range(repeat_num)])
+
+        out_data = x_data[repeats]
+        self.outputs = {'Out': (out_data, y_lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestSequenceExpandAsCase1(TestSequenceExpandAs):
+    def set_data(self):
+        x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
+        x_lod = [[2, 3]]
+        y_data = np.random.uniform(0.1, 1, [10, 1]).astype('float32')
+        y_lod = [[2, 2, 0, 3, 3]]
+        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+
+
+class TestSequenceExpandAsCase2(TestSequenceExpandAs):
+    def set_data(self):
+        x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
+        x_lod = [[1]]
+        y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
+        y_lod = [[2]]
+        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+
+
+if __name__ == '__main__':
+    unittest.main()