Compare commits

62 Commits

Author SHA1 Message Date
swtkiwi 13e906c162 delete matplotlib
6 years ago
danleifeng 3f8746842b
nccl init support hostname and ip; test=develop (#28163)
6 years ago
Wilber c008ae696a
update. (#28166)
6 years ago
lidanqing 56aba4b747
add FLAGS_use_mkldnn check (#28147)
6 years ago
wangguanzhong 86887b9b1f
Cherry pick fix generate proposals labels (#28165)
6 years ago
wangguanzhong 5178d9f832
support multiclass nms for multi-batch, test=develop (#28164)
6 years ago
liu zhengxi 5d94a5ca54
fix dynamic decode imperative (#28162)
6 years ago
李灿 4193ae0d02
fix bugs test=develop (#28158)
6 years ago
Zhou Wei dc53ba879f
fix dynamic_loader more safe and error message on windows (#28144)
6 years ago
WangXi 739043c6a6
【paddle.fleet】fleet add _get_applied_meta_list and _get_applied_graph_list, test=develop (#27952) (#28053)
6 years ago
Double_V 39745d448c
Roi align kl (#28128)
6 years ago
LielinJiang 172e76a852
transform add pil backend (#28132)
6 years ago
tianshuo78520a f63f8d730c
Add build paddle inference (#28137)
6 years ago
李灿 2477ccc378
fix bugs test=develop (#28113)
6 years ago
danleifeng 61162497b4
raise error if use multi-cards in fleet non_distributed mode;test=develop (#28093)
6 years ago
LielinJiang d4160941f3
fix dataloader (#28105)
6 years ago
yinhaofeng 0b294906f9
lookup_table_v2_op_xpu report errors;test=kunlun (#28064) (#28100)
6 years ago
yinhaofeng ea45fb90a4
【cherry-pick】xpu adam op (#28031) (#28097)
6 years ago
Double_V 4f43d51f43
add rois_num params for roi_align_xpu op, test=kunlun (#28094)
6 years ago
lilong12 c4e18dc093
build gloo from source code instead of using the pre-compiled library (#27930) (#28009)
6 years ago
Kaipeng Deng 6375ad39cc
[cherry-pick] lr scheduler epoch2step (#28056)
6 years ago
TeslaZhao 11adb0f373
[cherry-pick] Add xpu transpose2 op.test=kunlun (#28096)
6 years ago
lilong12 957e6fbe5f
add doc for ReduceOp (#28051) (#28082)
6 years ago
Chengmo 91727ac899
Fix xpu error message (#28061) (#28092)
6 years ago
liuyuhui 5c2852a330
[API 2.0: doc] transfer from paddle.fluid.layers.assign() into creation.py (#27999) (#28074)
6 years ago
huangxu96 6bb6cb27b8
Allclose op (#27891) (#28069)
6 years ago
xiaoting 905b076553
rm max_input in conv2d for kunlun, test=kunlun (#28063)
6 years ago
Double_V 8600f47439
error message opt for XPU, test=kunlun (#27972) (#28078)
6 years ago
Pei Yang d89deae9e0
reduce trt warning message (#28011) (#28075)
6 years ago
lilong12 316c97c7eb
put gloo initialization log to file (#27969) (#28076)
6 years ago
pangyoki 46a1f69b3c
cherry pick 27861 Add truncated_gaussian_random XPU kernel, test=kunlun (#28060)
6 years ago
pangyoki b21409e0c0
cherry pick 27853 Add gaussian_random XPU kernels, test=kunlun (#28059)
6 years ago
pangyoki 69ec13cdf0
cherry pick 27846 Add uniform_random XPU kernel, test=kunlun (#28057)
6 years ago
pangyoki 386429beb0
cherry pick 27946 Fix error message of multinomial op (#28080)
6 years ago
MRXLT e3513a6395
[cherry pick] Fix fleet (#28067)
6 years ago
LutaoChu e3a88eb450
Fix diag OP bug on Windows Python3.8, cherry-pick from #28034
6 years ago
littletomatodonkey edbaa027fe
fix pad api (#28045)
6 years ago
liuyuhui 77eddf9168
add cast/concat/assign xpu op (#27911) (#28050)
6 years ago
Kaipeng Deng f05b184f72
update yolo_box support h != w. test=develop (#28054)
6 years ago
Zhou Wei e9a1669e92
fix optimizer init (#27994)
6 years ago
xiaoting 5c1babde56
[cherry-pick] polish kunlun error message for 2.0 rc (#28048)
6 years ago
Guo Sheng 3f565903e8
[cherry-pick] Incorporate cudnn_lstm into LSTM api (#27217) (#28023)
6 years ago
Jack Zhou a6c18075d6
add xpu error message description in some ops
6 years ago
chentianyu03 2ca53fa65f
change paddle.fluid.layers.reduce_sum to paddle.sum in sample codes (#27998) (#28017)
6 years ago
LiuChiachi 4316bd4d5a
Clean text.py and decode.py for API 2.0 (#26853) (#27958)
6 years ago
lidanqing ea76fe310c
[oneDNN] Conv dilation support (#27914) (#28028)
6 years ago
tangwei12 c0550b54a5
Feature/large scale kv save base/delta (#27470) (#27990)
6 years ago
Aurelius84 772c266822
[Dy2Stat] Fix Error when generating train_program in eval mode (#27975) (#28019)
6 years ago
chentianyu03 77866effeb
change paddle.fluid.data to paddle.static.data in sample code (#27992) (#28014)
6 years ago
chentianyu03 cafa35e1ea
Change reduce mean (#27997) (#28004)
6 years ago
chentianyu03 3251f9c1eb
change paddle.fluid.layers.fill_constant to paddle.full in sample codes (#27993) (#28000)
6 years ago
Steffy-zxf 164b9aabc1
update code examples for paddle.sums
6 years ago
mapingshuo 50d24899cf
fix kunlun kernel of reshape op (#27989)
6 years ago
Zhou Wei b57254ed61
[cherry-pick2.0]Add tensor clone 2.0 (#27982)
6 years ago
123malin c0061ff56f
【paddle.fleet】geo send sparse optimize (#27719) (#27979)
6 years ago
Guanghua Yu 51dd268cfe
error message optimization in mean_xpu,softmax_with_cross_entropy_op_xpu,test=kunlun (#27968)
6 years ago
Feiyu Chan 429c0b62b1
support channel last in BatchNorm*d (#27961)
6 years ago
mapingshuo 39c31a20e5
reshape support bool, test=develop (#27944) (#27971)
6 years ago
Qinghe JING 1f45c06e92
add reduce xpu op test=develop;test=kunlun (#27960)
6 years ago
Yiqun Liu ef2b12f11c
Reimplement paddle.utils.install_check. (#27771) (#27963)
6 years ago
tianshuo78520a b7fd4f9224
fix norm code format error (#27973)
6 years ago
Thunderbrook ba6a29070a
fix slice doc (#27941) (#27976)
6 years ago

@@ -193,10 +193,19 @@ if(WITH_BRPC_RDMA)
endif()
endif()
# lite subgraph compilation depends on CUDNN_ROOT,
# so include(cudnn) needs to be in front of include(third_party/lite)
include(cudnn) # set cudnn libraries, must be before configure
include(third_party) # download, build, install third_party
if(WITH_GPU)
include(cuda)
# lite subgraph compilation depends on CUDNN_ROOT,
# so include(cudnn) needs to be in front of include(third_party/lite)
include(cudnn) # set cudnn libraries, must be before configure
include(tensorrt)
# there is no official support of nccl, cupti in windows
if(NOT WIN32)
include(cupti)
endif()
endif()
include(third_party) # download, build, install third_party; contains 20+ dependencies
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
@@ -209,18 +218,8 @@ if(WITH_DISTRIBUTE)
endif()
endif()
# there is no official support of nccl, cupti in windows
if(NOT WIN32)
include(cupti)
endif()
include(flags) # set paddle compile flags
if(WITH_GPU)
include(cuda)
include(tensorrt)
endif()
if(WITH_PROFILER)
find_package(Gperftools REQUIRED)
include_directories(${GPERFTOOLS_INCLUDE_DIR})

@@ -198,7 +198,9 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
endif()
add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"")
add_definitions("-DCUDA_VERSION_MAJOR=\"${CUDA_VERSION_MAJOR}\"")
add_definitions("-DCUDA_VERSION_MINOR=\"${CUDA_VERSION_MINOR}\"")
add_definitions("-DCUDA_TOOLKIT_ROOT_DIR=\"${CUDA_TOOLKIT_ROOT_DIR}\"")
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
@@ -249,3 +251,4 @@ endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

@@ -35,17 +35,18 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
)
set(CUDNN_LIB_NAME "")
if (LINUX)
set(CUDNN_LIB_NAME "libcudnn.so")
set(CUDNN_LIB_NAME "libcudnn.so")
endif(LINUX)
if(WIN32)
# only support cudnn7
set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
# only support cudnn7
set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
endif(WIN32)
if(APPLE)
set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
endif(APPLE)
find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
@@ -88,7 +89,7 @@ macro(find_cudnn_version cudnn_header_file)
if(NOT CUDNN_MAJOR_VERSION)
set(CUDNN_VERSION "???")
else()
add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
add_definitions("-DCUDNN_MAJOR_VERSION=\"${CUDNN_MAJOR_VERSION}\"")
math(EXPR CUDNN_VERSION
"${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
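
The math() above packs the cuDNN version into one integer as major * 1000 + minor * 100 + patchlevel, so 7.6.5 becomes 7605. A minimal C++ sketch of the round trip (illustrative only, not Paddle code):

#include <cstdio>

int main() {
  // encode 7.6.5 the same way the CMake math() above does
  int version = 7 * 1000 + 6 * 100 + 5;  // -> 7605
  // decode by reversing the arithmetic
  std::printf("%d.%d.%d\n", version / 1000, (version % 1000) / 100,
              version % 100);
  return 0;
}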

@@ -14,55 +14,40 @@
INCLUDE(ExternalProject)
execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
SET(GLOO_PROJECT "extern_gloo")
IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
MESSAGE(STATUS "use pre defined download url")
SET(GLOO_VER "master" CACHE STRING "" FORCE)
SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
else()
SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
endif()
ENDIF()
MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
SET(GLOO_SOURCE_DIR "${THIRD_PARTY_PATH}/gloo")
SET(GLOO_DOWNLOAD_DIR "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
SET(GLOO_DST_DIR "gloo")
SET(GLOO_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(GLOO_INSTALL_DIR ${GLOO_INSTALL_ROOT}/${GLOO_DST_DIR})
SET(GLOO_ROOT ${GLOO_INSTALL_DIR})
SET(GLOO_INC_DIR ${GLOO_ROOT}/include)
SET(GLOO_LIB_DIR ${GLOO_ROOT}/lib)
SET(GLOO_LIB ${GLOO_LIB_DIR}/libgloo.a)
#SET(GLOO_IOMP_LIB ${GLOO_LIB_DIR}/libiomp5.so) #todo what is this
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${GLOO_ROOT}/lib")
INCLUDE_DIRECTORIES(${GLOO_INC_DIR})
FILE(WRITE ${GLOO_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(GLOO)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${GLOO_NAME}/include ${GLOO_NAME}/lib \n"
" DESTINATION ${GLOO_DST_DIR})\n")
SET(GLOO_PREFIX_DIR ${THIRD_PARTY_PATH}/gloo)
SET(GLOO_SOURCE_DIR ${THIRD_PARTY_PATH}/gloo/src/extern_gloo/gloo)
SET(GLOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gloo)
SET(GLOO_INCLUDE_DIR "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE)
SET(GLOO_LIBRARY_DIR "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE)
SET(GLOO_REPOSITORY https://github.com/sandyhouse/gloo.git)
SET(GLOO_TAG v0.0.2)
SET(GLOO_LIBRARIES "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library." FORCE)
INCLUDE_DIRECTORIES(${GLOO_INCLUDE_DIR})
cache_third_party(extern_gloo
REPOSITORY ${GLOO_REPOSITORY}
TAG ${GLOO_TAG}
DIR GLOO_SOURCE_DIR)
ExternalProject_Add(
${GLOO_PROJECT}
extern_gloo
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${GLOO_SOURCE_DIR}
DOWNLOAD_DIR ${GLOO_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${GLOO_URL} -c -q -O ${GLOO_NAME}.tar.gz
&& tar zxvf ${GLOO_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
${SHALLOW_CLONE}
"${GLOO_DOWNLOAD_CMD}"
PREFIX "${GLOO_PREFIX_DIR}"
SOURCE_DIR "${GLOO_SOURCE_DIR}"
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOO_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOO_INSTALL_ROOT}
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
)
ADD_LIBRARY(gloo SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIB})
ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
ADD_DEPENDENCIES(gloo ${GLOO_PROJECT})

@@ -22,7 +22,7 @@ if(XPU_SDK_ROOT)
set(LITE_WITH_XPU ON)
include_directories("${XPU_SDK_ROOT}/XTDK/include")
include_directories("${XPU_SDK_ROOT}/XTCL/include")
add_definitions(-DPADDLE_WITH_XPU)
add_definitions(-DLITE_SUBGRAPH_WITH_XPU)
LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/")
LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/")
endif()

@@ -84,19 +84,6 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "do not perform " + type() + "+bias fuse";
return;
}
if (conv->Op()->HasAttr("dilations")) {
auto dilations =
BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
for (const auto& d : dilations) {
if (d != 1) {
LOG(WARNING)
<< "dilation conv not supported in MKLDNN, fuse not apply "
<< "and set conv attribute use_mkldnn = false";
conv->Op()->SetAttr("use_mkldnn", false);
return;
}
}
}
auto* eltwise_bias_tensor =
scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();

@@ -84,6 +84,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
size);
}
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
@@ -285,6 +291,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
size);
}
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
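
Both hunks add the missing pinned-to-pinned branch: a copy whose source and destination are both page-locked host memory. A rough CUDA-runtime equivalent of that branch (illustrative only; Paddle routes this through its own memory::Copy overloads):

#include <cuda_runtime.h>
#include <cstddef>

// Pinned memory lives on the host, so pinned -> pinned is a host-to-host copy.
cudaError_t CopyPinnedToPinned(void* dst, const void* src, size_t bytes) {
  return cudaMemcpy(dst, src, bytes, cudaMemcpyHostToHost);
}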

@@ -100,7 +100,19 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(port);
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) {
char *ip = NULL;
struct hostent *hp;
if ((hp = gethostbyname(host.c_str())) == NULL) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));
}
int i = 0;
while (hp->h_addr_list[i] != NULL) {
ip = inet_ntoa(*(struct in_addr *)hp->h_addr_list[i]);
VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip;
break;
}
if (inet_pton(AF_INET, ip, &serv_addr.sin_addr) <= 0) {
PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep));
}

@@ -16,6 +16,7 @@
// network header files
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
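
The change above lets SendNCCLID accept either an IPv4 literal or a hostname: when inet_pton fails, the host is resolved with gethostbyname and the first returned address is used. A self-contained sketch of that fallback, assuming a POSIX system (ResolveHost is a hypothetical helper, not Paddle's API):

#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <cstring>
#include <string>

// Returns true and fills *addr if `host` is an IPv4 literal or a resolvable name.
bool ResolveHost(const std::string& host, struct in_addr* addr) {
  if (inet_pton(AF_INET, host.c_str(), addr) > 0) {
    return true;  // already dotted-quad, e.g. "127.0.0.1"
  }
  // fallback mirrors the patch: gethostbyname, first A record wins
  struct hostent* hp = gethostbyname(host.c_str());
  if (hp == nullptr || hp->h_addr_list[0] == nullptr) {
    return false;  // name lookup failed
  }
  std::memcpy(addr, hp->h_addr_list[0], sizeof(struct in_addr));
  return true;
}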

@@ -20,6 +20,8 @@
#include "paddle/fluid/imperative/infer_shape_context.h"
#include "paddle/fluid/imperative/infer_var_type_context.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace imperative {
@@ -91,8 +93,10 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
// MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
// GetKernelType functions, so we need to copy the attributes there.
// Const qualifier of Attrs had to be discarded to overwrite it.
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
if (FLAGS_use_mkldnn) {
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
}
#endif
auto expected_kernel_key =
op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
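
DECLARE_bool(use_mkldnn) imports a gflags flag defined elsewhere, and the attribute copy is now guarded so the const_cast write only happens when oneDNN is actually enabled. A minimal sketch of that gflags declare/define split, assuming gflags is available (standalone, not Paddle's files):

#include <gflags/gflags.h>

DEFINE_bool(use_mkldnn, false, "Enable the oneDNN (MKL-DNN) code path.");
// In other translation units: DECLARE_bool(use_mkldnn);

void MaybeCopyAttrs() {
  if (FLAGS_use_mkldnn) {
    // copy operator attributes only when the oneDNN path is active
  }
}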

@@ -20,7 +20,7 @@ namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
imperative::ParallelStrategy GetStrategy(int local_rank) {
std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
imperative::ParallelStrategy strategy;
strategy.trainer_endpoints_ = eps;
strategy.current_endpoint_ = eps[local_rank];

@@ -375,7 +375,7 @@ void AnalysisConfig::Update() {
}
if (use_xpu_) {
#ifndef PADDLE_WITH_XPU
#ifndef LITE_SUBGRAPH_WITH_XPU
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an XPU device, but Paddle was not compiled "
"with XPU-runtime."));

@@ -4,6 +4,6 @@ endif()
cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS})
cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context ${XPU_DEPS})
cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)

@@ -16,7 +16,7 @@
#define LITE_WITH_CUDA 1
#endif
#ifdef PADDLE_WITH_XPU
#ifdef LITE_SUBGRAPH_WITH_XPU
#define LITE_WITH_XPU 1
#endif
@@ -59,7 +59,7 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
cfg.cpu_math_library_num_threads);
#endif
#ifdef PADDLE_WITH_XPU
#ifdef LITE_SUBGRAPH_WITH_XPU
lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
cfg.xpu_l3_workspace_size);
#endif

@@ -13,12 +13,49 @@
// limitations under the License.
#include "paddle/fluid/operators/allclose_op.h"
#include <cmath>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
template <typename T>
struct GetTensorValue<platform::CPUDeviceContext, T> {
T operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor& tensor) const {
return *(tensor.data<T>());
}
};
template <typename T>
struct AllcloseFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& other,
const double rtol, const double atol, bool equal_nan,
framework::Tensor* output) {
auto* in_a = in.data<T>();
auto* in_b = other.data<T>();
auto* out_data = output->mutable_data<bool>(ctx.GetPlace());
auto num = in.numel();
*out_data = true;
for (int i = 0; i < num; i++) {
const T a = in_a[i], b = in_b[i];
bool val;
if (std::isnan(a) || std::isnan(b)) {
val = equal_nan && std::isnan(a) == std::isnan(b);
} else {
T left = (a > b ? a - b : b - a);
T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
T diff = (left > right ? left - right : right - left);
val = a == b || left <= right || diff <= 1e-15;
}
*out_data &= val;
}
}
};
class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
@@ -26,12 +63,9 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
"The input tensor, it's data type should be float32, float64.");
AddInput("Other",
"The input tensor, it's data type should be float32, float64.");
AddInput("Rtol", "The relative tolerance.");
AddInput("Atol", "The absolute tolerance.");
AddOutput("Out", "The output tensor, it's data type is bool.");
AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .")
.SetDefault(1e-5);
AddAttr<float>("atol", "The absolute tolerance. Default: :math:`1e-8` .")
.SetDefault(1e-8);
AddAttr<bool>("equal_nan",
"If :math:`True` , then two :math:`NaNs` will be "
"compared as equal. Default: :math:`False` .")
@@ -54,16 +88,12 @@
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
platform::errors::NotFound(
"Input(Input) of allclose op should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Other"), true,
platform::errors::NotFound(
"Input(Other) of allclose op should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::NotFound(
"The output(Out) of allclose op must not be null."));
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose");
OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose");
OP_INOUT_CHECK(ctx->HasInput("Rtol"), "Input", "Rtol", "Allclose");
OP_INOUT_CHECK(ctx->HasInput("Atol"), "Input", "Atol", "Allclose");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose");
auto input_dim = ctx->GetInputDim("Input");
auto other_dim = ctx->GetInputDim("Other");
@@ -96,7 +126,7 @@ class AllcloseOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
ctx.device_context());
@@ -105,7 +135,7 @@ class AllcloseOp : public framework::OperatorWithKernel {
class AllcloseOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
void operator()(framework::InferVarTypeContext* ctx) const override {
ctx->SetOutputDataType("Out", framework::proto::VarType::BOOL);
}
};

@@ -12,12 +12,70 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#define EIGEN_USE_GPU
#include <cuda_runtime.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/allclose_op.h"
namespace paddle {
namespace operators {
template <typename T>
struct GetTensorValue<platform::CUDADeviceContext, T> {
T operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& tensor) const {
const T* data = tensor.data<T>();
T value;
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T),
dev_ctx.stream());
return value;
}
};
template <typename T>
__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data,
const double rtol, const double atol,
bool equal_nan, int num, bool* out_data) {
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
bool val;
for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
const T a = in_data[i], b = other_data[i];
if (isnan(a) || isnan(b)) {
val = equal_nan && isnan(a) == isnan(b);
} else {
T left = (a > b ? a - b : b - a);
T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
T diff = (left > right ? left - right : right - left);
val = a == b || left <= right || diff <= 1e-15;
}
if (!val) *out_data = false;
}
}
template <typename T>
struct AllcloseFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& in, const framework::Tensor& other,
const double rtol, const double atol, bool equal_nan,
framework::Tensor* output) {
int num = in.numel();
const T* in_data = in.data<T>();
const T* other_data = other.data<T>();
bool* out_data = output->mutable_data<bool>(dev_ctx.GetPlace());
int block = 1024;
int grid = (block - 1 + num) / block;
grid = (grid > block) ? block : grid;
cudaMemset(out_data, true, sizeof(bool));
AllcloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, other_data, rtol, atol, equal_nan, num, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel<CUDA, float>,

@@ -22,38 +22,38 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
struct GetTensorValue {
T operator()(const platform::DeviceContext& ctx,
const framework::Tensor& tensor) const;
};
template <typename DeviceContext, typename T>
struct AllcloseFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& other, const float rtol,
const float atol, bool equal_nan, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
class AllcloseKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// get attrs
float rtol = ctx.Attr<float>("rtol");
float atol = ctx.Attr<float>("atol");
bool equal_nan = ctx.Attr<bool>("equal_nan");
// get input/output
auto* input = ctx.Input<Tensor>("Input");
auto* other = ctx.Input<Tensor>("Other");
const auto* input = ctx.Input<Tensor>("Input");
const auto* other = ctx.Input<Tensor>("Other");
const auto* rtol = ctx.Input<Tensor>("Rtol");
const auto* atol = ctx.Input<Tensor>("Atol");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
// get place
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto input_v = framework::EigenVector<T>::Flatten(*input);
auto other_v = framework::EigenVector<T>::Flatten(*other);
auto out_v = framework::EigenScalar<bool>::From(*out);
auto left = (input_v - other_v).abs();
auto right = static_cast<T>(atol) + static_cast<T>(rtol) * other_v.abs();
auto compare_res = left <= right;
if (equal_nan) {
auto input_nan = input_v.isnan();
auto other_nan = other_v.isnan();
out_v.device(place) =
(input_nan == other_nan).all() && (compare_res != input_nan).all();
} else {
out_v.device(place) = compare_res.all();
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
GetTensorValue<DeviceContext, double> get_tensor_value;
double rtol_v = get_tensor_value(dev_ctx, *rtol);
double atol_v = get_tensor_value(dev_ctx, *atol);
AllcloseFunctor<DeviceContext, T>()(dev_ctx, *input, *other, rtol_v, atol_v,
equal_nan, out);
}
};
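
The kernel now reads rtol and atol from input tensors (fetched to the host by GetTensorValue) and defers the elementwise check to AllcloseFunctor. A host-side reference of the predicate the functors implement, |a - b| <= atol + rtol * |b| with optional NaN equality (a minimal sketch; the extra diff <= 1e-15 slack in the device kernels is omitted here):

#include <cmath>
#include <cstddef>
#include <vector>

bool AllcloseRef(const std::vector<double>& a, const std::vector<double>& b,
                 double rtol, double atol, bool equal_nan) {
  if (a.size() != b.size()) return false;
  for (size_t i = 0; i < a.size(); ++i) {
    if (std::isnan(a[i]) || std::isnan(b[i])) {
      // NaNs compare equal only when equal_nan is set and both sides are NaN
      if (!(equal_nan && std::isnan(a[i]) && std::isnan(b[i]))) return false;
    } else if (std::fabs(a[i] - b[i]) > atol + rtol * std::fabs(b[i])) {
      return false;  // outside the combined absolute/relative tolerance
    }
  }
  return true;
}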

@@ -54,7 +54,7 @@ class AssignFunctor {
out_rows.set_height(rows.height());
auto &t = rows.value();
auto *m = out_rows.mutable_value();
framework::TensorCopy(t, dev_ctx_.GetPlace(), dev_ctx_, m);
framework::TensorCopy(t, t.place(), m);
}
template <typename T>
@@ -70,7 +70,7 @@ class AssignFunctor {
framework::LoDTensor *out) const {
if (lod_tensor.numel() == 0) return;
auto &out_tensor = *out;
TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor);
out_tensor.set_lod(lod_tensor.lod());
}

@@ -0,0 +1,161 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/assign_op.h"
#include <string>
namespace paddle {
namespace framework {
class OpDesc;
class Variable;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
class AssignOp : public framework::OperatorWithKernel {
public:
AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->HasInput("X")) {
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::SELECTED_ROWS ||
type == framework::proto::VarType::LOD_TENSOR) {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
} else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
if (ctx->IsRuntime()) {
// The runtime output shape is determined in kernel.
return;
} else {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
}
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override {
return framework::OpKernelType(expected_kernel_type.data_type_,
expected_kernel_type.place_,
tensor.layout());
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const framework::Variable *var = ctx.InputVar("X");
if (var->IsType<framework::LoDTensorArray>()) {
auto t_arr = var->Get<framework::LoDTensorArray>();
// NOTE(liym27): Support an empty tensor array as Input.
// And set the kernel type to float.
if (t_arr.size() == 0) {
return framework::OpKernelType(framework::proto::VarType::FP32,
ctx.device_context());
}
}
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context());
}
};
class AssignInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
ctx->SyncTypeAndDataType("X", "Out");
}
};
class AssignKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *x = ctx.InputVar("X");
if (x == nullptr) {
return;
}
PADDLE_ENFORCE_EQ(
ctx.HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of assign_op is not found."));
auto *out = ctx.OutputVar("Out");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(ctx.GetPlace());
framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
}
};
class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
"could be LoDTensor, SelectedRows or LoDTensorArray.")
.AsDispensable();
AddOutput("Out",
"(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
"is the same as input X.");
AddComment(R"DOC(Assign Operator
Out = X, when type in [LoDTensor/SelectedRows/LoDTensorArray]
raise error if the type is not listed above.
)DOC");
}
};
template <typename T>
class AssignGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("assign");
op->SetInput("X", this->OutputGrad("Out"));
op->SetOutput("Out", this->InputGrad("X"));
}
};
DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"});
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
ops::AssignKernel, int, ops::AssignKernel,
int64_t, ops::AssignKernel, bool,
ops::AssignKernel);
#endif

@@ -19,6 +19,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/norm_utils.h"
@@ -41,127 +42,6 @@ template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 4, 1, 2, 3};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 3, 1, 2};
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, transformed_input, axis);
} else if (dim == 1) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 1};
math::Transpose<DeviceContext, T, 3> trans3;
trans3(dev_ctx, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 3, 4, 1};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 3, 1};
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, transformed_input, axis);
} else if (dim == 1) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 1};
math::Transpose<DeviceContext, T, 3> trans3;
trans3(dev_ctx, *input, transformed_input, axis);
}
}
class BatchNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;

@@ -0,0 +1,69 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/cast_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename InT>
class CastXPUKernel : public framework::OpKernel<InT> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto in_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("in_dtype"));
auto out_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("out_dtype"));
auto* in_data = in->data<InT>();
auto numel = in->numel();
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = -1;
if (out_type == framework::proto::VarType::FP32) {
auto* out_data = out->mutable_data<float>(context.GetPlace());
r = xpu::cast<InT, float>(dev_ctx.x_context(), in_data, out_data, numel);
} else if (out_type == framework::proto::VarType::INT32) {
auto* out_data = out->mutable_data<int>(context.GetPlace());
r = xpu::cast<InT, int>(dev_ctx.x_context(), in_data, out_data, numel);
} else if (out_type == framework::proto::VarType::INT64) {
auto* out_data = out->mutable_data<int64_t>(context.GetPlace());
r = xpu::cast<InT, int64_t>(dev_ctx.x_context(), in_data, out_data,
numel);
} else {
PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d",
in_type, out_type));
}
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
cast, ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int>,
ops::CastXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int64_t>);
#endif

@@ -67,6 +67,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
}
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
bool use_align = context.Attr<bool>("use_align");
if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
@@ -93,7 +94,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
context.Attr<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
context.GetPlace());
context.GetPlace(), use_align);
// Alloc the continuous space
auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
@@ -111,8 +112,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);
offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype;
offset +=
use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
}
} else if (context.Attr<bool>("set_constant")) {
math::SetConstant<DeviceContext, T> set_constant;
@@ -131,8 +135,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
->ShareDataWith(fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
.Resize(dim);
len = platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype;
len = use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
offset += len;
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
<< " address: " << out_tensors[i]->data<void>() << ", ";
@@ -144,7 +150,8 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
void GetMemSizeAndDtype(
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
const size_t &size_of_dtype, const platform::Place &place) const {
const size_t &size_of_dtype, const platform::Place &place,
const bool use_align = true) const {
PADDLE_ENFORCE_EQ(
lod_tensors.size(), var_names.size(),
platform::errors::InvalidArgument(
@@ -167,9 +174,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", ";
*numel += platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
place) /
size_of_dtype;
*numel += use_align
? platform::Alignment(
static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype
: static_cast<size_t>(size);
}
VLOG(10) << ss.str();
@@ -223,6 +232,10 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
"Whether to check the name of Input and Output to ensure "
"they are the same separately.")
.SetDefault(false);
AddAttr<bool>("use_align",
"Whether to consider memory chunk and take alignment into "
"account for inputs and outputs.")
.SetDefault(true);
AddComment(R"DOC(
CoalesceTensor Operator.
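
With use_align on (the default), each chunk's length is rounded up so every fused sub-tensor starts on a platform-specific boundary; with it off, tensors pack back to back. A sketch of that arithmetic (the 256-byte alignment is a hypothetical stand-in for platform::Alignment):

#include <cstddef>

// Length, in elements, that one tensor occupies inside the fused buffer.
size_t ChunkLen(size_t numel, size_t size_of_dtype, bool use_align,
                size_t alignment = 256) {
  if (!use_align) return numel;  // pack tensors contiguously
  size_t bytes = numel * size_of_dtype;
  // round the byte count up to the next multiple of `alignment`
  size_t aligned_bytes = (bytes + alignment - 1) / alignment * alignment;
  return aligned_bytes / size_of_dtype;  // rounded-up length in elements
}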

@@ -0,0 +1,185 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include <memory>
#include <string>
#include <vector>
#ifdef PADDLE_WITH_MKLDNN
#include <paddle/fluid/platform/mkldnn_helper.h>
#endif
#ifdef PADDLE_WITH_XPU
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class ConcatXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<framework::Tensor>("X");
framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
int axis = ctx.Attr<int>("axis");
PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument(
"The input should not be null."));
PADDLE_ENFORCE_NE(ctx.HasInput("AxisTensor"), true,
platform::errors::InvalidArgument(
"XPU donot surpport AxisTensor for now"));
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
PADDLE_ENFORCE_GE(
axis, 0, platform::errors::InvalidArgument("concat: axis should >= 0!"));
PADDLE_ENFORCE_LT(axis, ins[0]->dims().size(),
platform::errors::InvalidArgument(
"concat: axis shoud < ins[0]->dims()!"));
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
std::vector<int> choose_idx;
int n = 0;
for (unsigned int i = 0; i < ins.size(); ++i) {
if (ins[i] && ins[i]->numel() > 0) {
choose_idx.push_back(i);
n++;
}
}
PADDLE_ENFORCE_LE(n, 8, platform::errors::InvalidArgument(
"XPU only surpport at most 8 tensors for now"));
PADDLE_ENFORCE_GT(
n, 0, platform::errors::InvalidArgument("No tensor need concat?"));
int h = 1;
int w_except_axis = 1;
for (int i = 0; i < axis; ++i) {
h *= (ins[choose_idx[0]]->dims())[i];
}
for (int i = axis + 1; i < ins[0]->dims().size(); ++i) {
w_except_axis *= (ins[choose_idx[0]]->dims())[i];
}
for (int i = 1; i < n; ++i) {
int hh = 1;
int ww = 1;
for (int j = 0; j < axis; ++j) {
hh *= (ins[choose_idx[i]]->dims())[j];
}
for (int j = axis + 1; j < ins[i]->dims().size(); ++j) {
ww *= (ins[choose_idx[i]]->dims())[j];
}
PADDLE_ENFORCE_EQ(hh, h, platform::errors::InvalidArgument(
"concat: h should be eual!"));
PADDLE_ENFORCE_EQ(ww, w_except_axis,
platform::errors::InvalidArgument(
"concat: w should be eual except for axis!"));
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
std::unique_ptr<int[]> in_w_host(new int[n]);
std::unique_ptr<const float* []> ptrs(new const float*[n]);
for (int i = 0; i < n; ++i) {
ptrs[i] = ins[choose_idx[i]]->data<T>();
in_w_host[i] = w_except_axis * (ins[choose_idx[i]]->dims())[axis];
}
int r =
xpu::concat<float>(dev_ctx.x_context(), h, (const int*)in_w_host.get(),
n, (const float**)ptrs.get(), out->data<T>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
};
template <typename DeviceContext, typename T>
class ConcatGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<framework::LoDTensor>("X");
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
auto outs =
ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
{
auto dx = outs;
auto x = ins;
for (size_t i = 0; i < dx.size(); ++i) {
if (dx[i] != nullptr) {
dx[i]->set_lod(x[i]->lod());
}
}
}
PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument(
"The input should not be null."));
auto axis = ctx.Attr<int>("axis");
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
axis = GetDataFromTensor<int>(axis_tensor)[0];
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
// get output tensor that the name is not kEmptyVarName
std::vector<framework::Tensor*> outputs;
for (size_t j = 0; j < outs.size(); ++j) {
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs.push_back(outs[j]);
} else {
outputs.push_back(nullptr);
}
}
PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument(
"concat_grad: axis shoud >= 0!"));
PADDLE_ENFORCE_LT(axis, out_grad->dims().size(),
platform::errors::InvalidArgument(
"concat_grad: axis shoud < ins[0]->dims()!"));
auto out_grad_stride = framework::stride_numel(out_grad->dims());
int n = outputs.size();
PADDLE_ENFORCE_LE(n, 16,
platform::errors::InvalidArgument(
"XPU only surpport at most 16 tensors for now"));
int h = out_grad_stride[0] / out_grad_stride[axis];
auto& dev_ctx = ctx.template device_context<DeviceContext>();
std::unique_ptr<int[]> in_w_host(new int[n]);
std::unique_ptr<float* []> ptrs(new float*[n]);
for (int i = 0; i < n; ++i) {
auto out_stride = framework::stride_numel(outputs[i]->dims());
ptrs[i] = outputs[i]->data<T>();
in_w_host[i] = out_stride[axis];
}
int r = xpu::concat_grad(dev_ctx.x_context(), h, in_w_host.get(), n,
reinterpret_cast<float**>(ptrs.get()),
out_grad->data<T>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
concat, ops::ConcatXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
concat_grad,
ops::ConcatGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
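
The XPU concat treats every input as an h x w matrix: dimensions before `axis` collapse into h (which must match across inputs) and dimensions after it into w_except_axis, so each input contributes a row block of width w_except_axis * dims[axis]. A sketch of that flattening (helper name is illustrative):

#include <cstdint>
#include <vector>

void FlattenAroundAxis(const std::vector<int64_t>& dims, int axis,
                       int64_t* h, int64_t* w_except_axis) {
  *h = 1;
  *w_except_axis = 1;
  for (int i = 0; i < axis; ++i) *h *= dims[i];  // leading dims -> rows
  for (size_t i = axis + 1; i < dims.size(); ++i) *w_except_axis *= dims[i];
  // this input's row width in the concatenated output:
  // w_except_axis * dims[axis]
}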

@@ -20,6 +20,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
@@ -138,102 +139,6 @@ inline bool IsExpand(const std::vector<int64_t>& filter_dim,
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 4, 1, 2, 3};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 3, 1, 2};
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 3, 4, 1};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 3, 1};
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, transformed_input, axis);
}
}
// Define Op classes in .h file so that other conv
// operator implementations can reuse the code.
class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {

@@ -27,10 +27,10 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
// that avoids modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
Tensor* max_input = context.Output<Tensor>("MaxInput");
Tensor* max_filter = context.Output<Tensor>("MaxFilter");
max_input->mutable_data<T>(context.GetPlace());
max_filter->mutable_data<T>(context.GetPlace());
// Tensor* max_input = context.Output<Tensor>("MaxInput");
// Tensor* max_filter = context.Output<Tensor>("MaxFilter");
// max_input->mutable_data<T>(context.GetPlace());
// max_filter->mutable_data<T>(context.GetPlace());
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
@@ -47,33 +47,47 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
dilations[0] == 1 && dilations[1] == 1, true,
platform::errors::InvalidArgument("XPU only support dilation == 1."));
auto& dev_ctx = context.template device_context<DeviceContext>();
PADDLE_ENFORCE_EQ(
xpu::findmax(dev_ctx.x_context(), input->data<T>(), input->numel(),
max_input->data<T>()) == xpu::Error_t::SUCCESS,
true, platform::errors::InvalidArgument("XPU kernel error!"));
PADDLE_ENFORCE_EQ(
xpu::findmax(dev_ctx.x_context(), filter.data<T>(), filter.numel(),
max_filter->data<T>()) == xpu::Error_t::SUCCESS,
true, platform::errors::InvalidArgument("XPU kernel error!"));
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), input->data<T>(), input->numel(),
// max_input->data<T>()) == xpu::Error_t::SUCCESS,
// true, platform::errors::InvalidArgument(
// "XPU conv kernel error,can not finde max_input,please "
// "check whether Baidu Kunlun "
// "Card is properly installed."));
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), filter.data<T>(), filter.numel(),
// max_filter->data<T>()) == xpu::Error_t::SUCCESS,
// true, platform::errors::InvalidArgument(
// "XPU conv kernel error,can not find max_filter,please "
// "check whether Baidu Kunlun "
// "Card is properly installed."));
if (groups == 1) {
int r = xpu::conv2d_forward_int16<float, float, float, float>(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, input->data<float>(), filter.data<float>(),
output->data<float>(), nullptr, nullptr, xpu::Activation_t::LINEAR,
// nullptr, nullptr);
max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::InvalidArgument("XPU kernel error!"));
nullptr, nullptr);
// max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
} else {
int r = xpu::conv2d_int16_with_group<float, float, float>(
dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
output->data<float>(), batch_size, img_c, img_h, img_w, f, win_h,
win_w, groups, strides[0], strides[1], paddings[0], paddings[1],
// nullptr, nullptr);
max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::InvalidArgument("XPU kernel error!"));
nullptr, nullptr);
// max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
}
};
@@ -82,9 +96,9 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* max_input = context.Input<Tensor>("MaxInput");
const Tensor* max_filter = context.Input<Tensor>("MaxFilter");
Tensor* max_output_grad = context.Output<Tensor>("MaxOutputGrad");
// const Tensor* max_input = context.Input<Tensor>("MaxInput");
// const Tensor* max_filter = context.Input<Tensor>("MaxFilter");
// Tensor* max_output_grad = context.Output<Tensor>("MaxOutputGrad");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
@@ -119,34 +133,45 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
filter_grad->mutable_data<T>(context.GetPlace());
}
auto& dev_ctx = context.template device_context<DeviceContext>();
max_output_grad->Resize({4});
max_output_grad->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_EQ(
xpu::findmax(dev_ctx.x_context(), output_grad->data<T>(),
output_grad->numel(),
max_output_grad->data<T>()) == xpu::Error_t::SUCCESS,
true, platform::errors::InvalidArgument("XPU kernel error!"));
// max_output_grad->Resize({4});
// max_output_grad->mutable_data<T>(context.GetPlace());
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), output_grad->data<T>(),
// output_grad->numel(),
// max_output_grad->data<T>()) == xpu::Error_t::SUCCESS,
// true,
// platform::errors::External(
// "XPU conv kernel error, can not find max_output_grad, please
// check "
// "whether Baidu Kunlun Card is "
// "properly installed."));
if (input_grad) {
int r = xpu::conv2d_backward_int16(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, output_grad->data<float>(),
filter.data<float>(), input_grad->data<float>(),
// nullptr, nullptr,
max_output_grad->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::InvalidArgument("XPU kernel error!"));
filter.data<float>(), input_grad->data<float>(), nullptr, nullptr);
// max_output_grad->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
if (filter_grad) {
int r = xpu::conv2d_backward_weight_int16(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, output_grad->data<float>(),
input->data<float>(), filter_grad->data<float>(),
// nullptr, nullptr,
max_output_grad->data<float>(), max_input->data<float>());
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::InvalidArgument("XPU kernel error!"));
input->data<float>(), filter_grad->data<float>(), nullptr, nullptr);
// max_output_grad->data<float>(), max_input->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
}
};

Some files were not shown because too many files have changed in this diff.
