[CustomOp] Support complex dtype in custom op (#31657)
* support custom complex op * fix detail error * add inference support * fix setup windows failed
parent fe241fd02f
commit 87852616aa
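A minimal usage sketch of the new complex dtype support (illustrative only: it assumes the op below has been compiled and loaded as `custom_ops` via paddle.utils.cpp_extension.load, as the test file does):

    import numpy as np
    import paddle

    paddle.set_device("cpu")
    # complex64 input; the custom kernel conjugates each element
    x = paddle.to_tensor(np.random.random([2, 3]).astype(np.complex64))
    out = custom_ops.custom_conj(x)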
@@ -0,0 +1,94 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <iostream>
#include <vector>

#include "paddle/extension.h"

#define CHECK_INPUT(x) \
  PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")

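// ConjFunctor is specialized via SFINAE (std::enable_if): the complex
// specialization negates the imaginary part, while the non-complex
// specialization reduces to an element-wise copy.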
template <typename data_t>
using EnableComplex = typename std::enable_if<
    std::is_same<data_t, paddle::complex64>::value ||
    std::is_same<data_t, paddle::complex128>::value>::type;

template <typename data_t>
using DisableComplex = typename std::enable_if<
    !std::is_same<data_t, paddle::complex64>::value &&
    !std::is_same<data_t, paddle::complex128>::value>::type;

template <typename data_t, typename Enable = void>
struct ConjFunctor;

template <typename data_t>
struct ConjFunctor<data_t, EnableComplex<data_t>> {
  ConjFunctor(const data_t* input, int64_t numel, data_t* output)
      : input_(input), numel_(numel), output_(output) {}

  void operator()(size_t idx) const {
    output_[idx] = data_t(input_[idx].real, -input_[idx].imag);
  }

  const data_t* input_;
  int64_t numel_;
  data_t* output_;
};

template <typename data_t>
struct ConjFunctor<data_t, DisableComplex<data_t>> {
  ConjFunctor(const data_t* input, int64_t numel, data_t* output)
      : input_(input), numel_(numel), output_(output) {}

  void operator()(size_t idx) const { output_[idx] = input_[idx]; }

  const data_t* input_;
  int64_t numel_;
  data_t* output_;
};

template <typename data_t>
void ConjCPUKernel(const data_t* x_data, int64_t numel, data_t* out_data) {
  ConjFunctor<data_t> conj(x_data, numel, out_data);
  for (int64_t i = 0; i < numel; ++i) {
    conj(i);
  }
}

std::vector<paddle::Tensor> ConjFunction(const paddle::Tensor& x) {
  CHECK_INPUT(x);

  paddle::Tensor out(x.place());
  out.reshape(x.shape());

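  // PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES instantiates the lambda for
  // float, double, complex64 and complex128, binding data_t to the
  // concrete element type in each instantiation.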
  PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
      x.type(), "ConjCPUKernel", ([&] {
        ConjCPUKernel<data_t>(
            x.data<data_t>(), x.size(), out.mutable_data<data_t>());
      }));

  return {out};
}

PD_BUILD_OP(custom_conj)
    .Inputs({"X"})
    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(ConjFunction));

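// The backward op reuses ConjFunction: the gradient of X is the conjugate
// of the gradient of Out, so the forward kernel applies unchanged.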
PD_BUILD_GRAD_OP(custom_conj)
    .Inputs({paddle::Grad("Out")})
    .Outputs({paddle::Grad("X")})
    .SetKernelFn(PD_KERNEL(ConjFunction));
@@ -0,0 +1,136 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest
import numpy as np

import paddle
import paddle.static as static
from paddle.utils.cpp_extension import load, get_build_directory
from paddle.utils.cpp_extension.extension_utils import run_cmd
from utils import paddle_includes, extra_cc_args, extra_nvcc_args

# Because Windows doesn't use docker, the shared lib already exists in the
# cache dir; it will not be recompiled unless the shared lib is removed.
file = '{}\\custom_conj_jit\\custom_conj_jit.pyd'.format(
    get_build_directory())
if os.name == 'nt' and os.path.isfile(file):
    cmd = 'del {}'.format(file)
    run_cmd(cmd, True)

custom_ops = load(
    name='custom_conj_jit',
    sources=['custom_conj_op.cc'],
    extra_include_paths=paddle_includes,  # add for Coverage CI
    extra_cxx_cflags=extra_cc_args,  # test for cc flags
    extra_cuda_cflags=extra_nvcc_args,  # test for nvcc flags
    verbose=True)

def is_complex(dtype):
    return dtype == paddle.fluid.core.VarDesc.VarType.COMPLEX64 or \
        dtype == paddle.fluid.core.VarDesc.VarType.COMPLEX128


def to_complex(dtype):
    if dtype == "float32":
        return np.complex64
    elif dtype == "float64":
        return np.complex128
    else:
        return dtype


def conj_dynamic(func, dtype, np_input):
    paddle.set_device("cpu")
    x = paddle.to_tensor(np_input)
    out = func(x)
    out.stop_gradient = False
    sum_out = paddle.sum(out)
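    # backward() requires a real-valued scalar loss, so for a complex output
    # back-propagate from the real part of the sum.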
    if is_complex(sum_out.dtype):
        sum_out.real().backward()
    else:
        sum_out.backward()
    return out.numpy(), x.grad


def conj_static(func, shape, dtype, np_input):
    paddle.enable_static()
    paddle.set_device("cpu")
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=shape, dtype=dtype)
            x.stop_gradient = False
            out = func(x)
            sum_out = paddle.sum(out)
            static.append_backward(sum_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

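            # append_backward creates gradient variables named "<name>@GRAD",
            # so x's gradient is fetched as x.name + "@GRAD".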
            out_v, x_grad_v = exe.run(static.default_main_program(),
                                      feed={"x": np_input},
                                      fetch_list=[out.name, x.name + "@GRAD"])
    paddle.disable_static()
    return out_v, x_grad_v


class TestCustomConjJit(unittest.TestCase):
    def setUp(self):
        self.dtypes = ['float32', 'float64']
        self.shape = [2, 20, 2, 3]

    def check_output(self, out, pd_out, name):
        self.assertTrue(
            np.array_equal(out, pd_out),
            "custom op {}: {},\n paddle api {}: {}".format(name, out, name,
                                                           pd_out))

    def run_dynamic(self, dtype, np_input):
        out, x_grad = conj_dynamic(custom_ops.custom_conj, dtype, np_input)
        pd_out, pd_x_grad = conj_dynamic(paddle.conj, dtype, np_input)

        self.check_output(out, pd_out, "out")
        self.check_output(x_grad, pd_x_grad, "x's grad")

    def run_static(self, dtype, np_input):
        out, x_grad = conj_static(custom_ops.custom_conj, self.shape, dtype,
                                  np_input)
        pd_out, pd_x_grad = conj_static(paddle.conj, self.shape, dtype,
                                        np_input)

        self.check_output(out, pd_out, "out")
        self.check_output(x_grad, pd_x_grad, "x's grad")

    def test_dynamic(self):
        for dtype in self.dtypes:
            np_input = np.random.random(self.shape).astype(dtype)
            self.run_dynamic(dtype, np_input)

    def test_static(self):
        for dtype in self.dtypes:
            np_input = np.random.random(self.shape).astype(dtype)
            self.run_static(dtype, np_input)

    # complex dtypes are only supported in dynamic mode for now
    def test_complex_dynamic(self):
        for dtype in self.dtypes:
            np_input = np.random.random(self.shape).astype(
                dtype) + 1j * np.random.random(self.shape).astype(dtype)
            self.run_dynamic(to_complex(dtype), np_input)


if __name__ == "__main__":
    unittest.main()