[2.0Custom OP]Support New Custom OP on Windows (#31063)

* [2.0.1]Support New Custom OP on windows * fix CI * fix code style * fix CI * fix CI * fix coverage * fix CI * fix CI
5 years ago · adaec0073d
parent 2168f08ac8
commit adaec0073d
20 changed files with 523 additions and 203 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -335,6 +335,8 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

+add_definitions(-DPADDLE_DLL_EXPORT)
+
 if(ON_INFER)
    # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF
    message(STATUS "On inference mode, will take place some specific optimization.")
--- a/paddle/fluid/extension/include/all.h
+++ b/paddle/fluid/extension/include/all.h
@ -18,6 +18,12 @@ limitations under the License. */
 #error C++11 or later compatible compiler is required to use Paddle.
 #endif

+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX  // MSVC min/max macros conflict with std::min/std::max
+#endif
+#endif
+
 #include "paddle/fluid/extension/include/dispatch.h"
 #include "paddle/fluid/extension/include/dtype.h"
 #include "paddle/fluid/extension/include/op_meta_info.h"
--- a/paddle/fluid/extension/include/dll_decl.h
+++ b/paddle/fluid/extension/include/dll_decl.h
@ -0,0 +1,27 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#if defined(_WIN32)  // PD_DLL_DECL: DLL import/export annotation
+#ifndef PD_DLL_DECL  // keep a PD_DLL_DECL defined before this header
+#ifdef PADDLE_DLL_EXPORT
+#define PD_DLL_DECL __declspec(dllexport)  // PADDLE_DLL_EXPORT is defined
+#else
+#define PD_DLL_DECL __declspec(dllimport)  // PADDLE_DLL_EXPORT is not defined
+#endif  // PADDLE_DLL_EXPORT
+#endif  // PD_DLL_DECL
+#else
+#define PD_DLL_DECL  // expands to nothing when _WIN32 is not defined
+#endif  // _WIN32
--- a/paddle/fluid/extension/include/op_meta_info.h
+++ b/paddle/fluid/extension/include/op_meta_info.h
@ -14,12 +14,14 @@ limitations under the License. */

 #pragma once

+#include <iostream>
 #include <string>
 #include <unordered_map>
 #include <vector>

 #include <boost/any.hpp>

+#include "paddle/fluid/extension/include/dll_decl.h"
 #include "paddle/fluid/extension/include/tensor.h"

 /**
@ -31,7 +33,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
-class OpMetaInfoHelper;
+class PD_DLL_DECL OpMetaInfoHelper;
 }  // namespace framework

 using Tensor = paddle::Tensor;
@ -43,6 +45,26 @@ using Tensor = paddle::Tensor;
  classname& operator=(const classname&) = delete; \
  classname& operator=(classname&&) = delete

+#if defined _WIN32
+#define HANDLE_THE_ERROR try {
+#define END_HANDLE_THE_ERROR            \
+  }                                     \
+  catch (const std::exception& e) {     \
+    std::cerr << e.what() << std::endl; \
+    throw e;                            \
+  }
+#else
+#define HANDLE_THE_ERROR
+#define END_HANDLE_THE_ERROR
+#endif
+
+#define PD_THROW(err_msg)              \
+  do {                                 \
+    HANDLE_THE_ERROR                   \
+    throw std::runtime_error(err_msg); \
+    END_HANDLE_THE_ERROR               \
+  } while (0)
+
 ///////////////// Util Define and Function ////////////////

 inline std::string Grad(const std::string& var_name) {
@ -106,7 +128,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
                                                            attr_idx + 1>(
            inputs, attrs, pargs..., arg);
      } catch (boost::bad_any_cast&) {
-        throw std::runtime_error(
+        PD_THROW(
            "Attribute cast error in custom operator. Expected int value.");
      }
    }
@ -220,7 +242,7 @@ struct InferDtypeFuncImpl<Return (*)(Args...), impl_fn> {

 ////////////////////// Op Meta Info //////////////////////

-class OpMetaInfo {
+class PD_DLL_DECL OpMetaInfo {
 public:
  explicit OpMetaInfo(const std::string& op_name) : name_(op_name) {}
  OpMetaInfo& Inputs(std::vector<std::string>&& inputs);
@ -246,7 +268,7 @@ class OpMetaInfo {

 //////////////// Op Meta Info Map /////////////////

-class OpMetaInfoMap {
+class PD_DLL_DECL OpMetaInfoMap {
 public:
  // this function's impl should keep in header file.
  // if move to cc file, meta info can not be added
@ -270,14 +292,14 @@ class OpMetaInfoMap {

 //////////////// Op Meta Info Builder /////////////////

-class OpMetaInfoBuilder {
+class PD_DLL_DECL OpMetaInfoBuilder {
 public:
  explicit OpMetaInfoBuilder(std::string&& name);
  OpMetaInfoBuilder& Inputs(std::vector<std::string>&& inputs);
  OpMetaInfoBuilder& Outputs(std::vector<std::string>&& outputs);
-  OpMetaInfoBuilder& SetKernelFn(KernelFunc&& func);
-  OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc&& func);
-  OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc&& func);
+  OpMetaInfoBuilder& SetKernelFn(KernelFunc func);
+  OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc func);
+  OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc func);
  OpMetaInfoBuilder& SetBackwardOp(const std::string& bwd_op_name);

 private:
@ -317,8 +339,12 @@ void LoadCustomOperatorLib(const std::string& dso_name);
 extern "C" {
 #endif

+#if defined(_WIN32)
 // C-API to get global OpMetaInfoMap.
-paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap();
+__declspec(dllexport) inline paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() {
+  return paddle::OpMetaInfoMap::Instance();
+}
+#endif  // _WIN32

 #ifdef __cplusplus
 }
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@ -16,6 +16,7 @@ limitations under the License. */

 #include <memory>
 #include <vector>
+#include "paddle/fluid/extension/include/dll_decl.h"
 #include "paddle/fluid/extension/include/dtype.h"
 #include "paddle/fluid/extension/include/place.h"

@ -23,7 +24,7 @@ namespace paddle {
 namespace framework {
 class CustomTensorUtils;
 }  // namespace framework
-class Tensor {
+class PD_DLL_DECL Tensor {
 public:
  /// \brief Construct a Tensor on target Place for CustomOp.
  /// Generally it's only used for user to create Tensor.
--- a/paddle/fluid/extension/src/op_meta_info.cc
+++ b/paddle/fluid/extension/src/op_meta_info.cc
@ -78,17 +78,17 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::Outputs(
  return *this;
 }

-OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc&& func) {
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) {
  info_ptr_->SetKernelFn(std::forward<KernelFunc>(func));
  return *this;
 }

-OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc&& func) {
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) {
  info_ptr_->SetInferShapeFn(std::forward<InferShapeFunc>(func));
  return *this;
 }

-OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc&& func) {
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) {
  info_ptr_->SetInferDtypeFn(std::forward<InferDtypeFunc>(func));
  return *this;
 }
@ -114,10 +114,17 @@ void LoadCustomOperatorLib(const std::string& dso_name) {
 }
 }  // namespace paddle

+#ifdef __cplusplus
 extern "C" {
+#endif

+#ifndef _WIN32
+// C-API to get global OpMetaInfoMap.
 paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() {
  return paddle::OpMetaInfoMap::Instance();
 }
+#endif

+#ifdef __cplusplus
 }  // end extern "C"
+#endif
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@ -207,73 +207,87 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
  return target;
 }

-template Tensor Tensor::copy_to<paddle::platform::float16>(
+template PD_DLL_DECL Tensor
+Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::bfloat16>(
    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::bfloat16>(
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>(
    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::complex64>(
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>(
    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<paddle::platform::complex128>(
-    const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<float>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<double>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
-template Tensor Tensor::copy_to<bool>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<float>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<double>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<bool>(const PlaceType &target_place) const;

-template float *Tensor::data<float>() const;
-template double *Tensor::data<double>() const;
-template int64_t *Tensor::data<int64_t>() const;
-template int32_t *Tensor::data<int32_t>() const;
-template uint8_t *Tensor::data<uint8_t>() const;
-template int8_t *Tensor::data<int8_t>() const;
-template paddle::platform::float16 *Tensor::data<paddle::platform::float16>()
-    const;
-template paddle::platform::bfloat16 *Tensor::data<paddle::platform::bfloat16>()
-    const;
-template paddle::platform::complex128 *
+template PD_DLL_DECL float *Tensor::data<float>() const;
+template PD_DLL_DECL double *Tensor::data<double>() const;
+template PD_DLL_DECL int64_t *Tensor::data<int64_t>() const;
+template PD_DLL_DECL int32_t *Tensor::data<int32_t>() const;
+template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
+template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::data<paddle::platform::float16>() const;
+template PD_DLL_DECL paddle::platform::bfloat16 *
+Tensor::data<paddle::platform::bfloat16>() const;
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::data<paddle::platform::complex128>() const;
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::data<paddle::platform::complex64>() const;
-template int16_t *Tensor::data<int16_t>() const;
-template bool *Tensor::data<bool>() const;
+template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
+template PD_DLL_DECL bool *Tensor::data<bool>() const;

-template float *Tensor::mutable_data<float>();
-template double *Tensor::mutable_data<double>();
-template int64_t *Tensor::mutable_data<int64_t>();
-template int32_t *Tensor::mutable_data<int32_t>();
-template uint8_t *Tensor::mutable_data<uint8_t>();
-template int8_t *Tensor::mutable_data<int8_t>();
-template paddle::platform::float16 *
+template PD_DLL_DECL float *Tensor::mutable_data<float>();
+template PD_DLL_DECL double *Tensor::mutable_data<double>();
+template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>();
+template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>();
+template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
+template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
+template PD_DLL_DECL paddle::platform::float16 *
 Tensor::mutable_data<paddle::platform::float16>();
-template paddle::platform::bfloat16 *
+template PD_DLL_DECL paddle::platform::bfloat16 *
 Tensor::mutable_data<paddle::platform::bfloat16>();
-template paddle::platform::complex128 *
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>();
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>();
-template int16_t *Tensor::mutable_data<int16_t>();
-template bool *Tensor::mutable_data<bool>();
+template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
+template PD_DLL_DECL bool *Tensor::mutable_data<bool>();

-template float *Tensor::mutable_data<float>(const PlaceType &place);
-template double *Tensor::mutable_data<double>(const PlaceType &place);
-template int64_t *Tensor::mutable_data<int64_t>(const PlaceType &place);
-template int32_t *Tensor::mutable_data<int32_t>(const PlaceType &place);
-template uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType &place);
-template int8_t *Tensor::mutable_data<int8_t>(const PlaceType &place);
-template paddle::platform::float16 *
+template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
+template PD_DLL_DECL double *Tensor::mutable_data<double>(
+    const PlaceType &place);
+template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>(
+    const PlaceType &place);
+template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
+    const PlaceType &place);
+template PD_DLL_DECL paddle::platform::float16 *
 Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
-template paddle::platform::bfloat16 *
+template PD_DLL_DECL paddle::platform::bfloat16 *
 Tensor::mutable_data<paddle::platform::bfloat16>(const PlaceType &place);
-template paddle::platform::complex128 *
+template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
-template paddle::platform::complex64 *
+template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
-template int16_t *Tensor::mutable_data<int16_t>(const PlaceType &place);
-template bool *Tensor::mutable_data<bool>(const PlaceType &place);
+template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
+    const PlaceType &place);
+template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);

 std::vector<int> Tensor::shape() const {
  GET_CASTED_TENSOR
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -345,9 +345,12 @@ if (LINUX)
 endif()

 if (WIN32)
+  set(FLUID_FRAMEWORK_IMPORT_LIB
+    ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_framework.lib
+    CACHE INTERNAL "Fluid framework lib")
  set(FLUID_FRAMEWORK_SHARED_LIB
-      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dll
-      CACHE INTERNAL "Fluid framework lib")
+      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_framework.dll
+      CACHE INTERNAL "Fluid framework dll")
 endif()

 if(APPLE)
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@ -416,9 +416,6 @@ void* GetOpDsoHandle(const std::string& dso_name) {
 #if defined(__APPLE__) || defined(__OSX__)
  PADDLE_THROW(platform::errors::Unimplemented(
      "Create custom cpp op outside framework do not support Apple."));
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  PADDLE_THROW(platform::errors::Unimplemented(
-      "Create custom cpp op outside framework do not support Windows."));
 #else
  return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
 #endif
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@ -114,23 +114,24 @@ rem ------pre install python requirement----------
 where python
 where pip
 pip install wheel --user
-pip install -r %work_dir%\python\requirements.txt --user
-pip install -r %work_dir%\python\unittest_py\requirements.txt --user
+pip install --force-reinstall -r %work_dir%\python\requirements.txt --user
+pip install --force-reinstall -r %work_dir%\python\unittest_py\requirements.txt --user
 if %ERRORLEVEL% NEQ 0 (
    echo pip install requirements.txt failed!
    exit /b 7
 )

 rem ------pre install clcache and init config----------
-pip install clcache --user
+rem pip install clcache --user
+pip uninstall -y clcache
 :: set USE_CLCACHE to enable clcache
-set USE_CLCACHE=1
+rem set USE_CLCACHE=1
 :: In some scenarios, CLCACHE_HARDLINK can save one file copy.
-set CLCACHE_HARDLINK=1
+rem set CLCACHE_HARDLINK=1
 :: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
-set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
+rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
 :: set maximum cache size to 20G
-clcache.exe -M 21474836480
+rem clcache.exe -M 21474836480

 rem ------show summary of current environment----------
 cmake --version
@ -281,7 +282,7 @@ echo Build third_party successfully!
 set build_times=1
 :build_paddle
 :: reset clcache zero stats for collect PR's actual hit rate
-clcache.exe -z
+rem clcache.exe -z

 echo Build Paddle the %build_times% time:
 if "%WITH_CLCACHE%"=="OFF" (
@ -305,7 +306,7 @@ echo 0 > %cache_dir%\error_code.txt
 type %cache_dir%\error_code.txt

 :: ci will collect clcache hit rate
-goto :collect_clcache_hits
+rem goto :collect_clcache_hits

 goto:eof

@ -346,13 +347,14 @@ set /p PADDLE_WHL_FILE_WIN=< whl_file.txt
@ECHO ON
 pip uninstall -y paddlepaddle
 pip uninstall -y paddlepaddle-gpu
-pip install -U %PADDLE_WHL_FILE_WIN% --user
+pip install %PADDLE_WHL_FILE_WIN% --user
 if %ERRORLEVEL% NEQ 0 (
    call paddle_winci\Scripts\deactivate.bat 2>NUL
    echo pip install whl package failed!
    exit /b 1
 )

+
 set CUDA_VISIBLE_DEVICES=0
 python %work_dir%\paddle\scripts\installation_validate.py
 goto:eof
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@ -9,7 +9,14 @@ endforeach()
 add_subdirectory(unittests)
 add_subdirectory(book)

-if(NOT APPLE AND NOT WIN32)
+# TODO: support New Custom OP on Mac
+# NOTE: CMake variable names are case-sensitive. The platform flag is spelled
+# `LINUX` (as in the `if (LINUX)` check of paddle/fluid/framework/CMakeLists.txt);
+# an undefined `Linux` evaluates to false and would skip custom_op everywhere.
+if(LINUX)
  add_subdirectory(custom_op)
 endif()
+
+# Windows CPU machine doesn't have CUDA, can't compile .cu file
+# if(WIN32 AND WITH_GPU)
+#   add_subdirectory(custom_op)
+# endif()
+
 set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120)
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@ -1,3 +1,36 @@
+# New custom OP can support Windows/Linux now
+# 'test_simple_custom_op_jit/test_simple_custom_op_setup' compile .cc and .cu file
+py_test(test_simple_custom_op_setup SRCS test_simple_custom_op_setup.py)
+py_test(test_simple_custom_op_jit SRCS test_simple_custom_op_jit.py)
+
+# Compiling shared library will cost some time, but running process is very fast.
+set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250)
+set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180)
+
+py_test(test_sysconfig SRCS test_sysconfig.py)
+
+# 'test_dispatch' compile .cc file
+py_test(test_dispatch SRCS test_dispatch.py)
+set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180)
+
+# The tests registered below this point are Linux-only; stop processing this
+# file on other platforms. The flag is `LINUX` (CMake variables are
+# case-sensitive); an undefined `Linux` would make this return unconditionally.
+if(NOT LINUX)
+    return()
+endif()
+
+# TODO(zhouwei): support test_check_abi and abi check on Windows
+py_test(test_check_abi SRCS test_check_abi.py)
+
+# Old custom OP only support Linux, only run on Linux
+py_test(test_custom_op SRCS test_custom_op.py)
+py_test(test_jit_load SRCS test_jit_load.py)
+py_test(test_setup_install SRCS test_setup_install.py)
+py_test(test_setup_build SRCS test_setup_build.py)
+
+set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180)
+set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180)
+set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180)
+
+
 if(WITH_ROCM)
    hip_library(relu_op_shared SHARED SRCS relu_op.cc relu_op.cu DEPS paddle_framework_shared)
 elseif(WITH_GPU)
@ -18,19 +51,3 @@ get_target_property(TARGET_LIBRARIES relu_op_shared LINK_LIBRARIES)
 LIST(REMOVE_ITEM TARGET_LIBRARIES glog)
 LIST(REMOVE_ITEM TARGET_LIBRARIES gflags)
 set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES  ${TARGET_LIBRARIES} )
-
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
-
-# Compiling .so will cost some time, but running process is very fast.
-set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180)
-set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180)
-set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180)
-set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180)
-
-set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250)
-set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180)
--- a/python/paddle/fluid/tests/custom_op/test_dispatch.py
+++ b/python/paddle/fluid/tests/custom_op/test_dispatch.py
@ -16,8 +16,18 @@ import os
 import unittest
 import paddle
 import numpy as np
-from paddle.utils.cpp_extension import load
+from paddle.utils.cpp_extension import load, get_build_directory
 from utils import paddle_includes, extra_compile_args
+from paddle.utils.cpp_extension.extension_utils import run_cmd
+
+# Because the shared lib already exists in the cache dir,
+# it will not be compiled again unless the cache dir is cleared.
+if os.name == 'nt':
+    cmd = 'rmdir {} /s/q'.format(get_build_directory())
+else:
+    cmd = 'rm -rf {}'.format(get_build_directory())
+
+run_cmd(cmd, True)

 dispatch_op = load(
    name='dispatch_op',
--- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py
@ -13,13 +13,24 @@
 # limitations under the License.

 import os
+import subprocess
 import unittest
 import paddle
 import numpy as np
-from paddle.utils.cpp_extension import load
+from paddle.utils.cpp_extension import load, get_build_directory
+from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_compile_args
 from test_simple_custom_op_setup import relu2_dynamic, relu2_static

+# Because the shared lib already exists in the cache dir,
+# it will not be compiled again unless the cache dir is cleared.
+if os.name == 'nt':
+    cmd = 'rmdir {} /s/q'.format(get_build_directory())
+else:
+    cmd = 'rm -rf {}'.format(get_build_directory())
+
+run_cmd(cmd, True)
+
 # Compile and load custom op Just-In-Time.
 custom_module = load(
    name='simple_jit_relu2',
--- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
@ -91,7 +91,12 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
    def setUp(self):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        # compile, install the custom op egg into site-packages under background
-        cmd = 'cd {} && python setup_install_simple.py install'.format(cur_dir)
+        if os.name == 'nt':
+            cmd = 'cd /d {} && python setup_install_simple.py install'.format(
+                cur_dir)
+        else:
+            cmd = 'cd {} && python setup_install_simple.py install'.format(
+                cur_dir)
        run_cmd(cmd)

        # NOTE(Aurelius84): Normally, it's no need to add following codes for users.
@ -99,7 +104,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
        # sys.path has been updated. So we update it manually.

        # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3
-        site_dir = site.getsitepackages()[0]
+        if os.name == 'nt':
+            # NOTE(zhouwei25): getsitepackages on windows will return a list: [python install dir, site packages dir]
+            site_dir = site.getsitepackages()[1]
+        else:
+            site_dir = site.getsitepackages()[0]
        custom_egg_path = [
            x for x in os.listdir(site_dir) if 'simple_setup_relu2' in x
        ]
--- a/python/paddle/fluid/tests/custom_op/utils.py
+++ b/python/paddle/fluid/tests/custom_op/utils.py
@ -23,8 +23,8 @@ site_packages_path = get_python_lib()
 # paddle include directory. Because the following path is generated after insalling
 # PaddlePaddle whl. So here we specific `include_dirs` to avoid errors in CI.
 paddle_includes = [
-    os.path.join(site_packages_path, 'paddle/include'),
-    os.path.join(site_packages_path, 'paddle/include/third_party')
+    os.path.join(site_packages_path, 'paddle', 'include'),
+    os.path.join(site_packages_path, 'paddle', 'include', 'third_party')
 ]

 # TODO(Aurelius84): Memory layout is different if build paddle with PADDLE_WITH_MKLDNN=ON,
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
--- a/python/requirements.txt
+++ b/python/requirements.txt
@ -3,7 +3,8 @@ numpy>=1.13, <=1.16.4 ; python_version<"3.5"
 numpy>=1.13 ; python_version>="3.5" and platform_system != "Windows"
 numpy>=1.13, <=1.19.3 ; python_version>="3.5" and platform_system == "Windows"
 protobuf>=3.1.0
-gast>=0.3.3
+gast>=0.3.3 ; platform_system != "Windows"
+gast==0.3.3 ; platform_system == "Windows"
 Pillow
 six
 decorator
--- a/python/setup.py.in
+++ b/python/setup.py.in
@ -335,11 +335,16 @@ if '${WITH_XPU_BKCL}' == 'ON':
    shutil.copy('${XPU_BKCL_LIB}', libs_path)
    package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}']

-# copy libfuild_framework.so to libs
-if os.name != 'nt' and sys.platform != 'darwin':
-    paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}'
-    shutil.copy(paddle_framework_lib, libs_path)
-    package_data['paddle.libs'] += [('libpaddle_framework' if os.name != 'nt' else 'paddle_framework') + ext_name]
+# copy libpaddle_framework.so to libs on linux
+if sys.platform.startswith('linux'):
+    shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['libpaddle_framework.so']
+
+# copy paddle_framework.lib/paddle_framework.dll to libs on windows
+if os.name == 'nt':
+    shutil.copy('${FLUID_FRAMEWORK_IMPORT_LIB}', libs_path)
+    shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['paddle_framework.lib', 'paddle_framework.dll']

 # remove unused paddle/libs/__init__.py
 if os.path.isfile(libs_path+'/__init__.py'):
@ -410,9 +415,9 @@ if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON':
 class InstallCommand(InstallCommandBase):
    def finalize_options(self):
        ret = InstallCommandBase.finalize_options(self)
-        self.install_headers = os.path.join(self.install_purelib, 'paddle',
-                                            'include')
        self.install_lib = self.install_platlib
+        self.install_headers = os.path.join(self.install_platlib, 'paddle',
+                                            'include')
        return ret


@ -463,11 +468,6 @@ class InstallHeaders(Command):
        return self.copy_file(header, install_dir)

    def run(self):
-        # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows
-        if os.name == 'nt' or sys.platform == 'darwin':
-            if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON':
-                self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
-            return
        hdrs = self.distribution.headers
        if not hdrs:
            return