commit
5993155d67
@ -1,15 +1,106 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
// you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
// You may obtain a copy of the License at
|
You may obtain a copy of the License at
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
Unless required by applicable law or agreed to in writing, software
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
// See the License for the specific language governing permissions and
|
See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
limitations under the License. */
|
||||||
|
|
||||||
data_type_transform.cc
|
#include "paddle/fluid/framework/data_type_transform.h"
|
||||||
|
|
||||||
|
#include "paddle/fluid/framework/selected_rows.h"
|
||||||
|
#include "paddle/fluid/platform/transform.h"
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
namespace framework {
|
||||||
|
|
||||||
|
// Element-wise cast functor: converts one value from InType to OutType via
// static_cast. HOSTDEVICE marks it callable from both host and CUDA device
// code, so the same functor serves the CPU and GPU Transform paths below.
template <typename InType, typename OutType>
struct CastDataTypeFunctor {
  HOSTDEVICE inline OutType operator()(InType in) const {
    return static_cast<OutType>(in);
  }
};
|
||||||
|
|
||||||
|
// Visitor used with framework::VisitDataType: casts every element of a tensor
// whose source element type is InType to the OutType selected at visit time.
// Dispatches to a CPU or CUDA platform::Transform depending on where the
// input tensor's memory lives.
template <typename InType>
struct CastDataType {
  CastDataType(const framework::Tensor& in, framework::Tensor* out,
               const platform::DeviceContext* ctx)
      : in_(in), out_(out), ctx_(ctx) {}
  const framework::Tensor in_;   // source tensor (held by value)
  framework::Tensor* out_;       // destination tensor; buffer allocated in apply()
  const platform::DeviceContext* ctx_;  // context matching in_'s place

  // Cast all in_.numel() elements to OutType and write them into out_.
  template <typename OutType>
  void apply() {
    auto* in_begin = in_.data<InType>();
    auto* in_end = in_begin + in_.numel();
    // Allocates out_'s storage as OutType on the same place as the input.
    auto* out_begin = out_->mutable_data<OutType>(in_.place());

    if (platform::is_cpu_place(in_.place())) {
      platform::Transform<platform::CPUDeviceContext> trans;
      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
      trans(*context, in_begin, in_end, out_begin,
            CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
    } else if (platform::is_gpu_place(in_.place())) {
      platform::Transform<platform::CUDADeviceContext> trans;
      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
      trans(*context, in_begin, in_end, out_begin,
            CastDataTypeFunctor<InType, OutType>());
      // Block until the device-side cast finishes before returning.
      context->Wait();
#endif
    } else {
      // Reached for GPU tensors when built without NVCC, or unknown places.
      PADDLE_THROW("Unsupported place!");
    }
  }
};
|
||||||
|
|
||||||
|
void TransDataType(const OpKernelType& kernel_type_for_var,
|
||||||
|
const OpKernelType& expected_kernel_type, const Tensor& in,
|
||||||
|
Tensor* out) {
|
||||||
|
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
|
||||||
|
|
||||||
|
out->Resize(in.dims());
|
||||||
|
auto src_type = kernel_type_for_var.data_type_;
|
||||||
|
auto dst_type = expected_kernel_type.data_type_;
|
||||||
|
auto ctx = pool.Get(in.place());
|
||||||
|
|
||||||
|
switch (src_type) {
|
||||||
|
case proto::VarType::FP16:
|
||||||
|
framework::VisitDataType(dst_type,
|
||||||
|
CastDataType<platform::float16>(in, out, ctx));
|
||||||
|
break;
|
||||||
|
case proto::VarType::FP32:
|
||||||
|
framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
|
||||||
|
break;
|
||||||
|
case proto::VarType::FP64:
|
||||||
|
framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
|
||||||
|
break;
|
||||||
|
case proto::VarType::INT32:
|
||||||
|
framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
|
||||||
|
break;
|
||||||
|
case proto::VarType::INT64:
|
||||||
|
framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
|
||||||
|
break;
|
||||||
|
case proto::VarType::BOOL:
|
||||||
|
framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
|
||||||
|
break;
|
||||||
|
case proto::VarType::INT16:
|
||||||
|
framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
|
||||||
|
break;
|
||||||
|
case proto::VarType::UINT8:
|
||||||
|
framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
PADDLE_THROW("Not support type %d", src_type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace framework
|
||||||
|
} // namespace paddle
|
||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,97 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include <chrono>
|
|
||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
#include "paddle/fluid/inference/api/paddle_inference_api.h"
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
|
|
||||||
// Directory containing the combined inference model (__model__/__params__).
// NOTE(review): mutable global; presumably meant to be overridden per run —
// the alternative path below is kept for reference.
std::string DIRNAME = "./LB_icnet_model";
//std::string DIRNAME = "./infer_models";
|
|
||||||
// Builds the predictor configuration: GPU device 0, 80% of GPU memory,
// model/param files taken from the combined-file layout under DIRNAME.
NativeConfig GetConfig() {
  NativeConfig config;
  config.use_gpu = true;
  config.device = 0;
  config.fraction_of_gpu_memory = 0.8;
  config.prog_file = DIRNAME + "/__model__";
  config.param_file = DIRNAME + "/__params__";
  return config;
}
|
|
||||||
|
|
||||||
// Timestamp type produced by the high-resolution clock.
using Time = decltype(std::chrono::high_resolution_clock::now());

// Capture the current high-resolution timestamp.
Time time() { return std::chrono::high_resolution_clock::now(); }

// Elapsed wall time from t1 to t2, in milliseconds (microsecond resolution).
double time_diff(Time t1, Time t2) {
  using ms = std::chrono::microseconds;
  const auto elapsed_us = std::chrono::duration_cast<ms>(t2 - t1).count();
  return elapsed_us / 1000.0;
}
|
|
||||||
|
|
||||||
void test_naive(int batch_size){
|
|
||||||
NativeConfig config = GetConfig();
|
|
||||||
// config.model_dir = model_path;
|
|
||||||
auto predictor = CreatePaddlePredictor<NativeConfig>(config);
|
|
||||||
int height = 449;
|
|
||||||
int width = 581;
|
|
||||||
//int height = 3;
|
|
||||||
//int width = 3;
|
|
||||||
int num_sum = height * width * 3 * batch_size;
|
|
||||||
|
|
||||||
std::vector<float> data;
|
|
||||||
|
|
||||||
for(int i = 0; i < num_sum; i++) {
|
|
||||||
data.push_back(0.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
PaddleTensor tensor;
|
|
||||||
tensor.shape = std::vector<int>({batch_size, 3, height, width});
|
|
||||||
tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
|
|
||||||
std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
|
|
||||||
tensor.dtype = PaddleDType::FLOAT32;
|
|
||||||
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
|
|
||||||
PaddleTensor tensor_out;
|
|
||||||
|
|
||||||
std::vector<PaddleTensor> outputs(1, tensor_out);
|
|
||||||
|
|
||||||
predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
|
|
||||||
std::cout << "start predict123:" << std::endl;
|
|
||||||
auto time1 = time();
|
|
||||||
|
|
||||||
for(size_t i = 0; i < 2; i++) {
|
|
||||||
predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
|
|
||||||
std::cout << "pass " << i;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto time2 = time();
|
|
||||||
std::ofstream ofresult("naive_test_result.txt", std::ios::app);
|
|
||||||
|
|
||||||
std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl;
|
|
||||||
std::cout << outputs.size() << std::endl;
|
|
||||||
/*
|
|
||||||
int64_t * data_o = static_cast<int64_t*>(outputs[0].data.data());
|
|
||||||
for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
|
|
||||||
ofresult << std::to_string(data_o[j]) << " ";
|
|
||||||
}
|
|
||||||
ofresult << std::endl;
|
|
||||||
ofresult.close();
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
} // namespace paddle
|
|
||||||
|
|
||||||
// Entry point: run the naive inference benchmark with batch size 1.
int main(int argc, char** argv) {
  const int batch_size = 1 << 0;  // == 1
  paddle::test_naive(batch_size);
  return 0;
}
|
|
Loading…
Reference in new issue