fix hang problem introduced by update to develop.

revert-14324-fix_vlog
dzhwinter 6 years ago
parent 804dd7da04
commit e41a3fcd68

@@ -43,13 +43,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU)
if (WIN32)
windows_symbolic(tensor_util SRCS tensor_util.cu)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
# // if (WIN32)
# // windows_symbolic(tensor_util SRCS tensor_util.cu)
# // nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
# // add_dependencies(tensor tensor_util)
# // else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
endif(WIN32)
# endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
endif()
@@ -93,15 +93,15 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function)
if(WITH_GPU)
if (WIN32)
# Windows treats a symbolic file as a real file, which is different from Unix.
# We create a hidden file and compile it instead of the original source file.
windows_symbolic(hidden_file SRCS data_type_transform.cu)
nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
add_dependencies(data_type_transform hidden_file)
else()
# if (WIN32)
# # Windows treats a symbolic file as a real file, which is different from Unix.
# # We create a hidden file and compile it instead of the original source file.
# windows_symbolic(hidden_file SRCS data_type_transform.cu)
# nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
# add_dependencies(data_type_transform hidden_file)
# else()
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
endif(WIN32)
# endif(WIN32)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)

@@ -1,15 +1,106 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
data_type_transform.cc
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace framework {
template <typename InType, typename OutType>
struct CastDataTypeFunctor {
HOSTDEVICE inline OutType operator()(InType in) const {
return static_cast<OutType>(in);
}
};
template <typename InType>
struct CastDataType {
CastDataType(const framework::Tensor& in, framework::Tensor* out,
const platform::DeviceContext* ctx)
: in_(in), out_(out), ctx_(ctx) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
template <typename OutType>
void apply() {
auto* in_begin = in_.data<InType>();
auto* in_end = in_begin + in_.numel();
auto* out_begin = out_->mutable_data<OutType>(in_.place());
if (platform::is_cpu_place(in_.place())) {
platform::Transform<platform::CPUDeviceContext> trans;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
context->Wait();
#endif
} else {
PADDLE_THROW("Unsupported place!");
}
}
};
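For readers following the dispatch: VisitDataType (used below in TransDataType) maps the runtime destination type onto a compile-time template argument and invokes the visitor's apply<OutType>(). A minimal, simplified sketch of that mechanism, not Paddle's actual implementation:

#include <stdexcept>

// Simplified stand-in for the proto::VarType tags used in this file.
enum class DType { FP32, FP64, INT32 };

// Minimal sketch of the visitor dispatch: translate a runtime type tag
// into a compile-time template argument for the visitor's apply<T>().
template <typename Visitor>
void VisitDType(DType t, Visitor visitor) {
  switch (t) {
    case DType::FP32: visitor.template apply<float>(); break;
    case DType::FP64: visitor.template apply<double>(); break;
    case DType::INT32: visitor.template apply<int>(); break;
    default: throw std::runtime_error("unsupported dtype");
  }
}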
void TransDataType(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const Tensor& in,
Tensor* out) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
out->Resize(in.dims());
auto src_type = kernel_type_for_var.data_type_;
auto dst_type = expected_kernel_type.data_type_;
auto ctx = pool.Get(in.place());
switch (src_type) {
case proto::VarType::FP16:
framework::VisitDataType(dst_type,
CastDataType<platform::float16>(in, out, ctx));
break;
case proto::VarType::FP32:
framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
break;
case proto::VarType::FP64:
framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
break;
case proto::VarType::INT32:
framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
break;
case proto::VarType::INT64:
framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
break;
case proto::VarType::BOOL:
framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
break;
case proto::VarType::INT16:
framework::VisitDataType(dst_type, CastDataType<int16_t>(in, out, ctx));
break;
case proto::VarType::UINT8:
framework::VisitDataType(dst_type, CastDataType<uint8_t>(in, out, ctx));
break;
default:
PADDLE_THROW("Unsupported type %d", src_type);
}
}
} // namespace framework
} // namespace paddle
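For context, a minimal usage sketch of the entry point above (the helper name and the OpKernelType constructor arguments are assumptions for illustration, not part of this commit):

#include "paddle/fluid/framework/data_type_transform.h"

// Hypothetical helper: cast an FP32 tensor to FP64 via TransDataType.
// The kernel-type objects exist solely to carry the source and
// destination data types; the switch above then selects
// CastDataType<float>.
void CastFp32ToFp64(const paddle::framework::Tensor& in,
                    paddle::framework::Tensor* out) {
  namespace fw = paddle::framework;
  fw::OpKernelType src(fw::proto::VarType::FP32, in.place());
  fw::OpKernelType dst(fw::proto::VarType::FP64, in.place());
  fw::TransDataType(src, dst, in, out);
}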

@@ -17,8 +17,11 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace ir {
constexpr char Node::kControlDepVarName[];
#if !defined(_WIN32)
constexpr char Node::kControlDepVarName[] = "__control_var";
#else
const char Node::kControlDepVarName[] = "__control_var";
#endif
int Node::count_ = 0;
} // namespace ir
} // namespace framework

@@ -27,7 +27,11 @@ namespace ir {
class Node {
public:
enum class Type { kOperation, kVariable };
#if !defined(_WIN32) // msvc not support constexpr correctly.
static constexpr char kControlDepVarName[] = "__control_var";
#else
static const char kControlDepVarName[];
#endif
explicit Node(const std::string& name, Type type)
: name_(name),
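The same workaround in isolation, as a standalone sketch (the struct name here is hypothetical): in C++11/14 an odr-used in-class constexpr array still needs an out-of-line definition, and pre-VS2017 MSVC mishandles the constexpr form, so the Windows branch falls back to a plain const member.

struct Tag {
#if !defined(_WIN32)
  static constexpr char kName[] = "__control_var";
#else
  static const char kName[];  // defined out of line below
#endif
};

#if !defined(_WIN32)
constexpr char Tag::kName[];  // out-of-line definition (pre-C++17 rule)
#else
const char Tag::kName[] = "__control_var";
#endif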

@@ -689,7 +689,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
VLOG(3) << "expected_kernel_key: " << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,97 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <chrono>
#include <iostream>
#include <fstream>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
std::string DIRNAME = "./LB_icnet_model";
//std::string DIRNAME = "./infer_models";
NativeConfig GetConfig() {
NativeConfig config;
config.prog_file=DIRNAME + "/__model__";
config.param_file=DIRNAME + "/__params__";
config.fraction_of_gpu_memory = 0.8;
config.use_gpu = true;
config.device = 0;
return config;
}
using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); };
double time_diff(Time t1, Time t2) {
typedef std::chrono::microseconds ms;
auto diff = t2 - t1;
ms counter = std::chrono::duration_cast<ms>(diff);
return counter.count() / 1000.0;
}
void test_naive(int batch_size){
NativeConfig config = GetConfig();
// config.model_dir = model_path;
auto predictor = CreatePaddlePredictor<NativeConfig>(config);
int height = 449;
int width = 581;
//int height = 3;
//int width = 3;
int num_sum = height * width * 3 * batch_size;
std::vector<float> data;
for(int i = 0; i < num_sum; i++) {
data.push_back(0.0);
}
PaddleTensor tensor;
tensor.shape = std::vector<int>({batch_size, 3, height, width});
tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
tensor.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
PaddleTensor tensor_out;
std::vector<PaddleTensor> outputs(1, tensor_out);
predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
std::cout << "start predict123:" << std::endl;
auto time1 = time();
for(size_t i = 0; i < 2; i++) {
predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
std::cout << "pass " << i;
}
auto time2 = time();
std::ofstream ofresult("naive_test_result.txt", std::ios::app);
std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl;
std::cout << outputs.size() << std::endl;
/*
int64_t * data_o = static_cast<int64_t*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
ofresult << std::to_string(data_o[j]) << " ";
}
ofresult << std::endl;
ofresult.close();
*/
}
} // namespace paddle
int main(int argc, char** argv) {
paddle::test_naive(1 << 0);
return 0;
}

@@ -43,6 +43,7 @@ template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
VLOG(3) << "inside cudnn";
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("Input");
@@ -59,7 +60,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
VLOG(3) << "get all inputs";
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
@@ -72,7 +73,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
VLOG(3) << "create tensor descriptor";
#if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it manually
// FIXME(typhoonzero): find a better way to disable groups
@@ -81,7 +82,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, groups));
groups = 1;
#endif
VLOG(3) << "before create tensor descriptor";
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()), groups);
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
@@ -111,7 +112,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
output_height = output->dims()[2];
output_width = output->dims()[3];
}
VLOG(3) << "after create tensor descriptor";
int group_offset_in =
input_channels / groups * input_height * input_width * input_depth;
int group_offset_out =
@@ -129,6 +130,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
VLOG(3) << "set cudnn algorithm";
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
@@ -149,7 +151,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
VLOG(3) << "before get workspace";
// get the workspace size that can be allocated
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -158,10 +160,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// the limit because the algo is overridden to use tensor core.
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
VLOG(3) << "after get workspace";
// Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
workspace_size_in_bytes = 1024;
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
VLOG(3) << "allocate memory";
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
for (int i = 0; i < groups; i++) {
@@ -171,8 +175,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
}
VLOG(3) << "cudnn forward";
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
VLOG(3) << "cudnn pass";
}
};
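One design note on the forward path above: the workspace is released with an explicit paddle::memory::Free after the convolution, so an exception thrown by CUDNN_ENFORCE in between would leak it. A minimal RAII sketch of the alternative, with malloc/free standing in for Paddle's allocator (illustration only, not the commit's approach):

#include <cstddef>
#include <cstdlib>

// Scratch-buffer guard: the workspace is freed when the guard goes out
// of scope, even if an enforce macro throws mid-computation.
class WorkspaceGuard {
 public:
  explicit WorkspaceGuard(size_t bytes) : ptr_(std::malloc(bytes)) {}
  ~WorkspaceGuard() { std::free(ptr_); }
  void* get() const { return ptr_; }
  WorkspaceGuard(const WorkspaceGuard&) = delete;
  WorkspaceGuard& operator=(const WorkspaceGuard&) = delete;

 private:
  void* ptr_;
};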
@@ -318,6 +324,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Already on GPU
void* cudnn_workspace = nullptr;
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
workspace_size_in_bytes = 1024;
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;

@@ -33,8 +33,8 @@ class LoadCombineOp : public framework::OperatorBase {
auto filename = Attr<std::string>("file_path");
auto load_as_fp16 = Attr<bool>("load_as_fp16");
std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin),
std::ifstream fin(filename, std::ios_base::in | std::ios_base::binary);
PADDLE_ENFORCE(!fin.bad(),
"Cannot open file %s for load_combine op", filename);
auto out_var_names = Outputs("Out");
@@ -46,20 +46,21 @@ class LoadCombineOp : public framework::OperatorBase {
auto &dev_ctx = *pool.Get(place);
for (size_t i = 0; i < out_var_names.size(); i++) {
VLOG(3) << "load " << out_var_names[i];
auto *out_var = scope.FindVar(out_var_names[i]);
PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
out_var_names[i]);
auto *tensor = out_var->GetMutable<framework::LoDTensor>();
VLOG(3) << "Get Tensor";
// Error checking
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
PADDLE_ENFORCE(!fin.bad(), "Cannot read more from file %s",
filename);
VLOG(3) << "before deserialization";
// Get data from fin to tensor
DeserializeFromStream(fin, tensor, dev_ctx);
DeserializeFromStream(fin, tensor, dev_ctx);
VLOG(3) << "after deserialization";
auto in_dtype = framework::ToDataType(tensor->type());
auto out_dtype =
load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
@@ -80,6 +81,7 @@ class LoadCombineOp : public framework::OperatorBase {
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
}
VLOG(3) << "load " << out_var_names[i] << " finished";
}
}
};
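The switch to an explicitly binary ifstream above matters on Windows: a text-mode stream rewrites CRLF sequences and treats a 0x1A byte as end-of-file, silently corrupting or truncating serialized tensors. A minimal standalone sketch of the pattern (the helper is hypothetical):

#include <fstream>
#include <iterator>
#include <string>
#include <vector>

// Read a serialized blob byte-for-byte. Without std::ios_base::binary,
// Windows text-mode translation would alter "\r\n" bytes and stop at
// 0x1A (Ctrl-Z) before the real end of the payload.
std::vector<char> ReadBlob(const std::string& path) {
  std::ifstream fin(path, std::ios_base::in | std::ios_base::binary);
  return std::vector<char>(std::istreambuf_iterator<char>(fin),
                           std::istreambuf_iterator<char>());
}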

@@ -59,6 +59,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
#define CUDNN_VERSION_MIN(major, minor, patch) \
(CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
#if !defined(_WIN32)
#define CUDNN_ENFORCE(condition) \
do { \
cudnnStatus_t status = condition; \
@@ -66,6 +67,9 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
} \
} while (false)
#else
#define CUDNN_ENFORCE(condition)
#endif
enum class DataLayout { // Not used
kNHWC,

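A side note on the macro shape above: the do { ... } while (false) wrapper makes the multi-statement macro behave as a single statement, so it composes safely with if/else; note also that the Windows branch expanding to nothing means cuDNN errors are silently ignored there. A standalone sketch of the idiom (all names hypothetical):

#include <cstdio>

static int Init() { return 0; }
static void Cleanup() {}
static void HandleError(int status) { std::printf("error %d\n", status); }

// The wrapper turns the macro body into one statement, so the if/else
// below parses as intended even without braces around the macro call.
#define CHECK_STATUS(expr)                  \
  do {                                      \
    int status_ = (expr);                   \
    if (status_ != 0) HandleError(status_); \
  } while (false)

int main() {
  bool ready = true;
  if (ready)
    CHECK_STATUS(Init());
  else
    Cleanup();
  return 0;
}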