Merge remote-tracking branch 'dzhwinter/windows/support' into windows/support

Branch: revert-14324-fix_vlog
Author: wanghaoshuang
Commit: 5993155d67

@@ -43,13 +43,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
-  if (WIN32)
-    windows_symbolic(tensor_util SRCS tensor_util.cu)
-    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
-    add_dependencies(tensor tensor_util)
-  else()
+  # // if (WIN32)
+  # // windows_symbolic(tensor_util SRCS tensor_util.cu)
+  # // nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
+  # // add_dependencies(tensor tensor_util)
+  # // else()
     nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
-  endif(WIN32)
+  # endif(WIN32)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
@@ -93,15 +93,15 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry device_context math_function)
 if(WITH_GPU)
-  if (WIN32)
-    # windows treat symbolic file as a real file, which is different with unix
-    # We create a hidden file and compile it instead of origin source file.
-    windows_symbolic(hidden_file SRCS data_type_transform.cu)
-    nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
-    add_dependencies(data_type_transform hidden_file)
-  else()
+  # if (WIN32)
+  # # windows treat symbolic file as a real file, which is different with unix
+  # # We create a hidden file and compile it instead of origin source file.
+  # windows_symbolic(hidden_file SRCS data_type_transform.cu)
+  # nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
+  # add_dependencies(data_type_transform hidden_file)
+  # else()
     nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
-  endif(WIN32)
+  # endif(WIN32)
   nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
   cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)

@@ -1,15 +1,106 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-data_type_transform.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_type_transform.h"
+
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename InType, typename OutType>
+struct CastDataTypeFunctor {
+  HOSTDEVICE inline OutType operator()(InType in) const {
+    return static_cast<OutType>(in);
+  }
+};
+
+template <typename InType>
+struct CastDataType {
+  CastDataType(const framework::Tensor& in, framework::Tensor* out,
+               const platform::DeviceContext* ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+
+  template <typename OutType>
+  void apply() {
+    auto* in_begin = in_.data<InType>();
+    auto* in_end = in_begin + in_.numel();
+    auto* out_begin = out_->mutable_data<OutType>(in_.place());
+
+    if (platform::is_cpu_place(in_.place())) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+#ifdef __NVCC__
+    } else if (platform::is_gpu_place(in_.place())) {
+      platform::Transform<platform::CUDADeviceContext> trans;
+      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+      context->Wait();
+#endif
+    } else {
+      PADDLE_THROW("Unsupported place!");
+    }
+  }
+};
+
+void TransDataType(const OpKernelType& kernel_type_for_var,
+                   const OpKernelType& expected_kernel_type, const Tensor& in,
+                   Tensor* out) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+
+  out->Resize(in.dims());
+  auto src_type = kernel_type_for_var.data_type_;
+  auto dst_type = expected_kernel_type.data_type_;
+  auto ctx = pool.Get(in.place());
+
+  switch (src_type) {
+    case proto::VarType::FP16:
+      framework::VisitDataType(dst_type,
+                               CastDataType<platform::float16>(in, out, ctx));
+      break;
+    case proto::VarType::FP32:
+      framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
+      break;
+    case proto::VarType::FP64:
+      framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
+      break;
+    case proto::VarType::INT32:
+      framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
+      break;
+    case proto::VarType::INT64:
+      framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
+      break;
+    case proto::VarType::BOOL:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
+    case proto::VarType::INT16:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
+    case proto::VarType::UINT8:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
+    default:
+      PADDLE_THROW("Not support type %d", src_type);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
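For readers who have not seen the VisitDataType idiom used above, the switch picks the source type at runtime and the visitor template instantiates the destination type. Below is a minimal, self-contained sketch of the same double-dispatch idea with hypothetical names (TypeTag, CastVisitor, VisitType); it is an illustration of the pattern, not Paddle's API.

// Standalone sketch of runtime-tag double dispatch (hypothetical names).
#include <cstdint>
#include <iostream>
#include <vector>

enum class TypeTag { kFloat, kInt32, kInt64 };

// Visitor that casts a typed input buffer to the requested output type.
template <typename InType>
struct CastVisitor {
  const std::vector<InType>& in;

  template <typename OutType>
  std::vector<OutType> apply() const {
    std::vector<OutType> out(in.size());
    for (size_t i = 0; i < in.size(); ++i) {
      out[i] = static_cast<OutType>(in[i]);
    }
    return out;
  }
};

// Dispatch the destination tag to a concrete template instantiation.
template <typename Visitor>
void VisitType(TypeTag dst, const Visitor& v) {
  switch (dst) {
    case TypeTag::kFloat:
      for (float x : v.template apply<float>()) std::cout << x << ' ';
      break;
    case TypeTag::kInt32:
      for (int32_t x : v.template apply<int32_t>()) std::cout << x << ' ';
      break;
    case TypeTag::kInt64:
      for (int64_t x : v.template apply<int64_t>()) std::cout << x << ' ';
      break;
  }
  std::cout << '\n';
}

int main() {
  std::vector<float> src = {1.5f, 2.5f, 3.5f};
  CastVisitor<float> visitor{src};
  VisitType(TypeTag::kInt32, visitor);  // prints "1 2 3": float -> int32 cast
  return 0;
}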

@@ -17,8 +17,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
-constexpr char Node::kControlDepVarName[];
+#if !defined(_WIN32)
+constexpr char Node::kControlDepVarName[] = "__control_var";
+#else
+const char Node::kControlDepVarName[] = "__control_var";
+#endif
 int Node::count_ = 0;
 }  // namespace ir
 }  // namespace framework

@@ -27,7 +27,11 @@ namespace ir {
 class Node {
  public:
   enum class Type { kOperation, kVariable };
+#if !defined(_WIN32)  // msvc not support constexpr correctly.
   static constexpr char kControlDepVarName[] = "__control_var";
+#else
+  static const char kControlDepVarName[];
+#endif
   explicit Node(const std::string& name, Type type)
       : name_(name),
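The #if !defined(_WIN32) split above, together with the node.cc change before it, works around the MSVC behavior the inline comment mentions: the targeted toolchain did not handle constexpr static array members reliably. A minimal, self-contained sketch of the same declaration/definition pattern, using a hypothetical Marker class rather than Paddle's Node:

// Sketch of the constexpr/const fallback pattern; Marker is hypothetical.
#include <cstdio>

class Marker {
 public:
#if !defined(_WIN32)
  // In C++11/14 the out-of-class definition below is still required when the
  // member is odr-used (e.g. when its address is taken).
  static constexpr char kName[] = "__control_var";
#else
  // Fallback: declare here, define with the initializer in exactly one
  // translation unit.
  static const char kName[];
#endif
};

#if !defined(_WIN32)
constexpr char Marker::kName[];
#else
const char Marker::kName[] = "__control_var";
#endif

int main() {
  std::printf("%s\n", Marker::kName);  // prints __control_var
  return 0;
}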

@@ -689,7 +689,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   auto expected_kernel_key =
       this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+  VLOG(3) << "expected_kernel_key: " << expected_kernel_key;
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -1,97 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <chrono>
-#include <iostream>
-#include <fstream>
-
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
-namespace paddle {
-
-std::string DIRNAME = "./LB_icnet_model";
-// std::string DIRNAME = "./infer_models";
-
-NativeConfig GetConfig() {
-  NativeConfig config;
-  config.prog_file = DIRNAME + "/__model__";
-  config.param_file = DIRNAME + "/__params__";
-  config.fraction_of_gpu_memory = 0.8;
-  config.use_gpu = true;
-  config.device = 0;
-  return config;
-}
-
-using Time = decltype(std::chrono::high_resolution_clock::now());
-Time time() { return std::chrono::high_resolution_clock::now(); };
-
-double time_diff(Time t1, Time t2) {
-  typedef std::chrono::microseconds ms;
-  auto diff = t2 - t1;
-  ms counter = std::chrono::duration_cast<ms>(diff);
-  return counter.count() / 1000.0;
-}
-
-void test_naive(int batch_size) {
-  NativeConfig config = GetConfig();
-  // config.model_dir = model_path;
-  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-  int height = 449;
-  int width = 581;
-  // int height = 3;
-  // int width = 3;
-  int num_sum = height * width * 3 * batch_size;
-
-  std::vector<float> data;
-  for (int i = 0; i < num_sum; i++) {
-    data.push_back(0.0);
-  }
-
-  PaddleTensor tensor;
-  tensor.shape = std::vector<int>({batch_size, 3, height, width});
-  tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
-  std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
-  tensor.dtype = PaddleDType::FLOAT32;
-
-  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
-  PaddleTensor tensor_out;
-  std::vector<PaddleTensor> outputs(1, tensor_out);
-
-  predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
-  std::cout << "start predict123:" << std::endl;
-  auto time1 = time();
-
-  for (size_t i = 0; i < 2; i++) {
-    predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
-    std::cout << "pass " << i;
-  }
-
-  auto time2 = time();
-  std::ofstream ofresult("naive_test_result.txt", std::ios::app);
-  std::cout << "batch: " << batch_size
-            << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms"
-            << std::endl;
-  std::cout << outputs.size() << std::endl;
-  /*
-  int64_t* data_o = static_cast<int64_t*>(outputs[0].data.data());
-  for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
-    ofresult << std::to_string(data_o[j]) << " ";
-  }
-  ofresult << std::endl;
-  ofresult.close();
-  */
-}
-}  // namespace paddle
-
-int main(int argc, char** argv) {
-  paddle::test_naive(1 << 0);
-  return 0;
-}
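The removed benchmark above measures the latency of repeated predictor->Run() calls with std::chrono. As a small, self-contained sketch of just that timing pattern (the sleep stands in for the inference call; nothing here depends on the Paddle inference API):

// Standalone sketch of the chrono-based timing used in the removed benchmark.
#include <chrono>
#include <iostream>
#include <thread>

using Clock = std::chrono::high_resolution_clock;

// Elapsed time between two time points, in milliseconds.
double ElapsedMs(Clock::time_point t1, Clock::time_point t2) {
  return std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1)
             .count() /
         1000.0;
}

int main() {
  auto t1 = Clock::now();
  // Stand-in for predictor->Run(...): sleep briefly so there is work to time.
  std::this_thread::sleep_for(std::chrono::milliseconds(50));
  auto t2 = Clock::now();
  std::cout << "elapsed: " << ElapsedMs(t1, t2) << " ms" << std::endl;
  return 0;
}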

@@ -43,6 +43,7 @@ template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    VLOG(3) << "inside cudnn";
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
@@ -59,7 +60,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
-
+    VLOG(3) << "get all inputs";
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_desc;
@@ -72,7 +73,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
-
+    VLOG(3) << "create tensor descriptor";
 #if CUDNN_VERSION_MIN(7, 0, 1)
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
@@ -81,7 +82,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
-
+    VLOG(3) << "before create tensor descriptor";
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
@@ -111,7 +112,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       output_height = output->dims()[2];
       output_width = output->dims()[3];
     }
-
+    VLOG(3) << "after create tensor descriptor";
     int group_offset_in =
         input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
@@ -129,6 +130,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
+    VLOG(3) << "set cudnn algorithm";
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
@@ -149,7 +151,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
-
+    VLOG(3) << "before get workspace";
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -158,10 +160,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // the limit because the algo is overrided to use tensor core.
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
-
+    VLOG(3) << "after get workspace";
     // Allocate on GPU memory
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    workspace_size_in_bytes = 1024;
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+    VLOG(3) << "allocate memory";
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
@@ -171,8 +175,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
           &beta, cudnn_output_desc, output_data + i * group_offset_out));
     }
+    VLOG(3) << "cudnn forward";
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
+    VLOG(3) << "cudnn pass";
   }
 };
@@ -318,6 +324,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     // Already on GPU
     void* cudnn_workspace = nullptr;
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    workspace_size_in_bytes = 1024;
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;

@@ -33,8 +33,8 @@ class LoadCombineOp : public framework::OperatorBase {
     auto filename = Attr<std::string>("file_path");
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin),
+    std::ifstream fin(filename, std::ios_base::in | std::ios_base::binary);
+    PADDLE_ENFORCE(!fin.bad(),
                    "Cannot open file %s for load_combine op", filename);
     auto out_var_names = Outputs("Out");
@@ -46,20 +46,21 @@ class LoadCombineOp : public framework::OperatorBase {
     auto &dev_ctx = *pool.Get(place);
     for (size_t i = 0; i < out_var_names.size(); i++) {
+      VLOG(3) << "load " << out_var_names[i];
       auto *out_var = scope.FindVar(out_var_names[i]);
       PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
                      out_var_names[i]);
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+      VLOG(3) << "Get Tensor";
       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+      PADDLE_ENFORCE(!fin.bad(), "Cannot read more from file %s",
                      filename);
+      VLOG(3) << "before deserialization";
       // Get data from fin to tensor
       DeserializeFromStream(fin, tensor, dev_ctx);
+      VLOG(3) << "after deserialization";
       auto in_dtype = framework::ToDataType(tensor->type());
       auto out_dtype =
           load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
@@ -80,6 +81,7 @@ class LoadCombineOp : public framework::OperatorBase {
         tensor->set_lod(fp16_tensor.lod());
         tensor->ShareDataWith(fp16_tensor);
       }
+      VLOG(3) << "load " << out_var_names[i] << " finished";
     }
   }
 };
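A note on the stream checks changed above: opening an std::ifstream on a path that does not exist sets failbit rather than badbit, so fin.bad() stays false on a failed open while the stream's operator bool reports the failure. A small standalone demonstration of the difference (not Paddle code):

// Standalone demonstration of std::ifstream state flags.
#include <fstream>
#include <iostream>

int main() {
  std::ifstream fin("definitely_missing_file.bin",
                    std::ios_base::in | std::ios_base::binary);
  // A failed open sets failbit, not badbit, so the two checks differ:
  std::cout << std::boolalpha;
  std::cout << "operator bool: " << static_cast<bool>(fin) << '\n';  // false
  std::cout << "!fin.bad():    " << !fin.bad() << '\n';              // true
  std::cout << "fin.fail():    " << fin.fail() << '\n';              // true
  return 0;
}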

@@ -59,6 +59,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 #define CUDNN_VERSION_MIN(major, minor, patch) \
   (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
+#if !defined(_WIN32)
 #define CUDNN_ENFORCE(condition)                        \
   do {                                                  \
     cudnnStatus_t status = condition;                   \
@@ -66,6 +67,9 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
       PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
     }                                                                \
   } while (false)
+#else
+#define CUDNN_ENFORCE(condition)
+#endif
 enum class DataLayout {  // Not use
   kNHWC,
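The change above keeps CUDNN_ENFORCE's do { ... } while (false) status check on non-Windows builds and expands it to nothing under _WIN32. Here is a minimal, self-contained sketch of that macro shape with a stand-in Status enum; the names and error strings are hypothetical, not the cuDNN API:

// Sketch of a status-check macro that is compiled out on Windows.
#include <cstdio>
#include <stdexcept>

enum class Status { kSuccess, kFailure };

inline const char* StatusString(Status s) {
  return s == Status::kSuccess ? "success" : "failure";
}

#if !defined(_WIN32)
#define DEMO_ENFORCE(condition)                         \
  do {                                                  \
    Status status__ = (condition);                      \
    if (status__ != Status::kSuccess) {                 \
      throw std::runtime_error(StatusString(status__)); \
    }                                                   \
  } while (false)
#else
#define DEMO_ENFORCE(condition)
#endif

Status FakeCall(bool ok) { return ok ? Status::kSuccess : Status::kFailure; }

int main() {
  DEMO_ENFORCE(FakeCall(true));  // passes silently
  try {
    DEMO_ENFORCE(FakeCall(false));  // throws on non-Windows builds
  } catch (const std::runtime_error& e) {
    std::printf("caught: %s\n", e.what());
  }
  return 0;
}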
