Merge pull request #14284 from PaddlePaddle/revert-14043-conv_cudnn_cache

Revert " Exhaustive search for cuDNN conv."
7 years ago · 0953cd3e16
parent ce7d9b0799 db8c52da5e
commit 0953cd3e16
14 changed files with 74 additions and 381 deletions
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <algorithm>
 #include <array>
 #include <string>
 #include <vector>
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@ -13,8 +13,6 @@
 // limitations under the License.
 #pragma once
 #include <algorithm>
 #include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@ -16,14 +16,13 @@
 #include <glog/logging.h>
 #include <sys/time.h>
 #include <algorithm>
 #include <chrono>  // NOLINT
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle_inference_api.h"
 namespace paddle {
 namespace inference {
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@ -59,8 +59,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 bool IsPersistable(const framework::VarDesc* var) {
  if (var->Persistable() &&
      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST &&
+      var->GetType() != framework::proto::VarType::FETCH_LIST) {
      var->GetType() != framework::proto::VarType::RAW) {
    return true;
  }
  return false;
--- a/paddle/fluid/operators/add_position_encoding_op.h
+++ b/paddle/fluid/operators/add_position_encoding_op.h
@ -66,10 +66,9 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
          x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
      for (int j = 0; j < max_length; ++j) {
        for (int k = 0; k < half_size; ++k) {
-          const double val =
+          const double val = (half_size > 1)
-              (half_size > 1)
+                                 ? j / pow(10000.0, double(k) / (half_size - 1))
-                  ? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
+                                 : j / 10000.0;
                  : j / 10000.0;
          dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
          dst_ptr[half_size + k] =
              src_ptr[half_size + k] * alpha + cos(val) * beta;
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
@ -1,90 +0,0 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <cstdint>
 #include <functional>
 #include <mutex>
 #include <unordered_map>
 #include <vector>
 namespace paddle {
 namespace operators {
 // Mutex-guarded memo table that maps a convolution configuration to the
 // algorithm previously produced for it.
 //
 // The lookup key is a 64-bit hash folded over every argument of
 // GetAlgorithm(); only the hash is stored, so two configurations that hash to
 // the same value share a single cache entry.
 template <typename TAlgorithm>
 class AlgorithmsCache {
  public:
   // Returns the algorithm cached for the given combination of tensor
   // dimensions, convolution parameters and flags. On a miss, `gen_func` is
   // invoked to produce the algorithm and its result is stored for later calls.
   //
   // `gen_func` runs while the internal mutex is held, so concurrent callers
   // wait for it to finish. If the combined hash happens to be 0, `gen_func` is
   // invoked on every call and nothing is cached.
   TAlgorithm GetAlgorithm(
       const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
       const std::vector<int>& strides, const std::vector<int>& paddings,
       const std::vector<int>& dilations,
       int algorithmFlags,  // can set for different data type
       std::function<TAlgorithm()> gen_func);
  private:
   // One boost-style hash_combine step, see
   // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
   // `salt` keeps equal values in different argument groups from mixing
   // identically. All arithmetic is unsigned so the shifts and the wrap-around
   // additions are well defined.
   static uint64_t HashCombine(uint64_t seed, int64_t value, uint64_t salt) {
     return seed ^ (static_cast<uint64_t>(std::hash<int64_t>()(value)) +
                    0x9e3779b9ULL + (seed << 6) + (seed >> 2) + salt);
   }
   std::unordered_map<uint64_t, TAlgorithm> hash_;
   std::mutex mutex_;
 };
 template <typename TAlgorithm>
 TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
     const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
     const std::vector<int>& strides, const std::vector<int>& paddings,
     const std::vector<int>& dilations, int algorithmFlags,
     std::function<TAlgorithm()> gen_func) {
   std::lock_guard<std::mutex> lock(mutex_);
   // Fold every input into a single key; each argument group gets its own salt.
   uint64_t seed = 0;
   for (const auto num : dims1) {
     seed = HashCombine(seed, num, 0);
   }
   for (const auto num : dims2) {
     seed = HashCombine(seed, num, 1);
   }
   for (const auto num : strides) {
     seed = HashCombine(seed, static_cast<int64_t>(num), 2);
   }
   for (const auto num : paddings) {
     seed = HashCombine(seed, static_cast<int64_t>(num), 3);
   }
   for (const auto num : dilations) {
     seed = HashCombine(seed, static_cast<int64_t>(num), 4);
   }
   seed = HashCombine(seed, static_cast<int64_t>(algorithmFlags), 5);
   // A zero key is never cached; always regenerate for it.
   if (seed == 0) return gen_func();
   // Single lookup; generate and insert only on a miss. gen_func() is
   // evaluated before emplace, so nothing is inserted if it throws.
   auto it = hash_.find(seed);
   if (it == hash_.end()) {
     it = hash_.emplace(seed, gen_func()).first;
   }
   return it->second;
 }
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@ -189,11 +189,6 @@ void Conv2DOpMaker::Make() {
               "workspace size can increase performance but also requires "
               "better hardware. This size should be chosen carefully.")
      .SetDefault(4096);
  AddAttr<bool>("exhaustive_search",
                "(bool, default false) cuDNN has many algorithm to calculation "
                "convolution, whether enable exhaustive search ",
                "for cuDNN convolution or not, defalut is False.")
      .SetDefault(false);
  AddComment(R"DOC(
 Convolution Operator.
@ -288,11 +283,7 @@ void Conv3DOpMaker::Make() {
               "workspace size can increase performance but also requires "
               "better hardware. This size should be chosen carefully.")
      .SetDefault(4096);
-  AddAttr<bool>("exhaustive_search",
+
                "(bool, default false) cuDNN has many algorithm to calculation "
                "convolution, whether enable exhaustive search ",
                "for cuDNN convolution or not, defalut is False.")
      .SetDefault(false);
  AddComment(R"DOC(
 Convolution3D Operator.
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@ -204,10 +204,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
                          << "." << (driver_version_ % 100) / 10
                          << ", Runtime Version: " << runtime_version_ / 1000
                          << "." << (runtime_version_ % 100) / 10;
-  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
+
  LOG(INFO) << "device: " << place_.device
            << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
            << (cudnn_dso_ver % 100) / 10 << ".";
  callback_manager_.reset(new StreamCallbackManager(stream_));
 }
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@ -65,54 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 * include all needed cudnn functions in HPPL
 * different cudnn version has different interfaces
 **/
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
+#define CUDNN_DNN_ROUTINE_EACH(__macro)              \
-  __macro(cudnnSetTensor4dDescriptor);                    \
+  __macro(cudnnSetTensor4dDescriptor);               \
-  __macro(cudnnSetTensor4dDescriptorEx);                  \
+  __macro(cudnnSetTensor4dDescriptorEx);             \
-  __macro(cudnnSetTensorNdDescriptor);                    \
+  __macro(cudnnSetTensorNdDescriptor);               \
-  __macro(cudnnGetTensorNdDescriptor);                    \
+  __macro(cudnnGetTensorNdDescriptor);               \
-  __macro(cudnnGetConvolutionNdForwardOutputDim);         \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);    \
-  __macro(cudnnGetConvolutionForwardAlgorithm);           \
+  __macro(cudnnGetConvolutionForwardAlgorithm);      \
-  __macro(cudnnCreateTensorDescriptor);                   \
+  __macro(cudnnCreateTensorDescriptor);              \
-  __macro(cudnnDestroyTensorDescriptor);                  \
+  __macro(cudnnDestroyTensorDescriptor);             \
-  __macro(cudnnCreateFilterDescriptor);                   \
+  __macro(cudnnCreateFilterDescriptor);              \
-  __macro(cudnnSetFilter4dDescriptor);                    \
+  __macro(cudnnSetFilter4dDescriptor);               \
-  __macro(cudnnSetFilterNdDescriptor);                    \
+  __macro(cudnnSetFilterNdDescriptor);               \
-  __macro(cudnnGetFilterNdDescriptor);                    \
+  __macro(cudnnGetFilterNdDescriptor);               \
-  __macro(cudnnSetPooling2dDescriptor);                   \
+  __macro(cudnnSetPooling2dDescriptor);              \
-  __macro(cudnnSetPoolingNdDescriptor);                   \
+  __macro(cudnnSetPoolingNdDescriptor);              \
-  __macro(cudnnGetPoolingNdDescriptor);                   \
+  __macro(cudnnGetPoolingNdDescriptor);              \
-  __macro(cudnnDestroyFilterDescriptor);                  \
+  __macro(cudnnDestroyFilterDescriptor);             \
-  __macro(cudnnCreateConvolutionDescriptor);              \
+  __macro(cudnnCreateConvolutionDescriptor);         \
-  __macro(cudnnCreatePoolingDescriptor);                  \
+  __macro(cudnnCreatePoolingDescriptor);             \
-  __macro(cudnnDestroyPoolingDescriptor);                 \
+  __macro(cudnnDestroyPoolingDescriptor);            \
-  __macro(cudnnSetConvolution2dDescriptor);               \
+  __macro(cudnnSetConvolution2dDescriptor);          \
-  __macro(cudnnDestroyConvolutionDescriptor);             \
+  __macro(cudnnDestroyConvolutionDescriptor);        \
-  __macro(cudnnSetConvolutionNdDescriptor);               \
+  __macro(cudnnSetConvolutionNdDescriptor);          \
-  __macro(cudnnGetConvolutionNdDescriptor);               \
+  __macro(cudnnGetConvolutionNdDescriptor);          \
-  __macro(cudnnDeriveBNTensorDescriptor);                 \
+  __macro(cudnnDeriveBNTensorDescriptor);            \
-  __macro(cudnnCreateSpatialTransformerDescriptor);       \
+  __macro(cudnnCreateSpatialTransformerDescriptor);  \
-  __macro(cudnnSetSpatialTransformerNdDescriptor);        \
+  __macro(cudnnSetSpatialTransformerNdDescriptor);   \
-  __macro(cudnnDestroySpatialTransformerDescriptor);      \
+  __macro(cudnnDestroySpatialTransformerDescriptor); \
-  __macro(cudnnSpatialTfGridGeneratorForward);            \
+  __macro(cudnnSpatialTfGridGeneratorForward);       \
-  __macro(cudnnSpatialTfGridGeneratorBackward);           \
+  __macro(cudnnSpatialTfGridGeneratorBackward);      \
-  __macro(cudnnSpatialTfSamplerForward);                  \
+  __macro(cudnnSpatialTfSamplerForward);             \
-  __macro(cudnnSpatialTfSamplerBackward);                 \
+  __macro(cudnnSpatialTfSamplerBackward);            \
-  __macro(cudnnCreate);                                   \
+  __macro(cudnnCreate);                              \
-  __macro(cudnnDestroy);                                  \
+  __macro(cudnnDestroy);                             \
-  __macro(cudnnSetStream);                                \
+  __macro(cudnnSetStream);                           \
-  __macro(cudnnActivationForward);                        \
+  __macro(cudnnActivationForward);                   \
-  __macro(cudnnConvolutionForward);                       \
+  __macro(cudnnConvolutionForward);                  \
-  __macro(cudnnConvolutionBackwardBias);                  \
+  __macro(cudnnConvolutionBackwardBias);             \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize);       \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize);  \
-  __macro(cudnnTransformTensor);                          \
+  __macro(cudnnTransformTensor);                     \
-  __macro(cudnnPoolingForward);                           \
+  __macro(cudnnPoolingForward);                      \
-  __macro(cudnnPoolingBackward);                          \
+  __macro(cudnnPoolingBackward);                     \
-  __macro(cudnnSoftmaxBackward);                          \
+  __macro(cudnnSoftmaxBackward);                     \
-  __macro(cudnnSoftmaxForward);                           \
+  __macro(cudnnSoftmaxForward);                      \
-  __macro(cudnnGetVersion);                               \
+  __macro(cudnnGetVersion);                          \
  __macro(cudnnFindConvolutionForwardAlgorithmEx);        \
  __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
  __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);   \
  __macro(cudnnGetErrorString);
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
--- a/python/paddle/fluid/init.py
+++ b/python/paddle/fluid/init.py
@ -127,8 +127,7 @@ def __bootstrap__():
    if core.is_compiled_with_cuda():
        read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
+            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
            'conv_workspace_size_limit', 'cudnn_exhaustive_search'
        ]
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@ -27,7 +27,6 @@ from .tensor import concat
 from . import utils
 from .. import unique_name
 from functools import reduce
 from .. import core
 __all__ = [
    'fc',
@ -1665,20 +1664,6 @@ def conv2d(input,
    pre_bias = helper.create_variable_for_type_inference(dtype)
    if use_cudnn:
        helper.create_variable(
            name="kCUDNNFwdAlgoCache",
            persistable=True,
            type=core.VarDesc.VarType.RAW)
        helper.create_variable(
            name="kCUDNNBwdDataAlgoCache",
            persistable=True,
            type=core.VarDesc.VarType.RAW)
        helper.create_variable(
            name="kCUDNNBwdFilterAlgoCache",
            persistable=True,
            type=core.VarDesc.VarType.RAW)
    helper.append_op(
        type=l_type,
        inputs={
@ -1692,7 +1677,7 @@ def conv2d(input,
            'dilations': dilation,
            'groups': groups,
            'use_cudnn': use_cudnn,
-            'use_mkldnn': False,
+            'use_mkldnn': False
        })
    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@ -67,7 +67,6 @@ class TestConv2dOp(OpTest):
    def setUp(self):
        self.op_type = "conv2d"
        self.use_cudnn = False
        self.exhaustive_search = False
        self.use_cuda = False
        self.use_mkldnn = False
        self.data_format = "AnyLayout"
@ -99,8 +98,7 @@ class TestConv2dOp(OpTest):
            'dilations': self.dilations,
            'use_cudnn': self.use_cudnn,
            'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format,
+            'data_format': self.data_format
            'exhaustive_search': self.exhaustive_search
        }
        self.outputs = {'Output': output}
@ -394,12 +392,6 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
        self.op_type = "depthwise_conv2d"
 class TestCUDNNExhaustiveSearch(TestCUDNN):
    """TestCUDNN variant that sets both use_cudnn and exhaustive_search to True."""
    def init_kernel_type(self):
        # Both flags are switched on together; use_cudnn is assigned first.
        self.use_cudnn = self.exhaustive_search = True
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@ -335,12 +335,6 @@ class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
                self.check_output_with_place(place, atol=2e-2)
 class TestCUDNNExhaustiveSearch(TestCUDNN):
    """TestCUDNN variant that sets both use_cudnn and exhaustive_search to True."""
    def init_kernel_type(self):
        # Both flags are switched on together; use_cudnn is assigned first.
        self.use_cudnn = self.exhaustive_search = True
 # FIXME(typhoonzero): find a way to determine if
 # using cudnn > 6 in python
 # class TestWithDilationCUDNN(TestWithDilation):