Merge pull request #14284 from PaddlePaddle/revert-14043-conv_cudnn_cache

Revert " Exhaustive search for cuDNN conv."
7 years ago · 0953cd3e16
parent ce7d9b0799 db8c52da5e
commit 0953cd3e16
14 changed files with 74 additions and 381 deletions
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <algorithm>
 #include <array>
 #include <string>
 #include <vector>
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@ -13,8 +13,6 @@
 // limitations under the License.

 #pragma once
-#include <algorithm>
-#include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@ -16,14 +16,13 @@

 #include <glog/logging.h>
 #include <sys/time.h>
-#include <algorithm>
 #include <chrono>  // NOLINT
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle_inference_api.h"

 namespace paddle {
 namespace inference {
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@ -59,8 +59,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 bool IsPersistable(const framework::VarDesc* var) {
  if (var->Persistable() &&
      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST &&
-      var->GetType() != framework::proto::VarType::RAW) {
+      var->GetType() != framework::proto::VarType::FETCH_LIST) {
    return true;
  }
  return false;
--- a/paddle/fluid/operators/add_position_encoding_op.h
+++ b/paddle/fluid/operators/add_position_encoding_op.h
@ -66,10 +66,9 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
          x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
      for (int j = 0; j < max_length; ++j) {
        for (int k = 0; k < half_size; ++k) {
-          const double val =
-              (half_size > 1)
-                  ? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
-                  : j / 10000.0;
+          const double val = (half_size > 1)
+                                 ? j / pow(10000.0, double(k) / (half_size - 1))
+                                 : j / 10000.0;
          dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
          dst_ptr[half_size + k] =
              src_ptr[half_size + k] * alpha + cos(val) * beta;
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <unordered_map>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-template <typename TAlgorithm>
-class AlgorithmsCache {
- public:
-  // Caches the best algorithm for a given
-  // combination of tensor dimensions & compute data type.
-  TAlgorithm GetAlgorithm(
-      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const std::vector<int>& dilations,
-      int algorithmFlags,  // can set for different data type
-      std::function<TAlgorithm()> gen_func);
-
- private:
-  std::unordered_map<int64_t, TAlgorithm> hash_;
-  std::mutex mutex_;
-};
-
-template <typename TAlgorithm>
-TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
-    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-    const std::vector<int>& strides, const std::vector<int>& paddings,
-    const std::vector<int>& dilations, int algorithmFlags,
-    std::function<TAlgorithm()> gen_func) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  int64_t seed = 0;
-  // Hash all of the inputs, use to try and look up a previously
-  // discovered algorithm, or fall back to generating a new one.
-  std::hash<int64_t> hashFn;
-  // do hash like boost
-  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
-  for (const auto num : dims1) {
-    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-  }
-
-  for (const auto num : dims2) {
-    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
-  }
-
-  for (const auto num : strides) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 2;
-  }
-
-  for (const auto num : paddings) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 3;
-  }
-
-  for (const auto num : dilations) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 4;
-  }
-
-  seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
-          (seed << 6) + (seed >> 2) + 5;
-
-  if (seed == 0) return gen_func();
-
-  if (hash_.find(seed) == hash_.end()) {
-    TAlgorithm value = gen_func();
-    hash_[seed] = value;
-  }
-  return hash_[seed];
-}
-
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@ -189,11 +189,6 @@ void Conv2DOpMaker::Make() {
               "workspace size can increase performance but also requires "
               "better hardware. This size should be chosen carefully.")
      .SetDefault(4096);
-  AddAttr<bool>("exhaustive_search",
-                "(bool, default false) cuDNN has many algorithm to calculation "
-                "convolution, whether enable exhaustive search ",
-                "for cuDNN convolution or not, defalut is False.")
-      .SetDefault(false);
  AddComment(R"DOC(
 Convolution Operator.

@ -288,11 +283,7 @@ void Conv3DOpMaker::Make() {
               "workspace size can increase performance but also requires "
               "better hardware. This size should be chosen carefully.")
      .SetDefault(4096);
-  AddAttr<bool>("exhaustive_search",
-                "(bool, default false) cuDNN has many algorithm to calculation "
-                "convolution, whether enable exhaustive search ",
-                "for cuDNN convolution or not, defalut is False.")
-      .SetDefault(false);
+
  AddComment(R"DOC(
 Convolution3D Operator.

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@ -204,10 +204,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
                          << "." << (driver_version_ % 100) / 10
                          << ", Runtime Version: " << runtime_version_ / 1000
                          << "." << (runtime_version_ % 100) / 10;
-  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
-  LOG(INFO) << "device: " << place_.device
-            << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
-            << (cudnn_dso_ver % 100) / 10 << ".";
+
  callback_manager_.reset(new StreamCallbackManager(stream_));
 }

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@ -65,54 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 * include all needed cudnn functions in HPPL
 * different cudnn version has different interfaces
 **/
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnSetTensor4dDescriptor);                    \
-  __macro(cudnnSetTensor4dDescriptorEx);                  \
-  __macro(cudnnSetTensorNdDescriptor);                    \
-  __macro(cudnnGetTensorNdDescriptor);                    \
-  __macro(cudnnGetConvolutionNdForwardOutputDim);         \
-  __macro(cudnnGetConvolutionForwardAlgorithm);           \
-  __macro(cudnnCreateTensorDescriptor);                   \
-  __macro(cudnnDestroyTensorDescriptor);                  \
-  __macro(cudnnCreateFilterDescriptor);                   \
-  __macro(cudnnSetFilter4dDescriptor);                    \
-  __macro(cudnnSetFilterNdDescriptor);                    \
-  __macro(cudnnGetFilterNdDescriptor);                    \
-  __macro(cudnnSetPooling2dDescriptor);                   \
-  __macro(cudnnSetPoolingNdDescriptor);                   \
-  __macro(cudnnGetPoolingNdDescriptor);                   \
-  __macro(cudnnDestroyFilterDescriptor);                  \
-  __macro(cudnnCreateConvolutionDescriptor);              \
-  __macro(cudnnCreatePoolingDescriptor);                  \
-  __macro(cudnnDestroyPoolingDescriptor);                 \
-  __macro(cudnnSetConvolution2dDescriptor);               \
-  __macro(cudnnDestroyConvolutionDescriptor);             \
-  __macro(cudnnSetConvolutionNdDescriptor);               \
-  __macro(cudnnGetConvolutionNdDescriptor);               \
-  __macro(cudnnDeriveBNTensorDescriptor);                 \
-  __macro(cudnnCreateSpatialTransformerDescriptor);       \
-  __macro(cudnnSetSpatialTransformerNdDescriptor);        \
-  __macro(cudnnDestroySpatialTransformerDescriptor);      \
-  __macro(cudnnSpatialTfGridGeneratorForward);            \
-  __macro(cudnnSpatialTfGridGeneratorBackward);           \
-  __macro(cudnnSpatialTfSamplerForward);                  \
-  __macro(cudnnSpatialTfSamplerBackward);                 \
-  __macro(cudnnCreate);                                   \
-  __macro(cudnnDestroy);                                  \
-  __macro(cudnnSetStream);                                \
-  __macro(cudnnActivationForward);                        \
-  __macro(cudnnConvolutionForward);                       \
-  __macro(cudnnConvolutionBackwardBias);                  \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize);       \
-  __macro(cudnnTransformTensor);                          \
-  __macro(cudnnPoolingForward);                           \
-  __macro(cudnnPoolingBackward);                          \
-  __macro(cudnnSoftmaxBackward);                          \
-  __macro(cudnnSoftmaxForward);                           \
-  __macro(cudnnGetVersion);                               \
-  __macro(cudnnFindConvolutionForwardAlgorithmEx);        \
-  __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
-  __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);   \
+#define CUDNN_DNN_ROUTINE_EACH(__macro)              \
+  __macro(cudnnSetTensor4dDescriptor);               \
+  __macro(cudnnSetTensor4dDescriptorEx);             \
+  __macro(cudnnSetTensorNdDescriptor);               \
+  __macro(cudnnGetTensorNdDescriptor);               \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);    \
+  __macro(cudnnGetConvolutionForwardAlgorithm);      \
+  __macro(cudnnCreateTensorDescriptor);              \
+  __macro(cudnnDestroyTensorDescriptor);             \
+  __macro(cudnnCreateFilterDescriptor);              \
+  __macro(cudnnSetFilter4dDescriptor);               \
+  __macro(cudnnSetFilterNdDescriptor);               \
+  __macro(cudnnGetFilterNdDescriptor);               \
+  __macro(cudnnSetPooling2dDescriptor);              \
+  __macro(cudnnSetPoolingNdDescriptor);              \
+  __macro(cudnnGetPoolingNdDescriptor);              \
+  __macro(cudnnDestroyFilterDescriptor);             \
+  __macro(cudnnCreateConvolutionDescriptor);         \
+  __macro(cudnnCreatePoolingDescriptor);             \
+  __macro(cudnnDestroyPoolingDescriptor);            \
+  __macro(cudnnSetConvolution2dDescriptor);          \
+  __macro(cudnnDestroyConvolutionDescriptor);        \
+  __macro(cudnnSetConvolutionNdDescriptor);          \
+  __macro(cudnnGetConvolutionNdDescriptor);          \
+  __macro(cudnnDeriveBNTensorDescriptor);            \
+  __macro(cudnnCreateSpatialTransformerDescriptor);  \
+  __macro(cudnnSetSpatialTransformerNdDescriptor);   \
+  __macro(cudnnDestroySpatialTransformerDescriptor); \
+  __macro(cudnnSpatialTfGridGeneratorForward);       \
+  __macro(cudnnSpatialTfGridGeneratorBackward);      \
+  __macro(cudnnSpatialTfSamplerForward);             \
+  __macro(cudnnSpatialTfSamplerBackward);            \
+  __macro(cudnnCreate);                              \
+  __macro(cudnnDestroy);                             \
+  __macro(cudnnSetStream);                           \
+  __macro(cudnnActivationForward);                   \
+  __macro(cudnnConvolutionForward);                  \
+  __macro(cudnnConvolutionBackwardBias);             \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize);  \
+  __macro(cudnnTransformTensor);                     \
+  __macro(cudnnPoolingForward);                      \
+  __macro(cudnnPoolingBackward);                     \
+  __macro(cudnnSoftmaxBackward);                     \
+  __macro(cudnnSoftmaxForward);                      \
+  __macro(cudnnGetVersion);                          \
  __macro(cudnnGetErrorString);
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)

--- a/python/paddle/fluid/init.py
+++ b/python/paddle/fluid/init.py
@ -127,8 +127,7 @@ def __bootstrap__():

    if core.is_compiled_with_cuda():
        read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
-            'conv_workspace_size_limit', 'cudnn_exhaustive_search'
+            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
        ]
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@ -27,7 +27,6 @@ from .tensor import concat
 from . import utils
 from .. import unique_name
 from functools import reduce
-from .. import core

 __all__ = [
    'fc',
@ -1665,20 +1664,6 @@ def conv2d(input,

    pre_bias = helper.create_variable_for_type_inference(dtype)

-    if use_cudnn:
-        helper.create_variable(
-            name="kCUDNNFwdAlgoCache",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-        helper.create_variable(
-            name="kCUDNNBwdDataAlgoCache",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-        helper.create_variable(
-            name="kCUDNNBwdFilterAlgoCache",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
    helper.append_op(
        type=l_type,
        inputs={
@ -1692,7 +1677,7 @@ def conv2d(input,
            'dilations': dilation,
            'groups': groups,
            'use_cudnn': use_cudnn,
-            'use_mkldnn': False,
+            'use_mkldnn': False
        })

    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@ -67,7 +67,6 @@ class TestConv2dOp(OpTest):
    def setUp(self):
        self.op_type = "conv2d"
        self.use_cudnn = False
-        self.exhaustive_search = False
        self.use_cuda = False
        self.use_mkldnn = False
        self.data_format = "AnyLayout"
@ -99,8 +98,7 @@ class TestConv2dOp(OpTest):
            'dilations': self.dilations,
            'use_cudnn': self.use_cudnn,
            'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format,
-            'exhaustive_search': self.exhaustive_search
+            'data_format': self.data_format
        }
        self.outputs = {'Output': output}

@ -394,12 +392,6 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
        self.op_type = "depthwise_conv2d"


-class TestCUDNNExhaustiveSearch(TestCUDNN):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.exhaustive_search = True
-
-
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@ -335,12 +335,6 @@ class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
                self.check_output_with_place(place, atol=2e-2)


-class TestCUDNNExhaustiveSearch(TestCUDNN):
-    def init_kernel_type(self):
-        self.use_cudnn = True
-        self.exhaustive_search = True
-
-
 # FIXME(typhoonzero): find a way to determine if
 # using cudnn > 6 in python
 # class TestWithDilationCUDNN(TestWithDilation):