diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cu
new file mode 100644
index 0000000000..9ffcf3245a
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cu
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cuh"
+#include "runtime/device/gpu/cuda_common.h"
+
+template <typename T>
+__global__ void CalReLUGradKernel(int size, T *dy, T *y, T *dx) {
+  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    dx[pos] = y[pos] > static_cast<T>(0) ? dy[pos] : static_cast<T>(0);
+  }
+}
+
+template <typename T>
+void CalReLUGrad(int size, T *dy, T *y, T *dx, cudaStream_t cuda_stream) {
+  CalReLUGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dy, y, dx);
+  return;
+}
+
+template void CalReLUGrad(int size, double *dy, double *y, double *dx, cudaStream_t cuda_stream);
+template void CalReLUGrad(int size, float *dy, float *y, float *dx, cudaStream_t cuda_stream);
+template void CalReLUGrad(int size, half *dy, half *y, half *dx, cudaStream_t cuda_stream);
+template void CalReLUGrad(int size, int8_t *dy, int8_t *y, int8_t *dx, cudaStream_t cuda_stream);
+template void CalReLUGrad(int size, int16_t *dy, int16_t *y, int16_t *dx, cudaStream_t cuda_stream);
+template void CalReLUGrad(int size, int32_t *dy, int32_t *y, int32_t *dx, cudaStream_t cuda_stream);
+template void CalReLUGrad(int size, int64_t *dy, int64_t *y, int64_t *dx, cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cuh
new file mode 100644
index 0000000000..1d1fbbde7c
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cuh
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_
+
+#include "runtime/device/gpu/cuda_common.h"
+template <typename T>
+void CalReLUGrad(int input_size, T *dy, T *y, T *dx, cudaStream_t cuda_stream);
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_impl.cu
index d7290dc4a4..961ad156d2 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/relu_impl.cu
@@ -38,7 +38,6 @@ template void CalReLU(int size, int8_t *input_addr, int8_t *output_addr, cudaStr
 template void CalReLU(int size, int16_t *input_addr, int16_t *output_addr, cudaStream_t cuda_stream);
 template void CalReLU(int size, int32_t *input_addr, int32_t *output_addr, cudaStream_t cuda_stream);
 template void CalReLU(int size, int64_t *input_addr, int64_t *output_addr, cudaStream_t cuda_stream);
-template void CalReLU(int size, uint8_t *input_addr, uint8_t *output_addr, cudaStream_t cuda_stream);
 
 template <typename T>
 __global__ void ReluV2Kernel(const size_t num, const T *x, T *y, uint32_t *mask) {
@@ -79,7 +78,6 @@ template void ReluV2(const size_t num, const int8_t *x, int8_t *y, uint32_t *mas
 template void ReluV2(const size_t num, const int16_t *x, int16_t *y, uint32_t *mask, cudaStream_t cuda_stream);
 template void ReluV2(const size_t num, const int32_t *x, int32_t *y, uint32_t *mask, cudaStream_t cuda_stream);
 template void ReluV2(const size_t num, const int64_t *x, int64_t *y, uint32_t *mask, cudaStream_t cuda_stream);
-template void ReluV2(const size_t num, const uint8_t *x, uint8_t *y, uint32_t *mask, cudaStream_t cuda_stream);
 
 template void ReluGradV2(const size_t num, const double *dy, const uint32_t *mask, double *dx,
         cudaStream_t cuda_stream);
@@ -93,5 +91,3 @@ template void ReluGradV2(const size_t num, const int32_t *dy, const uint32_t *ma
         cudaStream_t cuda_stream);
 template void ReluGradV2(const size_t num, const int64_t *dy, const uint32_t *mask, int64_t *dx,
         cudaStream_t cuda_stream);
-template void ReluGradV2(const size_t num, const uint8_t *dy, const uint32_t *mask, uint8_t *dx,
-        cudaStream_t cuda_stream);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.cc
index e4d781db17..e69167cae1 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.cc
@@ -18,34 +18,6 @@
 
 namespace mindspore {
 namespace kernel {
-MS_REG_GPU_KERNEL_ONE(
-  ReluGrad,
-  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
-  ActivationGradGpuKernel, double)
-MS_REG_GPU_KERNEL_ONE(
-  ReluGrad,
-  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-  ActivationGradGpuKernel, float)
-MS_REG_GPU_KERNEL_ONE(
-  ReluGrad,
-  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-  ActivationGradGpuKernel, half)
-MS_REG_GPU_KERNEL_ONE(
-  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
-  ActivationGradGpuKernel, int64_t)
-MS_REG_GPU_KERNEL_ONE(
-  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-  ActivationGradGpuKernel, int32_t)
-MS_REG_GPU_KERNEL_ONE(
-  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
-  ActivationGradGpuKernel, int16_t)
-MS_REG_GPU_KERNEL_ONE(
-  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
-  ActivationGradGpuKernel, int8_t)
-MS_REG_GPU_KERNEL_ONE(
-  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
-  ActivationGradGpuKernel, uint8_t)
-
 MS_REG_GPU_KERNEL_ONE(
   ReLU6Grad,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h
index b97f7b93bd..30fabdb452 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h
@@ -42,7 +42,7 @@ class ActivationGradGpuKernel : public GpuKernel {
     }
     T *dy = nullptr;
     T *y = nullptr;
-    if (mode_ == CUDNN_ACTIVATION_RELU || mode_ == CUDNN_ACTIVATION_ELU || mode_ == CUDNN_ACTIVATION_CLIPPED_RELU) {
+    if (mode_ == CUDNN_ACTIVATION_ELU || mode_ == CUDNN_ACTIVATION_CLIPPED_RELU) {
       dy = GetDeviceAddress<T>(inputs, 0);
       y = GetDeviceAddress<T>(inputs, 1);
     } else {
@@ -125,7 +125,7 @@ class ActivationGradGpuKernel : public GpuKernel {
   void ResetResource() noexcept override {
     cudnn_handle_ = nullptr;
     activation_desc_ = nullptr;
-    mode_ = CUDNN_ACTIVATION_RELU;
+    mode_ = CUDNN_ACTIVATION_SIGMOID;
     data_descriptor_ = nullptr;
     is_null_input_ = false;
     input_size_list_.clear();
@@ -154,8 +154,7 @@ class ActivationGradGpuKernel : public GpuKernel {
   }
 
  private:
-  std::map<std::string, cudnnActivationMode_t> kernel_map = {{"ReluGrad", CUDNN_ACTIVATION_RELU},
-                                                             {"ReLU6Grad", CUDNN_ACTIVATION_CLIPPED_RELU},
+  std::map<std::string, cudnnActivationMode_t> kernel_map = {{"ReLU6Grad", CUDNN_ACTIVATION_CLIPPED_RELU},
                                                              {"TanhGrad", CUDNN_ACTIVATION_TANH},
                                                              {"EluGrad", CUDNN_ACTIVATION_ELU},
                                                              {"SigmoidGrad", CUDNN_ACTIVATION_SIGMOID}};
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_gpu_kernel.cc
index 2556df5bc8..ebc04534d4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_gpu_kernel.cc
@@ -32,7 +32,5 @@ MS_REG_GPU_KERNEL_ONE(ReLU, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutpu
                       ReLUGpuFwdKernel, int16_t)
 MS_REG_GPU_KERNEL_ONE(ReLU, KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), ReLUGpuFwdKernel,
                       int8_t)
-MS_REG_GPU_KERNEL_ONE(ReLU, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
-                      ReLUGpuFwdKernel, uint8_t)
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_gpu_kernel.cc
new file mode 100644
index 0000000000..fbfb62e887
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_gpu_kernel.cc
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/nn/relu_grad_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_ONE(
+  ReluGrad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
+  ReluGradGpuFwdKernel, double)
+MS_REG_GPU_KERNEL_ONE(
+  ReluGrad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  ReluGradGpuFwdKernel, float)
+MS_REG_GPU_KERNEL_ONE(
+  ReluGrad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  ReluGradGpuFwdKernel, half)
+MS_REG_GPU_KERNEL_ONE(
+  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
+  ReluGradGpuFwdKernel, int64_t)
+MS_REG_GPU_KERNEL_ONE(
+  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+  ReluGradGpuFwdKernel, int32_t)
+MS_REG_GPU_KERNEL_ONE(
+  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
+  ReluGradGpuFwdKernel, int16_t)
+MS_REG_GPU_KERNEL_ONE(
+  ReluGrad, KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
+  ReluGradGpuFwdKernel, int8_t)
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_gpu_kernel.h
new file mode 100644
index 0000000000..a00c459bfe
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_gpu_kernel.h
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_RELU_GRAD_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_RELU_GRAD_KERNEL_H_
+
+#include <vector>
+#include <map>
+#include <string>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+#include "backend/kernel_compiler/gpu/kernel_constants.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cuh"
+
+namespace mindspore {
+namespace kernel {
+template <typename T>
+class ReluGradGpuFwdKernel : public GpuKernel {
+ public:
+  ReluGradGpuFwdKernel() { ResetResource(); }
+  ~ReluGradGpuFwdKernel() override = default;
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    if (is_null_input_) {
+      return true;
+    }
+    T *dy = GetDeviceAddress<T>(inputs, 0);
+    T *y = GetDeviceAddress<T>(inputs, 1);
+    T *dx = GetDeviceAddress<T>(outputs, 0);
+
+    const int size = input_size_ / sizeof(T);
+    CalReLUGrad(size, dy, y, dx, reinterpret_cast<cudaStream_t>(stream_ptr));
+
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    InitResource();
+    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    if (input_num != 2) {
+      MS_LOG(ERROR) << "Argument number is " << input_num << ", but ReluGradGpuKernel needs 2.";
+      return false;
+    }
+    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
+    is_null_input_ = CHECK_NULL_INPUT(input_shape);
+    if (is_null_input_) {
+      MS_LOG(WARNING) << "ActivationGradGpuKernel input is null.";
+    }
+    size_t size = 1;
+    for (size_t i = 0; i < input_shape.size(); i++) {
+      size *= input_shape[i];
+    }
+    input_size_ = size * sizeof(T);
+
+    InitSizeLists();
+    return true;
+  }
+  void ResetResource() noexcept override {
+    is_null_input_ = false;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+    input_size_ = 0;
+  }
+
+ protected:
+  void InitSizeLists() override {
+    input_size_list_.push_back(input_size_);
+    output_size_list_.push_back(input_size_);
+    input_size_list_.push_back(input_size_);
+  }
+
+ private:
+  bool is_null_input_;
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+  size_t input_size_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_RELU_GRAD_KERNEL_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_v2_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_v2_gpu_kernel.cc
index 3e85ceda30..b0dd73a8a2 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_v2_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_grad_v2_gpu_kernel.cc
@@ -45,10 +45,5 @@ MS_REG_GPU_KERNEL_ONE(
   ReluGradV2,
   KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeInt64),
   ReluGradV2GpuKernel, int64_t)
-MS_REG_GPU_KERNEL_ONE(
-  ReluGradV2,
-  KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt8),
-  ReluGradV2GpuKernel, uint8_t)
-
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_v2_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_v2_gpu_kernel.cc
index 9fa07bd1ff..900f4e2cd3 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_v2_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/relu_v2_gpu_kernel.cc
@@ -42,8 +42,5 @@ MS_REG_GPU_KERNEL_ONE(
 MS_REG_GPU_KERNEL_ONE(
   ReLUV2, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt32),
   ReluV2GpuKernel, int64_t)
-MS_REG_GPU_KERNEL_ONE(
-  ReLUV2, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt32),
-  ReluV2GpuKernel, uint8_t)
 }  // namespace kernel
 }  // namespace mindspore