gpu add combine cast fusion

pull/6825/head
VectorSL 4 years ago
parent 2bac83ba1b
commit f36c2721af

@@ -0,0 +1,40 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/cast_all_impl.cuh"
template <typename T, typename S>
__global__ void CastAll(T** inputs, S** output, const size_t num, const size_t *size) {
for (size_t i = 0; i < num; i++) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size[i]; pos += blockDim.x * gridDim.x) {
output[i][pos] = static_cast<S>(inputs[i][pos]);
}
}
}
template <typename T, typename S>
void CastAllKernel(T** inputs, S** output, const size_t max, const size_t num, const size_t *size,
cudaStream_t stream) {
CastAll<<<GET_BLOCKS(max), GET_THREADS, 0, stream>>>(inputs, output, num, size);
return;
}
template void CastAllKernel(half** inputs, float** output, const size_t max, const size_t num,
const size_t *size, cudaStream_t stream);
template void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num,
const size_t *size, cudaStream_t stream);
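Note on the calling convention: CastAll dereferences inputs[i] and output[i] on the device, so the pointer arrays and the per-tensor size array must themselves reside in device memory. Below is a minimal standalone sketch (not part of this commit) of how a caller could stage two float32 tensors and cast them to float16 in one launch; it assumes the MindSpore include paths are available and that cast_all_impl.cu is compiled and linked in.

#include <cstdio>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "backend/kernel_compiler/gpu/cuda_impl/cast_all_impl.cuh"  // assumed to be on the include path

int main() {
  const size_t num = 2;                 // two tensors fused into one launch
  const size_t sizes_h[num] = {4, 6};   // element count of each tensor
  const size_t max_size = 6;            // the largest count drives the grid size

  // Per-tensor device buffers.
  float *in_h[num];
  half *out_h[num];
  for (size_t i = 0; i < num; ++i) {
    cudaMalloc(&in_h[i], sizes_h[i] * sizeof(float));
    cudaMalloc(&out_h[i], sizes_h[i] * sizeof(half));
  }

  // The kernel indexes inputs[i]/output[i] on the GPU, so the pointer arrays
  // and the size array are staged into device memory first.
  float **in_d = nullptr;
  half **out_d = nullptr;
  size_t *sizes_d = nullptr;
  cudaMalloc(&in_d, num * sizeof(float *));
  cudaMalloc(&out_d, num * sizeof(half *));
  cudaMalloc(&sizes_d, num * sizeof(size_t));
  cudaMemcpy(in_d, in_h, num * sizeof(float *), cudaMemcpyHostToDevice);
  cudaMemcpy(out_d, out_h, num * sizeof(half *), cudaMemcpyHostToDevice);
  cudaMemcpy(sizes_d, sizes_h, num * sizeof(size_t), cudaMemcpyHostToDevice);

  CastAllKernel(in_d, out_d, max_size, num, sizes_d, nullptr);  // default stream
  cudaDeviceSynchronize();
  printf("cast done\n");
  return 0;
}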

@@ -0,0 +1,24 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void CastAllKernel(T **inputs, S **output, const size_t max, const size_t num, const size_t *size, cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_

@@ -0,0 +1,28 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
CastAll, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat16),
CastAllGpuFwdKernel, float, half)
MS_REG_GPU_KERNEL_TWO(
CastAll, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat32),
CastAllGpuFwdKernel, half, float)
} // namespace kernel
} // namespace mindspore
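These two registrations cover exactly the dtype pairs instantiated in cast_all_impl.cu, and AddAllSameAttr(true) marks every input (and every output) of the fused node as sharing the single listed type. A hypothetical extension to another pair would mirror the same macro call, for example (sketch only, not in this commit; it would also need a matching explicit instantiation in cast_all_impl.cu):

// Hypothetical only: requires a corresponding
//   template void CastAllKernel(float**, double**, ...);
// instantiation in cast_all_impl.cu before it could link and run.
MS_REG_GPU_KERNEL_TWO(
  CastAll, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat64),
  CastAllGpuFwdKernel, float, double)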

@@ -0,0 +1,104 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
#include <memory>
#include <vector>
#include <map>
#include <string>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/cast_all_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
class CastAllGpuFwdKernel : public GpuKernel {
public:
CastAllGpuFwdKernel() : max_(0), input_size_(0), output_size_(0), num_input_(0) {}
~CastAllGpuFwdKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
auto in_addr = std::make_unique<T *[]>(num_input_);
auto out_addr = std::make_unique<S *[]>(num_input_);
for (size_t i = 0; i < num_input_; i++) {
in_addr[i] = GetDeviceAddress<T>(inputs, i);
out_addr[i] = GetDeviceAddress<S>(outputs, i);
}
T **inputs_dev = GetDeviceAddress<T *>(workspace, 0);
S **outputs_dev = GetDeviceAddress<S *>(workspace, 1);
size_t *size_dev = GetDeviceAddress<size_t>(workspace, 2);
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemcpyAsync(inputs_dev, in_addr.get(), sizeof(T *) * num_input_, cudaMemcpyHostToDevice, stream),
"cudaMemCPY failed")
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemcpyAsync(outputs_dev, out_addr.get(), sizeof(S *) * num_input_, cudaMemcpyHostToDevice, stream),
"cudaMemCPY failed")
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, cudaMemcpyHostToDevice, stream),
"cudaMemCPY failed")
CastAllKernel(inputs_dev, outputs_dev, max_, num_input_, size_dev, stream);
return true;
}
bool Init(const CNodePtr &kernel_node) override {
num_input_ = GetAttr<size_t>(kernel_node, "n");
size_ = std::make_unique<size_t[]>(num_input_);
for (size_t i = 0; i < num_input_; i++) {
auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
size_t s = 1;
for (auto x : shape) {
s = s * x;
}
if (max_ < s) {
max_ = s;
}
size_[i] = s;
input_size_ = sizeof(T) * s;
output_size_ = sizeof(S) * s;
InitSizeLists();
}
workspace_size_list_.push_back(sizeof(T *) * num_input_);
workspace_size_list_.push_back(sizeof(S *) * num_input_);
workspace_size_list_.push_back(sizeof(size_t) * num_input_);
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
}
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
std::unique_ptr<size_t[]> size_;
size_t max_;
size_t input_size_;
size_t output_size_;
size_t num_input_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
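To make the size bookkeeping concrete: Init() calls InitSizeLists() once per fused input, so input_size_list_ and output_size_list_ get one entry per tensor, and three workspace buffers hold the staged input-pointer array, output-pointer array, and element-count array that Launch() copies to the device. An illustrative host-only walk-through (not MindSpore code) for two float32-to-float16 casts of shapes [2,3] and [4]:

#include <cstdio>
#include <vector>

int main() {
  const std::vector<std::vector<size_t>> shapes = {{2, 3}, {4}};  // two fused tensors
  size_t max_elems = 0;
  for (const auto &shape : shapes) {
    size_t elems = 1;
    for (size_t dim : shape) elems *= dim;
    if (elems > max_elems) max_elems = elems;
    // float32 input, float16 output
    printf("input bytes: %zu, output bytes: %zu\n", elems * sizeof(float), elems * 2);
  }
  const size_t n = shapes.size();
  // workspace: input-pointer array, output-pointer array, size array (64-bit build assumed)
  printf("workspace bytes: %zu, %zu, %zu\n", n * sizeof(float *), n * sizeof(void *), n * sizeof(size_t));
  printf("max elements (drives the grid size): %zu\n", max_elems);
  return 0;
}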

@@ -0,0 +1,140 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/combine_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
namespace {
kernel::KernelBuildInfoPtr GenerateKernelBuildInfo(const std::vector<AnfNodePtr> &node_list) {
std::vector<std::string> inputs_device_format;
std::vector<std::string> outputs_device_format;
std::vector<TypeId> inputs_device_type;
std::vector<TypeId> outputs_device_type;
std::vector<std::vector<size_t>> outputs_shape;
kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
for (size_t idx = 0; idx < node_list.size(); ++idx) {
auto cnode = utils::cast<CNodePtr>(node_list[idx]);
MS_EXCEPTION_IF_NULL(cnode);
for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(cnode); ++input_index) {
inputs_device_format.push_back(kOpFormat_DEFAULT);
inputs_device_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index));
}
for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(cnode); ++output_index) {
outputs_device_format.push_back(kOpFormat_DEFAULT);
outputs_device_type.push_back(AnfAlgo::GetOutputInferDataType(cnode, output_index));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(cnode, output_index));
}
}
builder.SetInputsFormat(inputs_device_format);
builder.SetOutputsFormat(outputs_device_format);
builder.SetInputsDeviceType(inputs_device_type);
builder.SetOutputsDeviceType(outputs_device_type);
return builder.Build();
}
bool GetDealList(const std::vector<AnfNodePtr> &node_list, std::vector<std::vector<AnfNodePtr>> *deal_list) {
std::vector<AnfNodePtr> cast_32to16_list;
std::vector<AnfNodePtr> cast_16to32_list;
for (auto &cast_node : node_list) {
// Currently we only handle the pattern [Parameter -> Cast] to avoid creating a cycle in the graph.
if (cast_node != nullptr && cast_node->isa<CNode>() && AnfAlgo::GetCNodeName(cast_node) == "Cast" &&
(AnfAlgo::GetInputNode(utils::cast<CNodePtr>(cast_node), 0))->isa<Parameter>()) {
auto dst = AnfAlgo::GetOutputInferDataType(cast_node, 0);
auto src = AnfAlgo::GetPrevNodeOutputInferDataType(cast_node, 0);
if (dst == kNumberTypeFloat16 && src == kNumberTypeFloat32) {
cast_32to16_list.push_back(cast_node);
} else if (dst == kNumberTypeFloat32 && src == kNumberTypeFloat16) {
cast_16to32_list.push_back(cast_node);
}
}
}
if (cast_32to16_list.size() <= 1 && cast_16to32_list.size() <= 1) {
return false;
}
if (cast_32to16_list.size() > 1) {
deal_list->push_back(cast_32to16_list);
}
if (cast_16to32_list.size() > 1) {
deal_list->push_back(cast_16to32_list);
}
return true;
}
} // namespace
bool CastAllFusion::Run(const FuncGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
auto manager = graph->manager();
std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return());
// 1. Collect all the Cast nodes that can be fused.
std::vector<std::vector<AnfNodePtr>> deal_list;
if (!GetDealList(node_list, &deal_list)) {
return false;
}
for (auto cast_list : deal_list) {
// 2. Create the fused CastAll node.
auto prim = std::make_shared<Primitive>("CastAll");
std::vector<AnfNodePtr> inputs = {NewValueNode(prim)};
// Set the CastAll inputs to the original Cast inputs (the Parameters).
for (size_t idx = 0; idx < cast_list.size(); ++idx) {
inputs.push_back(AnfAlgo::GetInputNode(utils::cast<CNodePtr>(cast_list[idx]), 0));
}
auto cast_all = graph->NewCNode(inputs);
auto kernel_info = std::make_shared<device::KernelInfo>();
MS_EXCEPTION_IF_NULL(kernel_info);
cast_all->set_kernel_info(kernel_info);
AbstractBasePtrList abstract_list;
for (size_t idx = 0; idx < cast_list.size(); ++idx) {
auto cnode = utils::cast<CNodePtr>(cast_list[idx]);
MS_EXCEPTION_IF_NULL(cnode);
abstract_list.push_back(cnode->abstract());
}
auto kernel_build_info = GenerateKernelBuildInfo(cast_list);
AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info, cast_all.get());
auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list);
MS_EXCEPTION_IF_NULL(abstract_tuple);
cast_all->set_abstract(abstract_tuple);
AnfAlgo::SetNodeAttr("n", MakeValue(cast_list.size()), cast_all);
// 3. Replace each original Cast with TupleGetItem(CastAll, idx).
for (size_t idx = 0; idx < cast_list.size(); ++idx) {
std::vector<AnfNodePtr> tuple_getitem_input;
tuple_getitem_input.push_back(NewValueNode(prim::kPrimTupleGetItem));
tuple_getitem_input.push_back(cast_all);
auto index = NewValueNode(SizeToInt(idx));
auto imm = std::make_shared<Int32Imm>(idx);
auto abstract_scalar = std::make_shared<abstract::AbstractScalar>(imm);
MS_EXCEPTION_IF_NULL(abstract_scalar);
index->set_abstract(abstract_scalar);
tuple_getitem_input.push_back(index);
AnfNodePtr tuple_getitem = graph->NewCNode(tuple_getitem_input);
MS_EXCEPTION_IF_NULL(tuple_getitem);
tuple_getitem->set_abstract(cast_list[idx]->abstract());
if (!manager->Replace(cast_list[idx], tuple_getitem)) {
MS_LOG(EXCEPTION) << "manager replace node failed";
}
}
}
return true;
}
} // namespace opt
} // namespace mindspore

@@ -0,0 +1,34 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_
#include <memory>
#include <string>
#include <vector>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class CastAllFusion : public Pass {
public:
explicit CastAllFusion(const std::string &name) : Pass("cast_all") {}
~CastAllFusion() override = default;
bool Run(const FuncGraphPtr &graph) override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_
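The pass itself does not run until it is added to the GPU optimization pipeline, which this diff does not show. A rough sketch of that wiring, assuming the usual opt::GraphOptimizer / opt::PassManager API (the names and insertion point are assumptions, not part of this commit):

// Sketch only: register CastAllFusion so it runs during GPU graph optimization.
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
pm->AddPass(std::make_shared<opt::CastAllFusion>("cast_all_fusion"));
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);  // kernel_graph: the FuncGraphPtr being compiled

After the pass runs, every qualifying Parameter -> Cast edge is rewritten to Parameter -> CastAll -> TupleGetItem, so a network with many mixed-precision parameter casts issues one fused kernel launch instead of one launch per parameter.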