From ee87860eeb7107420296e39ed9259f287649ebe1 Mon Sep 17 00:00:00 2001 From: yang_chun Date: Thu, 25 Mar 2021 13:06:08 +0800 Subject: [PATCH] Reduce/Transpose/TensorAdd CPU kernel performance improve! --- .../cpu/mkldnn/tensoradd_cpu_kernel.cc | 65 ---- .../kernel_compiler/cpu/reduce_cpu_kernel.cc | 277 +++++++++--------- .../kernel_compiler/cpu/reduce_cpu_kernel.h | 17 +- .../cpu/tensoradd_cpu_kernel.cc | 150 ++++++++++ .../cpu/{mkldnn => }/tensoradd_cpu_kernel.h | 15 +- .../cpu/transpose_cpu_kernel.cc | 122 +++++--- .../cpu/transpose_cpu_kernel.h | 8 +- 7 files changed, 388 insertions(+), 266 deletions(-) delete mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/tensoradd_cpu_kernel.cc create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc rename mindspore/ccsrc/backend/kernel_compiler/cpu/{mkldnn => }/tensoradd_cpu_kernel.h (68%) diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/tensoradd_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/tensoradd_cpu_kernel.cc deleted file mode 100644 index 0e6330192e..0000000000 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/tensoradd_cpu_kernel.cc +++ /dev/null @@ -1,65 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "backend/kernel_compiler/cpu/mkldnn/tensoradd_cpu_kernel.h" -#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" -#include "runtime/device/cpu/cpu_device_address.h" -#include "utils/ms_utils.h" - -namespace mindspore { -namespace kernel { -void TensorAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - std::vector src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - std::vector src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); - std::vector dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); - need_swap_ = BinaryBroadCast(&src0_shape, &src1_shape, &dst_shape); - dnnl::memory::desc src0_desc; - dnnl::memory::desc src1_desc; - if (need_swap_) { - src0_desc = GetDefaultMemDesc(src1_shape); - src1_desc = GetDefaultMemDesc(src0_shape); - } else { - src0_desc = GetDefaultMemDesc(src0_shape); - src1_desc = GetDefaultMemDesc(src1_shape); - } - dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape); - dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_desc, src1_desc, dst_desc); - auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine()); - primitive_ = std::make_shared(prim_desc); - AddArgument(DNNL_ARG_SRC_0, src0_desc); - AddArgument(DNNL_ARG_SRC_1, src1_desc); - AddArgument(DNNL_ARG_DST, dst_desc); -} - -bool TensorAddCPUKernel::Launch(const std::vector &inputs, - const std::vector & /*workspace*/, - const std::vector &outputs) { - if (inputs.size() < 2 || outputs.empty()) { - MS_LOG(EXCEPTION) << "TensorAdd error input output size!"; - } - if (need_swap_) { - SetArgumentHandle(DNNL_ARG_SRC_0, inputs[1]->addr); - SetArgumentHandle(DNNL_ARG_SRC_1, inputs[0]->addr); - } else { - SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr); - SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr); - } - SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); - ExecutePrimitive(); - return true; -} -} // namespace kernel -} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc index 55100037d7..d24c15662c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc @@ -14,186 +14,171 @@ * limitations under the License. */ +#include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h" #include #include #include -#include -#include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h" -#include "runtime/device/cpu/cpu_device_address.h" +#include namespace mindspore { namespace kernel { -const size_t kReduceTypeMax = 1; -const size_t kReduceTypeMean = 2; -const size_t kReduceTypeSum = 3; -const size_t kReduceTypeMin = 4; -const size_t kMaxDim = 100; -static std::map reduce_types_map_ = { - {"ReduceMax", 1}, {"ReduceMean", 2}, {"ReduceSum", 3}, {"ReduceMin", 4}}; - +namespace { +const size_t kMaxDim = 10; +} // namespace template void ReduceCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); - std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); - - reduce_type_ = reduce_types_map_[kernel_name]; - if (reduce_type_ == 0) { - MS_LOG(EXCEPTION) << "Array reduce kernel type " << kernel_name << " is not supported."; - } - shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - CheckAxis(kernel_node); - if (shape_.empty()) { - shape_.push_back(1); - } - for (size_t i = 0; i < shape_.size(); ++i) { - if (shape_[i] <= 0) { - MS_LOG(EXCEPTION) << "shape value is invalid."; - } - left_dims_ *= shape_[i]; - } - for (size_t i = 0; i < axis_.size(); ++i) { - stride_ *= shape_[axis_[i]]; + input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); + if (axis_addr->isa() || axis_addr->isa()) { + axis_ = AnfAlgo::GetNodeAttr>(kernel_node, AXIS); + } else if (axis_addr->isa()) { + axis_.emplace_back(AnfAlgo::GetNodeAttr(kernel_node, AXIS)); + } else { + MS_LOG(EXCEPTION) << "Attribute is invalid"; } - if (stride_ <= 0) { - MS_LOG(EXCEPTION) << "stride_ must greater than zero."; + int dimension = input_shape_.size(); + std::transform(axis_.begin(), axis_.end(), axis_.begin(), + [dimension](const auto &a) { return a < 0 ? dimension + a : a; }); + sort(axis_.begin(), axis_.end()); + auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); + if (kernel_name == "ReduceMax") { + reduce_type_ = 1; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); }; + } else if (kernel_name == "ReduceMin") { + reduce_type_ = 2; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); }; + } else if (kernel_name == "ReduceSum") { + reduce_type_ = 3; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; + } else if (kernel_name == "ReduceMean") { + reduce_type_ = 4; + reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; + } else { + MS_LOG(EXCEPTION) << "unsupported reduce type: " << reduce_type_; } - left_dims_ = left_dims_ / stride_; + + CheckParameter(); } template bool ReduceCPUKernel::Launch(const std::vector &inputs, const std::vector & /*workspaces*/, const std::vector &outputs) { - size_t out_size = left_dims_ * sizeof(T); - size_t in_size = stride_ * out_size; - if (inputs[0]->size != in_size || outputs[0]->size != out_size) { - MS_LOG(EXCEPTION) << "invalid input or output data size!"; - } - auto input = reinterpret_cast(inputs[0]->addr); - auto output = reinterpret_cast(outputs[0]->addr); - int size = inputs[0]->size / sizeof(T); - std::vector new_input(IntToSize(size), 0.0); - std::vector transpose_axis; - for (size_t i = 0; i < shape_.size(); ++i) { - bool insert = true; - for (size_t j = 0; j < axis_.size(); ++j) { - if (axis_[j] == i) { - insert = false; - break; - } + size_t input_size = inputs[0]->size / sizeof(T); + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + if (axis_.empty()) { + // Get one ret + *output_addr = input_addr[0]; + for (size_t i = 1; i < input_size; ++i) { + reduce_func_(input_addr, i, output_addr); } - if (insert) { - transpose_axis.push_back(i); + if (reduce_type_ == 4) { // 4 is reduce mean + *output_addr /= input_size; + } + } else { + // transpose->calculate strides->calculate ret + std::vector out_shape; + std::vector strides; + std::vector back_strides; + size_t stride; + CalculateTransposeInfo(&out_shape, &strides, &back_strides, &stride); + int dimension = input_shape_.size(); + std::vector coordinates(dimension); + auto get_next_pos = [&coordinates, &out_shape, &strides, &back_strides, &dimension](size_t &curr_pos) { + for (int i = dimension - 1; i >= 0; --i) { + if (coordinates[i] + 1 == out_shape[i]) { + coordinates[i] = 0; + curr_pos -= back_strides[i]; + } else { + coordinates[i]++; + curr_pos += strides[i]; + break; + } + } + }; + size_t output_size = outputs[0]->size / sizeof(T); + size_t pos = 0; + for (size_t i = 0; i < output_size; ++i) { + if (i != 0) { + get_next_pos(pos); + } + output_addr[i] = input_addr[pos]; + for (size_t j = 1; j < stride; ++j) { + get_next_pos(pos); + reduce_func_(input_addr, pos, &output_addr[i]); + } + if (reduce_type_ == 4) { // 4 is reduce mean + output_addr[i] /= stride; + } } } - (void)transpose_axis.insert(transpose_axis.end(), axis_.begin(), axis_.end()); - Transpose(size, input, shape_, transpose_axis, SizeToInt(shape_.size()), &new_input[0]); - ConvertDataToOutput(&new_input[0], output); return true; } template -void ReduceCPUKernel::CheckAxis(const CNodePtr &kernel_node) { - auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); - if (axis_addr->isa() || axis_addr->isa()) { - std::vector attr_axis; - std::vector attr_axis_me = AnfAlgo::GetNodeAttr>(kernel_node, AXIS); - (void)std::transform(attr_axis_me.begin(), attr_axis_me.end(), std::back_inserter(attr_axis), - [](const int64_t &value) { return static_cast(value); }); - if (attr_axis.size() > shape_.size()) { - MS_LOG(EXCEPTION) << "invalid axis size: " << axis_.size(); - } else if (attr_axis.empty()) { - for (size_t i = 0; i < shape_.size(); ++i) { - axis_.push_back(i); - } +void ReduceCPUKernel::CalculateTransposeInfo(std::vector *new_shape, std::vector *strides, + std::vector *back_strides, size_t *stride) const { + int dimension = input_shape_.size(); + std::vector input_strides(dimension); + input_strides[dimension - 1] = 1; + for (int i = dimension - 2; i >= 0; --i) { + input_strides[i] = input_shape_[i + 1] * input_strides[i + 1]; + } + + // Calculate transpose axes and stride + std::vector axes(dimension); + int j = 0; + int k = 0; + *stride = 1; + for (int i = 0; i < dimension; ++i) { + if (i != axis_[j]) { + axes[k] = i; + ++k; } else { - for (auto axis : attr_axis) { - while (axis < 0) { - axis += SizeToInt(shape_.size()); - } - if (IntToSize(axis) >= (shape_.size())) { - MS_LOG(EXCEPTION) << "axis value is oversize."; - } - axis_.push_back(IntToSize(axis)); - } - } - } else if (axis_addr->isa()) { - int axis = static_cast(AnfAlgo::GetNodeAttr(kernel_node, AXIS)); - while (axis < 0) { - axis += SizeToInt(shape_.size()); - } - if (IntToSize(axis) >= shape_.size()) { - MS_LOG(EXCEPTION) << "axis value is oversize."; + *stride *= input_shape_[i]; + ++j; } - axis_.push_back(IntToSize(axis)); - } else { - MS_LOG(EXCEPTION) << "Attribute axis type is invalid."; } -} + for (auto &it : axis_) { + axes[k] = it; + ++k; + } -template -void ReduceCPUKernel::ConvertDataToOutput(const T *new_input, T *output) { - if (reduce_type_ == kReduceTypeMax || reduce_type_ == kReduceTypeMin) { - for (size_t i = 0; i < left_dims_; ++i) { - T value = new_input[i * stride_]; - for (size_t k = 0; k < stride_; ++k) { - if (reduce_type_ == kReduceTypeMax) { - if (value < new_input[i * stride_ + k]) { - value = new_input[i * stride_ + k]; - } - } else { - if (value > new_input[i * stride_ + k]) { - value = new_input[i * stride_ + k]; - } - } - } - output[i] = value; - } - } else if (reduce_type_ == kReduceTypeMean || reduce_type_ == kReduceTypeSum) { - for (size_t i = 0; i < left_dims_; ++i) { - T value = 0.0; - for (size_t k = 0; k < stride_; ++k) { - value += new_input[i * stride_ + k]; - } - if (reduce_type_ == kReduceTypeMean) { - output[i] = value / stride_; - } else { - output[i] = value; - } - } - } else { - MS_LOG(EXCEPTION) << "Array reduce kernel type " << reduce_type_ << " is not supported."; + // Calculate strides, new_shape, back strides + strides->resize(dimension); + new_shape->resize(dimension); + back_strides->resize(dimension); + for (int i = dimension - 1; i >= 0; --i) { + (*strides)[i] = input_strides[axes[i]]; + (*new_shape)[i] = input_shape_[axes[i]]; + (*back_strides)[i] = ((*new_shape)[i] - 1) * (*strides)[i]; } } template -void ReduceCPUKernel::Transpose(const int size, const T *input, const std::vector &input_shape, - const std::vector &input_axis, const int shape_size, T *output) { - int size_offset[kMaxDim]; - size_offset[0] = size / SizeToInt(input_shape[0]); - for (int i = 1; i < shape_size; ++i) { - size_offset[i] = size_offset[i - 1] / SizeToInt(input_shape[i]); +void ReduceCPUKernel::CheckParameter() const { + if (input_shape_.empty() || input_shape_.size() > kMaxDim) { + MS_LOG(EXCEPTION) << "Invalid input tensor of dimension: " << input_shape_.size(); } - auto task = [&](size_t start, size_t end) { - int pos_array[kMaxDim]; - for (size_t position = start; position < end; position += 1) { - size_t temp_position = position; - pos_array[0] = temp_position / size_offset[0]; - for (int i = 1; i < shape_size; ++i) { - temp_position -= pos_array[i - 1] * size_offset[i - 1]; - pos_array[i] = temp_position / size_offset[i]; - } - size_t new_position = pos_array[SizeToInt(input_axis[shape_size - 1])]; - size_t new_position_size = 1; - for (int j = shape_size - 2; j >= 0; j--) { - new_position_size *= SizeToInt(input_shape[SizeToInt(input_axis[j + 1])]); - new_position += pos_array[SizeToInt(input_axis[j])] * new_position_size; - } - output[new_position] = input[position]; + + if (axis_.empty()) { + MS_LOG(INFO) << "axis is empty"; + return; + } + + std::unordered_set checker(axis_.begin(), axis_.end()); + if (checker.size() != axis_.size()) { + MS_LOG(EXCEPTION) << "Duplicate value in axis"; + } + + int maxDimension = input_shape_.size(); + for (auto &axis : axis_) { + if (axis >= maxDimension) { + MS_LOG(EXCEPTION) << "Invalid value in axis: " << axis; } - }; - CPUKernelUtils::ParallelFor(task, size); - return; + } } } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h index 6f5b356275..acbb3ad070 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "backend/kernel_compiler/cpu/cpu_kernel.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" @@ -33,15 +34,13 @@ class ReduceCPUKernel : public CPUKernel { const std::vector &outputs) override; private: - void Transpose(const int size, const T *input, const std::vector &input_shape, - const std::vector &input_axis, const int shape_size, T *output); - void ConvertDataToOutput(const T *input, T *output); - void CheckAxis(const CNodePtr &kernel_node); - size_t reduce_type_ = 0; - std::vector axis_; - std::vector shape_; - size_t left_dims_ = 1; - size_t stride_ = 1; + void CheckParameter() const; + void CalculateTransposeInfo(std::vector *new_shape, std::vector *strides, + std::vector *back_strides, size_t *stride) const; + std::vector input_shape_; + std::vector axis_; + int reduce_type_{0}; + std::function reduce_func_; }; MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc new file mode 100644 index 0000000000..33525de99c --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc @@ -0,0 +1,150 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h" +#include + +namespace mindspore { +namespace kernel { +namespace { +struct Iterator { + std::vector coordinates_; + std::vector input_shape_a_; + std::vector input_shape_b_; + std::vector output_shape_; + std::vector input_strides_a_; + std::vector input_strides_b_; + int output_dimension_pos_{0}; + size_t pos_{0}; + Iterator(const std::vector &input_shape_a, const std::vector &input_shape_b, + const std::vector &output_shape, const std::vector &input_strides_a, + const std::vector &input_strides_b, size_t pos) + : input_shape_a_(input_shape_a), + input_shape_b_(input_shape_b), + output_shape_(output_shape), + input_strides_a_(input_strides_a), + input_strides_b_(input_strides_b), + pos_{pos} { + output_dimension_pos_ = output_shape.size() - 1; + // Calculate coordinate with pos + coordinates_.resize(output_dimension_pos_ + 1); + int tmp = pos_; + for (int i = output_dimension_pos_; i >= 0 && tmp != 0; --i) { + coordinates_[i] = tmp % output_shape_[i]; + tmp /= output_shape_[i]; + } + } + + void UpdateCoordinates() { + // Calculate output next coordinate + for (int i = output_dimension_pos_; i >= 0; --i) { + if (coordinates_[i] + 1 == output_shape_[i]) { + coordinates_[i] = 0; + } else { + ++coordinates_[i]; + break; + } + } + } + + void GenPoints(std::array *position) { + auto &idx = *position; + idx = {0, 0}; + for (int k = 0; k < output_dimension_pos_; ++k) { + if (input_shape_a_[k] > 1) { + idx[0] += coordinates_[k] * input_strides_a_[k]; + } + if (input_shape_b_[k] > 1) { + idx[1] += coordinates_[k] * input_strides_b_[k]; + } + } + if (input_shape_a_[output_dimension_pos_] > 1) { + idx[0] += coordinates_[output_dimension_pos_]; + } + if (input_shape_b_[output_dimension_pos_] > 1) { + idx[1] += coordinates_[output_dimension_pos_]; + } + } +}; +} // namespace + +void TensorAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + // Init shape ans strides + input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); + output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); +} + +bool TensorAddCPUKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { + auto input_addr_a = reinterpret_cast(inputs[0]->addr); + auto input_addr_b = reinterpret_cast(inputs[1]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + auto output_size = outputs[0]->size / sizeof(float); + if (input_shape_a_ == input_shape_b_) { + NormalProcess(input_addr_a, input_addr_b, output_addr, output_size); + } else { // Broadcast + BroadcastProcess(input_addr_a, input_addr_b, output_addr, output_size); + } + return true; +} + +void TensorAddCPUKernel::NormalProcess(const float *input_a, const float *input_b, float *output, size_t size) { + auto task = [output, input_a, input_b](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + output[i] = input_a[i] + input_b[i]; + } + }; + CPUKernelUtils::ParallelFor(task, size); +} + +void TensorAddCPUKernel::BroadcastProcess(const float *input_a, const float *input_b, float *output, size_t size) { + // Broadcast shape + int dimension = output_shape_.size(); + int input_dimension_a = input_shape_a_.size(); + if (input_dimension_a < dimension) { + input_shape_a_.insert(input_shape_a_.begin(), dimension - input_dimension_a, 1); + } + int input_dimension_b = input_shape_b_.size(); + if (input_dimension_b < dimension) { + input_shape_b_.insert(input_shape_b_.begin(), dimension - input_dimension_b, 1); + } + + // Calculate strides + CalculateStrides(input_shape_a_, &input_strides_a_); + CalculateStrides(input_shape_b_, &input_strides_b_); + + auto task = [this, input_a, input_b, output](size_t start, size_t end) { + Iterator iter(input_shape_a_, input_shape_b_, output_shape_, input_strides_a_, input_strides_b_, start); + std::array position{0}; + for (size_t i = start; i < end; ++i) { + iter.GenPoints(&position); + output[i] = input_a[position[0]] + input_b[position[1]]; + iter.UpdateCoordinates(); + } + }; + CPUKernelUtils::ParallelFor(task, size); +} + +void TensorAddCPUKernel::CalculateStrides(const std::vector &shape, std::vector *strides) { + strides->resize(shape.size(), 1); + for (int i = shape.size() - 2; i >= 0; --i) { + (*strides)[i] = shape[i + 1] * (*strides)[i + 1]; + } +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/tensoradd_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h similarity index 68% rename from mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/tensoradd_cpu_kernel.h rename to mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h index 8c3730a2c7..3d65ea6f67 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/tensoradd_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h @@ -18,11 +18,12 @@ #include #include -#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" +#include "backend/kernel_compiler/cpu/cpu_kernel.h" +#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" namespace mindspore { namespace kernel { -class TensorAddCPUKernel : public MKLCPUKernel { +class TensorAddCPUKernel : public CPUKernel { public: TensorAddCPUKernel() = default; ~TensorAddCPUKernel() override = default; @@ -33,7 +34,15 @@ class TensorAddCPUKernel : public MKLCPUKernel { const std::vector &outputs) override; private: - bool need_swap_{false}; + static void NormalProcess(const float *input_a, const float *input_b, float *output, size_t size); + void BroadcastProcess(const float *input_a, const float *input_b, float *output, size_t size); + static void CalculateStrides(const std::vector &, std::vector *); + std::vector input_shape_a_; + std::vector input_shape_b_; + // Define follow var for Broadcast + std::vector output_shape_; + std::vector input_strides_a_; + std::vector input_strides_b_; }; MS_REG_CPU_KERNEL( diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc index c819dfeecc..8428038392 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc @@ -16,19 +16,22 @@ #include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h" #include +#include +#include #include "runtime/device/cpu/cpu_device_address.h" + namespace mindspore { namespace kernel { -const size_t kMaxDim = 100; +namespace { +const size_t kMaxDim = 10; +} + void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); - shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - std::vector axis_me = AnfAlgo::GetNodeAttr>(kernel_node, "perm"); - (void)std::transform(axis_me.begin(), axis_me.end(), std::back_inserter(axis_), - [](const int64_t &value) { return static_cast(value); }); - if (shape_.size() != axis_.size()) { - MS_LOG(EXCEPTION) << "The size of input shape and transpose axis shape must be equal."; - } + input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); + output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); + axes_ = AnfAlgo::GetNodeAttr>(kernel_node, "perm"); + CheckParameter(); dtype_ = AnfAlgo ::GetPrevNodeOutputDeviceDataType(kernel_node, 0); if (dtype_ == kTypeUnknown) { dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); @@ -53,45 +56,84 @@ void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { } } +bool TransposeCPUFwdKernel::Launch(const std::vector &inputs, + const std::vector & /*workspace*/, + const std::vector &outputs) { + launch_func_(this, inputs, outputs); + return true; +} + +void TransposeCPUFwdKernel::CheckParameter() const { + if (input_shape_.size() > kMaxDim) { + MS_LOG(EXCEPTION) << "Input tensor is " << input_shape_.size() << ", out of bound max dimension 10"; + } + + if (input_shape_.empty()) { + MS_LOG(EXCEPTION) << "Input tensor is empty"; + } + + if (input_shape_.size() != axes_.size()) { + MS_LOG(EXCEPTION) << "Input perm size is not equal with input shape"; + } + + // Input axes include the same axis + std::unordered_set unique_axes{axes_.begin(), axes_.end()}; + if (unique_axes.size() != axes_.size()) { + MS_LOG(EXCEPTION) << "Input perm is illegal, it has the same axis"; + } + + // Input axes not in ture range(input_shape_.size()) + int64_t shape_size = input_shape_.size(); + for (auto &axis : axes_) { + if (axis < 0 || axis >= shape_size) { + MS_LOG(EXCEPTION) << "Input perm axis is out of bound input shape size"; + } + } +} + template void TransposeCPUFwdKernel::LaunchKernel(const std::vector &inputs, const std::vector &outputs) { - auto input = reinterpret_cast(inputs[0]->addr); - auto output = reinterpret_cast(outputs[0]->addr); - size_t size = IntToSize(inputs[0]->size / sizeof(T)); - size_t shape_size = IntToSize(shape_.size()); - if (shape_size > kMaxDim) { - MS_LOG(EXCEPTION) << "Input is " << shape_size << "-D, but transpose supports max " << kMaxDim << "-D inputs."; + int dimension = input_shape_.size(); + // Calculate input tensor strides + std::array input_strides{0}; + input_strides[dimension - 1] = 1; + for (int i = dimension - 2; i >= 0; --i) { + input_strides[i] = input_shape_[i + 1] * input_strides[i + 1]; } - size_t pos_array[kMaxDim]; - size_t size_offset[kMaxDim]; - size_offset[0] = size / shape_[0]; - for (size_t i = 1; i < shape_size; i++) { - size_offset[i] = size_offset[SizeToInt(i) - 1] / shape_[i]; + + // Calculate output strides and back strides + std::array strides{0}; + std::array back_strides{0}; + for (int i = dimension - 1; i >= 0; --i) { + strides[i] = input_strides[axes_[i]]; + back_strides[i] = (output_shape_[i] - 1) * strides[i]; } - for (size_t position = 0; position < size; position += 1) { - size_t temp_position = position; - pos_array[0] = temp_position / size_offset[0]; - for (size_t i = 1; i < shape_size; i++) { - temp_position -= pos_array[SizeToInt(i) - 1] * size_offset[i - 1]; - pos_array[i] = temp_position / size_offset[i]; - } - size_t new_position = pos_array[axis_[SizeToInt(shape_size) - 1]]; - size_t new_position_size = 1; - for (int j = shape_size - 2; j >= 0; j--) { - new_position_size *= shape_[axis_[j + 1]]; - new_position += pos_array[axis_[j]] * new_position_size; + + std::array coordinates{0}; + auto get_next_pos = [&coordinates, &strides, &back_strides, &dimension, this](int curr_pos) { + for (int i = dimension - 1; i >= 0; --i) { + if (coordinates[i] + 1 == output_shape_[i]) { + coordinates[i] = 0; + curr_pos -= back_strides[i]; + } else { + coordinates[i]++; + curr_pos += strides[i]; + break; + } } - output[new_position] = input[position]; - } -} + return curr_pos; + }; -bool TransposeCPUFwdKernel::Launch(const std::vector &inputs, - const std::vector & /*workspace*/, - const std::vector &outputs) { - launch_func_(this, inputs, outputs); - return true; + auto input = reinterpret_cast(inputs[0]->addr); + auto output = reinterpret_cast(outputs[0]->addr); + size_t size = IntToSize(inputs[0]->size / sizeof(T)); + output[0] = input[0]; + int pos = 0; + for (size_t i = 1; i < size; ++i) { + pos = get_next_pos(pos); + output[i] = input[pos]; + } } - } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h index 198e8bb4c4..6656db53c5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h @@ -33,12 +33,14 @@ class TransposeCPUFwdKernel : public CPUKernel { bool Launch(const std::vector &inputs, const std::vector &workspace, const std::vector &outputs) override; + private: + void CheckParameter() const; template void LaunchKernel(const std::vector &inputs, const std::vector &outputs); - private: - std::vector shape_; - std::vector axis_; + std::vector input_shape_; + std::vector output_shape_; + std::vector axes_; TypeId dtype_{kTypeUnknown}; using TypeKernel = std::function &, const std::vector &)>;