Merge branch 'master' of https://gitee.com/mindspore/mindspore into export

pull/11135/head
yoni 4 years ago
commit 53ace98343

@@ -78,6 +78,7 @@ IF NOT EXIST "%BUILD_PATH%/mindspore" (
cd %BUILD_PATH%/mindspore
IF "%1%" == "lite" (
(git log -1 | findstr "^commit") > %BUILD_PATH%\.commit_id
cmake -DPLATFORM_ARM64=off -DSUPPORT_TRAIN=off ^
-DENABLE_TOOLS=on -DENABLE_CONVERTER=on -DBUILD_TESTCASES=off ^
-DCMAKE_BUILD_TYPE=Release -DSUPPORT_GPU=off -DBUILD_MINDDATA=off -DOFFLINE_COMPILE=off ^

@@ -510,6 +510,11 @@ get_version() {
VERSION_STR=${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_REVISION}
}
write_commit_file() {
COMMIT_STR=$(git log -1 | grep commit)
echo ${COMMIT_STR} > "${BASEPATH}/mindspore/lite/build/.commit_id"
}
build_lite()
{
get_version
@@ -542,6 +547,7 @@ build_lite()
fi
mkdir -pv build
cd build
write_commit_file
BUILD_TYPE="Release"
if [[ "${DEBUG_MODE}" == "on" ]]; then
BUILD_TYPE="Debug"

File diff suppressed because it is too large

@@ -76,6 +76,16 @@ void Reciprocal(const T *in, T *out, size_t start, size_t end) {
out[i] = static_cast<T>(1.0 / in[i]);
}
}
template <typename T>
void Gelu(const T *in, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T x = in[i];
auto double_x = static_cast<double>(x);
T tanh_res = (T)std::tanh(0.7978845608 * (double_x + 0.044715 * double_x * double_x * double_x));
out[i] = x * ((T)1.0 + tanh_res) / (T)2.0;
}
}
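For reference, Gelu above encodes the tanh approximation of GELU, where the constant 0.7978845608 is sqrt(2/pi):

$$\mathrm{GELU}(x) \approx \frac{x}{2}\left(1 + \tanh\left(\sqrt{2/\pi}\,(x + 0.044715\,x^{3})\right)\right)$$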
} // namespace
void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@@ -95,6 +105,8 @@ void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) {
operate_type_ = FLOOR;
} else if (kernel_name == prim::kPrimReciprocal->name()) {
operate_type_ = RECIPROCAL;
} else if (kernel_name == prim::kPrimGelu->name()) {
operate_type_ = GELU;
}
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
}
@@ -150,6 +162,8 @@ void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs
threads.emplace_back(std::thread(Floor<T>, input, output, start, end));
} else if (operate_type_ == RECIPROCAL) {
threads.emplace_back(std::thread(Reciprocal<T>, input, output, start, end));
} else if (operate_type_ == GELU) {
threads.emplace_back(std::thread(Gelu<T>, input, output, start, end));
}
start += once_compute_size;
}
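The LaunchKernel hunk above uses the file's usual dispatch pattern: the flat element range is cut into once_compute_size chunks, and each [start, end) slice runs on its own thread. A minimal self-contained sketch of that chunking, assuming a free function with the same signature as Gelu<T> above (ParallelFor and its parameter names are illustrative, not the MindSpore API):

#include <algorithm>
#include <cstddef>
#include <thread>
#include <vector>

// Split [0, lens) into ceil(lens / thread_num)-sized chunks and run fn on
// each half-open [start, end) slice in its own thread, mirroring how
// LaunchKernel hands Gelu<T> its start/end bounds.
template <typename T, typename Fn>
void ParallelFor(const T *in, T *out, size_t lens, size_t max_threads, Fn fn) {
  if (lens == 0) {
    return;
  }
  size_t thread_num = std::min(lens, std::max<size_t>(1, max_threads));
  size_t once_compute_size = (lens + thread_num - 1) / thread_num;
  std::vector<std::thread> threads;
  for (size_t start = 0; start < lens; start += once_compute_size) {
    size_t end = std::min(start + once_compute_size, lens);
    threads.emplace_back(fn, in, out, start, end);
  }
  for (auto &t : threads) {
    t.join();
  }
}

Unlike this sketch, which joins before returning, the kernel collects its std::threads in a vector and joins them after the dispatch loop.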

@@ -62,6 +62,8 @@ MS_REG_CPU_KERNEL(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutput
ArithmeticSelfCPUKernel);
MS_REG_CPU_KERNEL(Reciprocal, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ArithmeticSelfCPUKernel);
MS_REG_CPU_KERNEL(Gelu, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ArithmeticSelfCPUKernel);
} // namespace kernel
} // namespace mindspore

@@ -89,6 +89,8 @@ enum OperateType {
GREATER,
GREATEREQUAL,
RECIPROCAL,
GELU,
GELUGRAD,
};
class CPUKernel : public kernel::KernelMod {

@@ -78,6 +78,18 @@ void EltWiseGradCPUKernel::TanhGrad(const T *input1, const T *input2, T *out, si
}
}
template <typename T>
void EltWiseGradCPUKernel::GeluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T x = input2[i];
auto double_x = static_cast<double>(x);
T tanh_res = (T)std::tanh(0.7978845608 * (double_x + 0.044715 * double_x * double_x * double_x));
T mul_right = (T)(0.7978845608 + 0.1070322244 * double_x * double_x);
T y_res = (((T)1.0 + tanh_res) + x * ((T)1.0 - tanh_res * tanh_res) * mul_right) / (T)2.0;
out[i] = input1[i] * y_res;
}
}
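The constants in GeluGrad follow from differentiating the tanh approximation used by the forward Gelu kernel; with u = sqrt(2/pi) * (x + 0.044715 x^3) and t = tanh(u):

$$\frac{d}{dx}\,\mathrm{GELU}(x) \approx \frac{(1 + t) + x\,(1 - t^{2})\,u'}{2}, \qquad u' = 0.7978845608 + 0.1070322244\,x^{2}$$

where 0.1070322244 = 3 * 0.044715 * 0.7978845608 (mul_right in the loop), and the result is scaled by the incoming gradient input1[i].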
void EltWiseGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
@@ -93,6 +105,8 @@ void EltWiseGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
operate_type_ = TANHGRAD;
} else if (kernel_name == "SqrtGrad") {
operate_type_ = SQRTGRAD;
} else if (kernel_name == "GeluGrad") {
operate_type_ = GELUGRAD;
} else {
MS_LOG(EXCEPTION) << "Not support " << kernel_name;
}
@@ -172,6 +186,8 @@ void EltWiseGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, c
threads.emplace_back(std::thread(&EltWiseGradCPUKernel::TanhGrad<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == SQRTGRAD) {
threads.emplace_back(std::thread(&EltWiseGradCPUKernel::SqrtGrad<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == GELUGRAD) {
threads.emplace_back(std::thread(&EltWiseGradCPUKernel::GeluGrad<T>, this, input1, input2, output, start, end));
} else {
MS_LOG(EXCEPTION) << "Not support " << operate_type_;
}

@@ -47,6 +47,8 @@ class EltWiseGradCPUKernel : public CPUKernel {
void SqrtGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void TanhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void GeluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
std::vector<size_t> input_shape0_;
std::vector<size_t> input_shape1_;
std::vector<size_t> input_element_num0_;
@@ -81,6 +83,13 @@ MS_REG_CPU_KERNEL(
TanhGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel);
MS_REG_CPU_KERNEL(GeluGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel);
} // namespace kernel
} // namespace mindspore
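For a quick numeric check of the two kernels registered above, a minimal standalone sketch (hypothetical test code, not part of this commit) that replays the same arithmetic and verifies two closed-form values, gelu(0) = 0 and gelu'(0) = 0.5:

#include <cassert>
#include <cmath>
#include <cstdio>

// Same tanh-based GELU approximation as the Gelu kernel above.
float GeluApprox(float x) {
  double t = std::tanh(0.7978845608 * (x + 0.044715 * x * x * x));
  return static_cast<float>(x * (1.0 + t) / 2.0);
}

// Same expression as the GeluGrad kernel: dy scaled by d(gelu)/dx.
float GeluGradApprox(float dy, float x) {
  double t = std::tanh(0.7978845608 * (x + 0.044715 * x * x * x));
  double mul_right = 0.7978845608 + 0.1070322244 * x * x;
  return static_cast<float>(dy * (((1.0 + t) + x * (1.0 - t * t) * mul_right) / 2.0));
}

int main() {
  assert(std::fabs(GeluApprox(0.0f)) < 1e-6f);                   // gelu(0) = 0
  assert(std::fabs(GeluGradApprox(1.0f, 0.0f) - 0.5f) < 1e-6f);  // gelu'(0) = 0.5
  std::printf("gelu(1.0) = %f\n", GeluApprox(1.0f));             // ~0.8412
  return 0;
}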

@@ -0,0 +1,105 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/layer_norm_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
void LayerNormCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto begin_norm_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_norm_axis");
auto begin_params_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_params_axis");
if (begin_norm_axis < 0) {
begin_norm_axis += x_shape.size();
}
if (begin_params_axis < 0) {
begin_params_axis += x_shape.size();
}
for (size_t i = 0; i < IntToSize(begin_norm_axis); i++) {
block_num_ *= x_shape[i];
}
for (size_t i = IntToSize(begin_norm_axis); i < x_shape.size(); i++) {
block_size_ *= x_shape[i];
}
for (size_t i = IntToSize(begin_params_axis); i < x_shape.size(); i++) {
param_num_ *= x_shape[i];
}
if (block_num_ == 0 || block_size_ == 0) {
MS_LOG(EXCEPTION) << "LayerNormCPUKernel input shape error, input shape: " << x_shape;
}
}
bool LayerNormCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat64) {
LaunchKernel<float>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << "input dtype only support float16, float32, float64";
}
return true;
}
template <typename T>
void LayerNormCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
size_t f_size = sizeof(T);
if (inputs[1]->size != f_size * param_num_ || inputs[2]->size != f_size * param_num_) {
MS_LOG(EXCEPTION) << "The product of gamma and beta's shape must be " << param_num_;
}
if (outputs[1]->size != f_size * block_num_ || outputs[2]->size != f_size * block_num_) {
MS_LOG(EXCEPTION) << "The product of mean and var's shape must be " << block_num_;
}
auto x = reinterpret_cast<T *>(inputs[0]->addr);
auto gamma = reinterpret_cast<T *>(inputs[1]->addr);
auto beta = reinterpret_cast<T *>(inputs[2]->addr);
auto y = reinterpret_cast<T *>(outputs[0]->addr);
auto mean = reinterpret_cast<T *>(outputs[1]->addr);
auto var = reinterpret_cast<T *>(outputs[2]->addr);
for (size_t i = 0; i < block_num_; ++i) {
T sum = (T)0.0;
T square_sum = (T)0.0;
for (size_t j = i * block_size_; j < (i + 1) * block_size_; ++j) {
sum += x[j];
square_sum += x[j] * x[j];
}
T block_mean = sum / block_size_;
T block_var = square_sum / block_size_ - block_mean * block_mean;
for (size_t j = i * block_size_; j < (i + 1) * block_size_; ++j) {
auto param_shift = j % param_num_;
y[j] = (x[j] - block_mean) / (T)std::sqrt(static_cast<double>(block_var) + eps_) * gamma[param_shift] +
beta[param_shift];
}
mean[i] = block_mean;
var[i] = block_var;
}
}
void LayerNormCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "LayerNormCPUKernel needs 3 inputs, but gets " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 3) {
MS_LOG(EXCEPTION) << "LayerNormCPUKernel expects 3 output, but gets" << output_num;
}
}
} // namespace kernel
} // namespace mindspore
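For reference, LaunchKernel above normalizes each of the block_num_ contiguous blocks of block_size_ elements in a single pass, computing the variance as E[x^2] - E[x]^2; with S = block_size_ and P = param_num_, each block i produces

$$\mu_i = \frac{1}{S}\sum_{j \in \text{block } i} x_j, \qquad \sigma_i^2 = \frac{1}{S}\sum_{j \in \text{block } i} x_j^2 - \mu_i^2, \qquad y_j = \frac{x_j - \mu_i}{\sqrt{\sigma_i^2 + \epsilon}}\,\gamma_{j \bmod P} + \beta_{j \bmod P}$$

with eps_ defaulting to 1e-12. As a concrete example, an input of shape (2, 3, 4) with begin_norm_axis = 1 and begin_params_axis = 2 gives block_num_ = 2, block_size_ = 12, and param_num_ = 4.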

@@ -0,0 +1,70 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_CPU_KERNEL_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
class LayerNormCPUKernel : public CPUKernel {
public:
LayerNormCPUKernel() = default;
~LayerNormCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
private:
void CheckParam(const CNodePtr &kernel_node);
TypeId dtype_{kTypeUnknown};
float eps_{1e-12};
size_t block_num_{1};
size_t block_size_{1};
size_t param_num_{1};
};
MS_REG_CPU_KERNEL(LayerNorm,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
LayerNormCPUKernel);
MS_REG_CPU_KERNEL(LayerNorm,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
LayerNormCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_CPU_KERNEL_H_

@@ -0,0 +1,124 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
void LayerNormGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto begin_norm_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_norm_axis");
auto begin_params_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_params_axis");
if (begin_norm_axis < 0) {
begin_norm_axis += x_shape.size();
}
if (begin_params_axis < 0) {
begin_params_axis += x_shape.size();
}
for (size_t i = 0; i < IntToSize(begin_norm_axis); i++) {
block_num_ *= x_shape[i];
}
for (size_t i = IntToSize(begin_norm_axis); i < x_shape.size(); i++) {
block_size_ *= x_shape[i];
}
for (size_t i = 0; i < IntToSize(begin_params_axis); i++) {
param_size_ *= x_shape[i];
}
for (size_t i = IntToSize(begin_params_axis); i < x_shape.size(); i++) {
param_num_ *= x_shape[i];
}
if (block_num_ == 0 || block_size_ == 0) {
MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel input shape error, input shape: " << x_shape;
}
}
bool LayerNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, workspace, outputs);
} else if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat64) {
LaunchKernel<float>(inputs, workspace, outputs);
} else {
MS_LOG(EXCEPTION) << "input dtype only support float16, float32, float64";
}
return true;
}
template <typename T>
void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
auto x = reinterpret_cast<T *>(inputs[0]->addr);
auto dy = reinterpret_cast<T *>(inputs[1]->addr);
auto var = reinterpret_cast<T *>(inputs[2]->addr);
auto mean = reinterpret_cast<T *>(inputs[3]->addr);
auto gamma = reinterpret_cast<T *>(inputs[4]->addr);
auto dx = reinterpret_cast<T *>(outputs[0]->addr);
auto dg = reinterpret_cast<T *>(outputs[1]->addr);
auto db = reinterpret_cast<T *>(outputs[2]->addr);
for (size_t i = 0; i < param_num_; ++i) {
T dgamma = (T)0.0;
T dbeta = (T)0.0;
for (size_t j = i; j < param_size_ * param_num_; j += param_num_) {
auto norm_shift = static_cast<int>(j / block_size_);
dgamma += dy[j] * (T)std::pow(static_cast<double>(var[norm_shift]) + eps_, -0.5) * (x[j] - mean[norm_shift]);
dbeta += dy[j];
}
dg[i] = dgamma;
db[i] = dbeta;
}
for (size_t i = 0; i < block_num_; ++i) {
T sum1 = (T)0.0;
T sum2 = (T)0.0;
T sum3 = (T)0.0;
for (size_t j = i * block_size_; j < (i + 1) * block_size_; ++j) {
auto param_shift = j % param_num_;
auto norm_shift = static_cast<int>(j / block_size_);
auto dxm = x[j] - mean[norm_shift];
auto dyg = dy[j] * gamma[param_shift];
sum1 += (T)(-0.5) * dyg * dxm * (T)std::pow(static_cast<double>(var[norm_shift]) + eps_, -1.5);
sum2 += dyg;
sum3 += (T)(-2.0) * dxm;
}
for (size_t j = i * block_size_; j < (i + 1) * block_size_; ++j) {
auto param_shift = j % param_num_;
auto norm_shift = static_cast<int>(j / block_size_);
auto var_sqrt = (T)std::pow(static_cast<double>(var[norm_shift]) + eps_, -0.5);
auto dx1 = dy[j] * gamma[param_shift] * var_sqrt;
auto dx2 = sum1 * (T)2.0 / block_size_ * (x[j] - mean[norm_shift]);
auto dx3 = ((T)(-1.0) * var_sqrt * sum2 + ((T)1.0 / block_size_) * sum1 * sum3) * ((T)1.0 / block_size_);
dx[j] = dx1 + dx2 + dx3;
}
}
}
void LayerNormGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 5) {
MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel needs 5 inputs, but gets " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 3) {
MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel expects 3 output, but gets" << output_num;
}
}
} // namespace kernel
} // namespace mindspore
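The three running sums in LaunchKernel above are the standard layer-norm backward terms; writing v = sigma_i^2 + eps for block i and gamma_j for gamma[j % param_num_], the first inner loop accumulates

$$s_1 = \sum_j -\tfrac{1}{2}\,dy_j\,\gamma_j\,(x_j - \mu_i)\,v^{-3/2}, \qquad s_2 = \sum_j dy_j\,\gamma_j, \qquad s_3 = \sum_j -2\,(x_j - \mu_i)$$

and the second combines them (S = block_size_) as

$$dx_j = dy_j\,\gamma_j\,v^{-1/2} + \frac{2 s_1}{S}(x_j - \mu_i) + \frac{1}{S}\left(-v^{-1/2} s_2 + \frac{s_1 s_3}{S}\right)$$

Here s_1 is the gradient with respect to the block variance and the final term routes the gradient through the block mean, while the earlier loop reduces dg[p] = sum dy_j (x_j - mu) v^{-1/2} and db[p] = sum dy_j over the param_size_ blocks sharing parameter index p.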

@@ -0,0 +1,76 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_GRAD_CPU_KERNEL_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
class LayerNormGradCPUKernel : public CPUKernel {
public:
LayerNormGradCPUKernel() = default;
~LayerNormGradCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs);
private:
void CheckParam(const CNodePtr &kernel_node);
TypeId dtype_{kTypeUnknown};
float eps_{1e-12};
size_t block_num_{1};
size_t block_size_{1};
size_t param_num_{1};
size_t param_size_{1};
};
MS_REG_CPU_KERNEL(LayerNormGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
LayerNormGradCPUKernel);
MS_REG_CPU_KERNEL(LayerNormGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
LayerNormGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_GRAD_CPU_KERNEL_H_

@@ -53,6 +53,9 @@ void BnupdateEltwiseEltwiseFusionPass::MatchBnupdateAddRelu(const CNodePtr &cnod
auto add = relu_input->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(add);
auto tuple_getitem = add->input(1);
std::vector<int64_t> add_output_used_num;
add_output_used_num.emplace_back(SizeToLong(manager->node_users()[add].size()));
AnfAlgo::SetNodeAttr(kAttrOutputUsedNum, MakeValue(add_output_used_num), add);
MS_EXCEPTION_IF_NULL(tuple_getitem);
if (tuple_getitem->isa<CNode>() && AnfAlgo::GetCNodeName(tuple_getitem) == prim::kPrimTupleGetItem->name()) {
auto getitem = tuple_getitem->cast<CNodePtr>();

@@ -141,15 +141,6 @@ class InlinerBase : public AnfVisitor {
}
if (IsUniqueUse(nullptr, fg, nullptr)) {
// The other branch calling the last after block.
if (fg->has_flag(FUNC_GRAPH_FLAG_AFTER_BLOCK)) {
// Check if parameters' changed.
auto param_simplified_caller = SimplifyAfterParameter(fg, node, args);
if (param_simplified_caller != nullptr) {
return param_simplified_caller;
}
}
// For the single used fg, including non-after and after not matched above,
// we move the whole fg nodes.
if (use_move_) {
@@ -160,6 +151,15 @@ class InlinerBase : public AnfVisitor {
mng->MoveAllCNodeDropGraph(fg, node->func_graph(), inputs[0]->scope());
return out_node;
}
// The other branch calling the last after block.
if (fg->has_flag(FUNC_GRAPH_FLAG_AFTER_BLOCK)) {
// Check if parameters' changed.
auto param_simplified_caller = SimplifyAfterParameter(fg, node, args);
if (param_simplified_caller != nullptr) {
return param_simplified_caller;
}
}
} else {
// We don't expand the middle multiple used after block, except the last one.
if (GraphHasBranch(fg)) {

@@ -298,6 +298,49 @@ def _generate_pip_args(obj, *args, method="construct"):
return args_names, args_list
def _get_auto_split_param_names(parameter_layout_dict):
auto_split_params = {}
for key, value in parameter_layout_dict.items():
for dim in value[1]:
if dim != -1:
auto_split_params[key] = value
break
auto_split_param_names = [param_name for param_name in auto_split_params]  # a list, not a generator: it is reused for truthiness and membership checks
return auto_split_param_names
def _build_broadcast_graph(broadcast_params_dict, broadcast_phase):
"""Build broadcast graph."""
from mindspore.nn.wrap.cell_wrapper import _BroadCastCell
if not broadcast_params_dict:
broadcast_params_dict = {}
broadcast_params = []
for param in broadcast_params_dict.values():
broadcast_params.append(Tensor(param.asnumpy()))
_broadcast_net = _BroadCastCell(broadcast_params)
_broadcast_net.phase = broadcast_phase
broadcasted_params = _broadcast_net()
for param_name, param in zip(broadcast_params_dict.keys(), broadcasted_params):
broadcast_params_dict[param_name].set_data(param)
def _parameter_broadcast(obj, auto_parallel_mode):
"""Parameter broadcast."""
auto_split_param_names = []
if auto_parallel_mode:
auto_split_param_names = _get_auto_split_param_names(obj.parameter_layout_dict)
broadcast_params_dict = obj.parameters_broadcast_dict()
if auto_split_param_names and broadcast_params_dict:
broadcast_params_dict = OrderedDict()
for param_name, param in obj.parameters_broadcast_dict().items():
if param_name not in auto_split_param_names:
broadcast_params_dict[param_name] = param
broadcast_phase = "_broadcast_subgraph"
_build_broadcast_graph(broadcast_params_dict, broadcast_phase)
class _PynativeExecutor:
"""
A pynative executor used to compile/manage/run graphs.
@@ -339,6 +382,10 @@ class _PynativeExecutor:
def leave_construct(self, cell):
self._executor.leave_construct(cell)
def parameter_broadcast(self, obj, phase, auto_parallel_mode):
if BROADCAST_PHASE not in phase and _get_parameter_broadcast():
_parameter_broadcast(obj, auto_parallel_mode)
def __call__(self, obj, *args, **kwargs):
args = args + tuple(kwargs.values())
return self._executor(obj, args, "")
@@ -391,31 +438,6 @@ class _Executor:
def _build_data_graph(self, obj, phase):
self._executor.build_data_graph(obj.parameters_dict(), phase, obj.parameters_broadcast_dict())
def _get_auto_split_param_names(self, parameter_layout_dict):
auto_split_params = {}
for key, value in parameter_layout_dict.items():
for dim in value[1]:
if dim != -1:
auto_split_params[key] = value
break
auto_split_param_names = (param_name for param_name in auto_split_params)
return auto_split_param_names
def _build_broadcast_graph(self, broadcast_params_dict, broadcast_phase):
"""Build broadcast graph."""
from mindspore.nn.wrap.cell_wrapper import _BroadCastCell
if not broadcast_params_dict:
broadcast_params_dict = {}
broadcast_params = []
for param in broadcast_params_dict.values():
broadcast_params.append(Tensor(param.asnumpy()))
_broadcast_net = _BroadCastCell(broadcast_params)
_broadcast_net.phase = broadcast_phase
broadcasted_params = _broadcast_net()
for param_name, param in zip(broadcast_params_dict.keys(), broadcasted_params):
broadcast_params_dict[param_name].set_data(param)
def _set_dataset_mode(self, args_list):
"""set dataset mode."""
# decide whether to sink based on whether the inputs are virtual or args_list is ()
@@ -500,18 +522,7 @@ class _Executor:
elif not enable_ge and "export" in phase:
self._build_data_graph(obj, phase)
elif BROADCAST_PHASE not in phase and _get_parameter_broadcast():
auto_split_param_names = []
if auto_parallel_mode:
auto_split_param_names = self._get_auto_split_param_names(obj.parameter_layout_dict)
broadcast_params_dict = obj.parameters_broadcast_dict()
if auto_split_param_names and broadcast_params_dict:
broadcast_params_dict = OrderedDict()
for param_name, param in obj.parameters_broadcast_dict().items():
if param_name not in auto_split_param_names:
broadcast_params_dict[param_name] = param
broadcast_phase = "_broadcast_subgraph"
self._build_broadcast_graph(broadcast_params_dict, broadcast_phase)
_parameter_broadcast(obj, auto_parallel_mode)
return phase, True

@@ -114,7 +114,7 @@ std::vector<int> NetRunner::FillInputData(const std::vector<DataLabelTuple> &dat
int label = 0;
char *data = nullptr;
std::tie(data, label) = dataset[idx];
std::copy(data, data + data_size, input_data + i * data_size);
std::copy(data, data + data_size_, input_data + i * data_size_);
labels[i * num_of_classes_ + label] = 1.0; // Model expects labels in onehot representation
labels_vec.push_back(label);
}

@@ -36,13 +36,9 @@ class ConcatFp16CPUKernel : public LiteKernel {
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
concat_param_ = reinterpret_cast<ConcatParameter *>(op_parameter_);
}
~ConcatFp16CPUKernel() = default;
int Init() override;
int ReSize() override;
int Run() override;
private:

@@ -207,18 +207,12 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) {
}
int Convolution1x1FP16CPUKernel::Run() {
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get executor tensor failed.";
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return ret;
}
ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
pack_input_ = reinterpret_cast<float16_t *>(
ctx_->allocator->Malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t)));
if (pack_input_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
@@ -232,6 +226,7 @@ int Convolution1x1FP16CPUKernel::Run() {
input_ptr_ = batch_in;
}
int ret = RET_ERROR;
if (multi_thread_by_hw_) {
ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_);
} else {
@@ -240,16 +235,12 @@ int Convolution1x1FP16CPUKernel::Run() {
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "ParallelLaunch failed.";
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
ctx_->allocator->Free(pack_input_);
pack_input_ = nullptr;
return ret;
}
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
ctx_->allocator->Free(pack_input_);
pack_input_ = nullptr;
return RET_OK;

@@ -33,19 +33,10 @@ ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() {
}
int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
// ===================input====================//
auto input_tensor = in_tensors_.at(kInputIndex);
in_data_type_ = input_tensor->data_type();
MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);
execute_input_ = ConvertInputFp32toFp16(input_tensor, context_);
// ==================output====================//
auto out_tensor = out_tensors_.at(kOutputIndex);
out_data_type_ = out_tensor->data_type();
MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);
execute_output_ = MallocOutputFp16(out_tensor, context_);
auto input_tensor = in_tensors_.at(0);
auto output_tensor = out_tensors_.at(0);
execute_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
execute_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
return RET_OK;
}
@@ -78,25 +69,4 @@ int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() {
}
return RET_OK;
}
void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
if (out_data_type_ == kNumberTypeFloat32) {
auto out_tensor = out_tensors_.at(kOutputIndex);
auto out_ele_num = out_tensor->ElementsNum();
auto output_addr = reinterpret_cast<float *>(out_tensor->MutableData());
Float16ToFloat32(execute_output_, output_addr, out_ele_num);
}
}
void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() {
if (in_data_type_ == kNumberTypeFloat32) {
context_->allocator->Free(execute_input_);
execute_input_ = nullptr;
}
if (out_data_type_ == kNumberTypeFloat32) {
context_->allocator->Free(execute_output_);
execute_output_ = nullptr;
}
}
} // namespace mindspore::kernel

@@ -38,16 +38,12 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
int RunImpl(int task_id) { return mindspore::lite::RET_OK; }
virtual int GetExecuteTensor();
virtual int GetExecuteFilter();
virtual void IfCastOutput();
void FreeTmpBuffer();
protected:
float16_t *fp16_weight_ = nullptr;
float16_t *execute_input_ = nullptr;
float16_t *execute_weight_ = nullptr;
float16_t *execute_output_ = nullptr;
TypeId in_data_type_;
TypeId out_data_type_;
};
} // namespace mindspore::kernel

@@ -114,19 +114,13 @@ static int ConvDwFp16Run(void *cdata, int task_id) {
}
int ConvolutionDepthwiseFp16CPUKernel::Run() {
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
return ret;
}
ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return ret;
}

@@ -149,13 +149,8 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
return ret;
}
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
FreePackedInputOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return ret;
}
ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (need_align_) {
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
@@ -172,8 +167,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
FreePackedInputOutput();
return ret;
}

@@ -128,17 +128,11 @@ static int ConvolutionFp16Impl(void *cdata, int task_id) {
}
int ConvolutionFP16CPUKernel::Run() {
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return ret;
}
ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
ret = InitTmpBuffer();
auto ret = InitTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
FreeTmpBuffer();
return RET_ERROR;
}
@@ -147,8 +141,7 @@ int ConvolutionFP16CPUKernel::Run() {
if (ret != RET_OK) {
MS_LOG(ERROR) << "conv fp16 error ret[" << ret << "]";
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
FreeTmpBuffer();
return ret;
}

@@ -195,17 +195,11 @@ static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) {
}
int ConvolutionWinogradFP16CPUKernel::Run() {
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return ret;
}
ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
ret = InitTmpBuffer();
auto ret = InitTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
FreeTmpBuffer();
return RET_ERROR;
}
@@ -215,8 +209,6 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]";
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
FreeTmpBuffer();
return ret;
}

@@ -162,13 +162,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR;
}
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
FreePackedInputOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return ret;
}
ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (need_align_) {
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
@@ -189,8 +184,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
FreePackedInputOutput();
return ret;
}

Some files were not shown because too many files have changed in this diff
