parent
b5393e6628
commit
0ce8708dee
@ -0,0 +1,56 @@
[View the Chinese version](./README_CN.md)

## What Is MindSpore Lite

MindSpore Lite is a high-performance, lightweight, open-source inference framework that meets the needs of AI applications on mobile devices. MindSpore Lite focuses on deploying AI technology on devices more effectively. It has been integrated into HMS (Huawei Mobile Services) to provide inference for applications such as image classification, object detection, and OCR. MindSpore Lite will promote the development and enrichment of the AI software/hardware application ecosystem.

<img src="../../docs/MindSpore-Lite-architecture.png" alt="MindSpore Lite Architecture" width="600"/>

For more details please check out our [MindSpore Lite Architecture Guide](https://www.mindspore.cn/lite/docs/en/master/architecture.html).

### MindSpore Lite features

1. Cooperative work with MindSpore training

    - Provides training, optimization, and deployment.
    - The unified IR enables integrated device-cloud AI applications.

2. Lightweight

    - Provides model compression, which reduces model size and can improve performance as well.
    - Provides the ultra-lightweight inference solution MindSpore Micro to meet deployment requirements in extreme environments such as smart watches and headphones.

3. High-performance

    - The built-in high-performance kernel computing library NNACL supports multiple convolution optimization algorithms, such as sliding window, im2col+GEMM, and Winograd (an im2col sketch follows this list).
    - Hand-written assembly code improves the performance of kernel operators. Supports CPU, GPU, and NPU.

4. Versatility

    - Supports iOS and Android.
    - Supports LiteOS.
    - Supports mobile devices, smart screens, tablets, and IoT devices.
    - Supports third-party models such as TensorFlow Lite, Caffe, and ONNX.
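
To make the im2col+GEMM mention above concrete, here is a minimal sketch of the im2col transform. It is illustrative only, not NNACL's actual code; the function name, the NCHW layout, stride 1, and zero padding are assumptions chosen for brevity.

```cpp
#include <cstddef>
#include <vector>

// Rearranges one image (NCHW, N = 1) so that each output pixel's receptive
// field becomes one row of a matrix; convolution then reduces to a GEMM with
// the [out_channels, channels * k_h * k_w] weight matrix.
std::vector<float> Im2Col(const float *input, int channels, int height,
                          int width, int k_h, int k_w) {
  const int out_h = height - k_h + 1;  // stride 1, no padding (assumed)
  const int out_w = width - k_w + 1;
  std::vector<float> cols(static_cast<size_t>(out_h) * out_w * channels * k_h * k_w);
  size_t idx = 0;
  for (int oh = 0; oh < out_h; ++oh) {
    for (int ow = 0; ow < out_w; ++ow) {
      for (int c = 0; c < channels; ++c) {
        for (int kh = 0; kh < k_h; ++kh) {
          for (int kw = 0; kw < k_w; ++kw) {
            cols[idx++] = input[(c * height + oh + kh) * width + (ow + kw)];
          }
        }
      }
    }
  }
  return cols;
}
```

Laying the patches out as rows trades extra memory for one large, cache-friendly matrix multiplication, which is why the transform pairs well with a tuned GEMM kernel.
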
## MindSpore Lite AI deployment procedure

1. Model selection and personalized training

    Select a new model or use an existing model for incremental training with labeled data. When designing a model for mobile devices, it is necessary to consider the model size, accuracy, and computational cost.

    The MindSpore team provides a series of pre-trained models for image classification and object detection. You can use these pre-trained models in your application.

    The pre-trained models provided by MindSpore include: [Image Classification](https://download.mindspore.cn/model_zoo/official/lite/) and [Object Detection](https://download.mindspore.cn/model_zoo/official/lite/). More models will be provided in the future.

    MindSpore allows you to retrain pre-trained models to perform other tasks. For example, a pre-trained image classification model can be retrained to recognize new image categories. See [Retraining](https://www.mindspore.cn/lite/tutorial/zh-CN/master/advanced_use/retraining_of_quantized_network.html).

2. Model converter and optimization

    If you use MindSpore or a third-party model, use the [MindSpore Lite Model Converter Tool](https://www.mindspore.cn/lite/tutorial/zh-CN/master/use/converter_tool.html) to convert it into the MindSpore Lite format. The tool converts TensorFlow Lite, Caffe, and ONNX models into MindSpore Lite models, and operator fusion and quantization can be applied during conversion.
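
    As an illustration, converting a TensorFlow Lite model typically looks like `./converter_lite --fmk=TFLITE --modelFile=model.tflite --outputFile=model`; the flag spellings here follow the converter documentation linked above and may vary between releases.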

    MindSpore also provides a tool for converting models that run on IoT devices.

3. Model deployment

    This stage mainly covers model deployment, including model management, deployment itself, and operations and maintenance monitoring.

4. Inference

    Load the model and perform inference. [Inference](https://www.mindspore.cn/lite/tutorial/zh-CN/master/use/runtime.html) is the process of running input data through the model to get output.
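
    As a rough illustration of this step, the sketch below follows the MindSpore Lite r1.x C++ `LiteSession` API (model import, session creation, graph compilation, execution); exact headers and signatures may differ in other releases, and error handling is omitted.

    ```cpp
    #include <cstddef>
    #include <cstring>

    #include "include/context.h"
    #include "include/lite_session.h"
    #include "include/model.h"

    // model_buf holds the bytes of a converted .ms model file; size is its length.
    void RunInference(const char *model_buf, size_t size) {
      // Parse the serialized model.
      auto model = mindspore::lite::Model::Import(model_buf, size);
      // Create an inference session with a default (CPU) context.
      mindspore::lite::Context context;
      auto session = mindspore::session::LiteSession::CreateSession(&context);
      // Compile the graph: shape inference and kernel selection happen here.
      session->CompileGraph(model);
      // Fill the input tensors; zeroed here as a placeholder for real data.
      for (auto *tensor : session->GetInputs()) {
        memset(tensor->MutableData(), 0, tensor->Size());
      }
      // Execute the graph, then read the outputs for post-processing.
      session->RunGraph();
      auto outputs = session->GetOutputs();
      (void)outputs;
      delete session;
      delete model;
    }
    ```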

    MindSpore provides a series of pre-trained models that can be deployed on mobile devices ([example](#TODO)).

@ -0,0 +1,169 @@

#ifdef __aarch64__

.text
.align 5
.global ConvDwInt8PostAlign4
#ifndef __APPLE__
.type ConvDwInt8PostAlign4, %function
#endif

// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
//                           int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
// x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max
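//
// Per lane, this routine applies the standard fixed-point requantization used
// by int8 depthwise convolution. A rough C-style summary (illustrative, with
// hypothetical helper names, not code from this file):
//   v = sqrdmulh(sat_shift_left(x, left_shift), out_multiplier)
//   v = rounding_shift_right(v, right_shift)   // right_shift arrives as a negative srshl count
//   v = clamp(v + output_zp, acc_min, acc_max)
//   dst[i] = (int8_t)v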

ConvDwInt8PostAlign4:
    // Registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should also be preserved,
    // whereas our coding style does not permit such an amount of parameters.
    ldr x8, [sp]    // the 9th argument, acc_max, is passed on the stack (AAPCS64 passes at most 8 in registers)

    dup v26.4s, w5    // left_shift
    dup v27.4s, w4    // out_multiplier
    dup v28.4s, w6    // right_shift (a negative shift count consumed by srshl)

    dup v29.4s, w3    // output_zp
    dup v30.4s, w7    // acc_min
    dup v31.4s, w8    // acc_max

    cmp x2, #16
    blt LoopDepth8

LoopDepth16:    // main loop: 16 values (four 4-lane vectors) per iteration
    ld1 {v0.4s}, [x1], #16
    ld1 {v1.4s}, [x1], #16
    ld1 {v2.4s}, [x1], #16
    ld1 {v3.4s}, [x1], #16

    sqshl v0.4s, v0.4s, v26.4s
    sqshl v1.4s, v1.4s, v26.4s
    sqshl v2.4s, v2.4s, v26.4s
    sqshl v3.4s, v3.4s, v26.4s

    sqrdmulh v0.4s, v0.4s, v27.4s
    sqrdmulh v1.4s, v1.4s, v27.4s
    sqrdmulh v2.4s, v2.4s, v27.4s
    sqrdmulh v3.4s, v3.4s, v27.4s
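
    // Rounding divide by a power of two (the gemmlowp idiom): v28 holds a
    // negative shift count for srshl; the and/sshr/sqadd fixup adjusts the
    // rounding of negative inputs before the rounding right shift.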
    and v16.16b, v28.16b, v0.16b
    sshr v16.4s, v16.4s, #31
    sqadd v0.4s, v0.4s, v16.4s
    srshl v0.4s, v0.4s, v28.4s
    and v17.16b, v28.16b, v1.16b
    sshr v17.4s, v17.4s, #31
    sqadd v1.4s, v1.4s, v17.4s
    srshl v1.4s, v1.4s, v28.4s
    and v18.16b, v28.16b, v2.16b
    sshr v18.4s, v18.4s, #31
    sqadd v2.4s, v2.4s, v18.4s
    srshl v2.4s, v2.4s, v28.4s
    and v19.16b, v28.16b, v3.16b
    sshr v19.4s, v19.4s, #31
    sqadd v3.4s, v3.4s, v19.4s
    srshl v3.4s, v3.4s, v28.4s

    add v0.4s, v0.4s, v29.4s    // + output_zp
    add v1.4s, v1.4s, v29.4s
    add v2.4s, v2.4s, v29.4s
    add v3.4s, v3.4s, v29.4s

    smax v0.4s, v0.4s, v30.4s   // clamp to [acc_min, acc_max]
    smax v1.4s, v1.4s, v30.4s
    smax v2.4s, v2.4s, v30.4s
    smax v3.4s, v3.4s, v30.4s

    smin v0.4s, v0.4s, v31.4s
    smin v1.4s, v1.4s, v31.4s
    smin v2.4s, v2.4s, v31.4s
    smin v3.4s, v3.4s, v31.4s
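
    // Saturating narrow each vector twice: int32 -> int16 -> int8.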
    sqxtn v0.4h, v0.4s
    sqxtn v1.4h, v1.4s
    sqxtn v2.4h, v2.4s
    sqxtn v3.4h, v3.4s

    sqxtn v0.8b, v0.8h
    sqxtn v1.8b, v1.8h
    sqxtn v2.8b, v2.8h
    sqxtn v3.8b, v3.8h

    st1 {v0.s}[0], [x0], #4    // lane 0 holds four int8 results
    st1 {v1.s}[0], [x0], #4
    st1 {v2.s}[0], [x0], #4
    st1 {v3.s}[0], [x0], #4

    sub x2, x2, #16
    cmp x2, #16
    bge LoopDepth16

LoopDepth8:    // handle 8 remaining values (two vectors)
    cmp x2, #8
    blt LoopDepth4
    ld1 {v0.4s}, [x1], #16
    ld1 {v1.4s}, [x1], #16

    sqshl v0.4s, v0.4s, v26.4s
    sqshl v1.4s, v1.4s, v26.4s

    sqrdmulh v0.4s, v0.4s, v27.4s
    sqrdmulh v1.4s, v1.4s, v27.4s

    and v16.16b, v28.16b, v0.16b
    sshr v16.4s, v16.4s, #31
    sqadd v0.4s, v0.4s, v16.4s
    srshl v0.4s, v0.4s, v28.4s
    and v17.16b, v28.16b, v1.16b
    sshr v17.4s, v17.4s, #31
    sqadd v1.4s, v1.4s, v17.4s
    srshl v1.4s, v1.4s, v28.4s

    add v0.4s, v0.4s, v29.4s
    add v1.4s, v1.4s, v29.4s

    smax v0.4s, v0.4s, v30.4s
    smax v1.4s, v1.4s, v30.4s

    smin v0.4s, v0.4s, v31.4s
    smin v1.4s, v1.4s, v31.4s

    sqxtn v0.4h, v0.4s
    sqxtn v1.4h, v1.4s

    sqxtn v0.8b, v0.8h
    sqxtn v1.8b, v1.8h

    st1 {v0.s}[0], [x0], #4
    st1 {v1.s}[0], [x0], #4

    sub x2, x2, #8
    cmp x2, #8
    bge LoopDepth8

LoopDepth4:    // handle 4 remaining values (one vector)
    cmp x2, #4
    blt End
    ld1 {v0.4s}, [x1], #16

    sqshl v0.4s, v0.4s, v26.4s
    sqrdmulh v0.4s, v0.4s, v27.4s

    and v16.16b, v28.16b, v0.16b
    sshr v16.4s, v16.4s, #31
    sqadd v0.4s, v0.4s, v16.4s
    srshl v0.4s, v0.4s, v28.4s

    add v0.4s, v0.4s, v29.4s
    smax v0.4s, v0.4s, v30.4s
    smin v0.4s, v0.4s, v31.4s

    sqxtn v0.4h, v0.4s
    sqxtn v0.8b, v0.8h

    st1 {v0.s}[0], [x0], #4

    sub x2, x2, #4
    cmp x2, #4
    bge LoopDepth4

End:
    ret
#endif
Some files were not shown because too many files have changed in this diff.