Compare commits

...

100 Commits
master ... r0.2

Author SHA1 Message Date
mindspore-ci-bot dba9481337 !2825 add libtiff notice info to r0.2
6 years ago
xulei2020 adffd67714 add libtiff notice to r0.2
6 years ago
mindspore-ci-bot f5b794c802 !1047 remove dataset link in README
6 years ago
wandongdong 229c8559fa remove dataset link
6 years ago
mindspore-ci-bot 6de2733d52 !1041 delete externel link in example ciafr10_resent50
6 years ago
gengdongjie 3949de02bb delete externel link in resnet50_cifar10 example
6 years ago
mindspore-ci-bot 701606f0a2 !1005 remove http link from README.md for vgg
6 years ago
mindspore-ci-bot 77f6c85698 !1012 remove dataset_link
6 years ago
mindspore-ci-bot 4fdaa8222a !1022 delete dataset hyperlinks in bert README.md file
6 years ago
yoonlee666 9c80971e20 delete dataset hyperlinks in bert README.md
6 years ago
mindspore-ci-bot 0333696c8b !1004 modify readme for resnet101
6 years ago
mindspore-ci-bot 6955440b24 !1014 eliminate external links to data sets about README
6 years ago
chengxianbin a0a2111587 eliminate external lins to dataset
6 years ago
wukesong bbdfd92979 remove dataset_link
6 years ago
meixiaowei 495d0428f7 modify ReadMe and add data parallel
6 years ago
caojian05 61d7ec7bf3 remove http link from README.md for vgg
6 years ago
mindspore-ci-bot 07ae9fc168 !889 Set description type of whl package
6 years ago
leonwanghui 522e178eb1 Fix release package link in dockerfile
6 years ago
leonwanghui 5ad61af54c Fix release package link in README.md
6 years ago
zhoufeng c1e9391d94 set description type of whl package
6 years ago
mindspore-ci-bot 994b1ed052 !868 modify weight init for resnet101
6 years ago
meixiaowei 69e5978eb2 modify weight init
6 years ago
zhangzhenghai 5d666bdb61 update RELEASE.md.
6 years ago
mindspore-ci-bot 680bf2c891 !827 support buffer fusion for r0.2
6 years ago
mindspore-ci-bot 88c32a6f4d !828 Add reduce mean in SoftmaxCrossEntroyWithLogits in Resnet50 example
6 years ago
mindspore-ci-bot 8d4511e729 !825 Add README.md for YOLOv3
6 years ago
mindspore-ci-bot 0de15935ba !826 Check whether the value tuple is empty when converting it to tuple tensor
6 years ago
mindspore-ci-bot 7aaaf1a5c8 !813 remove enable mixed precision for ge
6 years ago
gengdongjie 5265c90884 add reduce mean in SoftmaxCrossEntropyWithLogits for resent50 example
6 years ago
mindspore-ci-bot 24ff160ff0 !817 remove amp setting and add RANK_TABLE_FILE
6 years ago
jjfeing cd6ed0e344 support buffer fusion
6 years ago
mindspore-ci-bot e3fcf825bc !821 modify resnet101 scripts with the default backend mixed precision
6 years ago
YuJianfeng 277659d544 Check the empty value tuple when converting it to tuple tensor
6 years ago
zhaoting 3a2ddd9bb8 add README.md for YOLOv3
6 years ago
wandongdong 1465afc5f1 del amp and add RANK_TABLE_FILE
6 years ago
mindspore-ci-bot bf1d003137 !820 Update document about dynamic_lr
6 years ago
meixiaowei fce21087dd modify resnet101 scripts
6 years ago
leilei_snow ba7ccf26a3 fix api document about dynamic_lr
6 years ago
caojian05 52a238f4bb remove enable mixed precision for ge
6 years ago
mindspore-ci-bot 3183579e0e !798 change runtime error to type error when cannot find kernel info
6 years ago
lianliguang e9c3a5a7f8 change runtime error to type error when cannot find kernel info
6 years ago
mindspore-ci-bot ca5f81af36 !748 fix np.histogram sometimes calc very large bucket number
6 years ago
mindspore-ci-bot 517e3235ba !790 add distribute train README for vgg16
6 years ago
mindspore-ci-bot 6562aa66d1 !783 add distribute train for vgg16
6 years ago
mindspore-ci-bot f4e8bca783 !787 Fix dtype judge sentence in infer_dtype function of hcom operations
6 years ago
mindspore-ci-bot 928b0bb309 !781 Adjust the order of cast and reshape in the grammar implementation process
6 years ago
caojian05 16bc4abe34 add distribute train README for vgg16
6 years ago
mindspore-ci-bot 9ca2349c81 !782 modify resnet101 dir name to resnet101_imagenet2012
6 years ago
zhouyuanshen 6cc51e0c0c fix bug in infer_dtype function of hcom operations
6 years ago
mindspore-ci-bot fb90cb4da6 !768 [bug]with eval cell show cast is not support in gpu pynative
6 years ago
mindspore-ci-bot c96d5f5353 !744 Disable ConfusionMulGrad fusion pass
6 years ago
caojian05 84f914c4bb add distribute train for vgg16
6 years ago
meixiaowei aef80c44e2 modify resnet101 dir name to resnet101_imagenet2012
6 years ago
Wei Luning 6b39161701 only cast when level is O2
6 years ago
candanzg e8850e485e repair cast
6 years ago
mindspore-ci-bot 06af0f751f !773 Set precision mode and allreduce split strategy
6 years ago
mindspore-ci-bot c90b66a0db !777 fix bugs and dock ops
6 years ago
gengdongjie e8621ce1d6 set auto mix precision and allreduce aplit size
6 years ago
mindspore-ci-bot 118d434a3b !771 Upload Resnet101 Scripts
6 years ago
mindspore-ci-bot 58844968fb !770 Add MobilenetV2 to model_zoo and train scripts to r0.2
6 years ago
buxue 381acf617b dock FloorMod GreaterEqual NotEqual ScatterNdUpdate
6 years ago
mindspore-ci-bot c52934923e !757 remove redundant data copy
6 years ago
buxue 42eb8b08c2 fix bugs of Acosh, TopK, ResizeNearestNeighbor, DepthwiseConv2dNative
6 years ago
buxue 4fa2d03c89 fix reviewboot and example of TruncatedNormal and add type mapping
6 years ago
buxue 71ccf74b88 fix the infer of TruncatedNormal and a bug of structure output and a bug of tensorslice ellipsis
6 years ago
meixiaowei 0cd381be14 resnet101 update
6 years ago
wandongdong 3aa54aada3 add mobilenetv2
6 years ago
mindspore-ci-bot fed85d7927 !758 modify maxpool in alexnet
6 years ago
mindspore-ci-bot a427dd6059 !726 modify init_ge api name and add init_ge to init_dataset
6 years ago
mindspore-ci-bot 2e6f97f60a !752 fix bug of import _akg failed
6 years ago
wukesong 5543f829e1 alexnet-maxpool
6 years ago
dinghao 7bc0cbca18 remove data sync
6 years ago
lizhenyu b7b7ef390d fix bug of import akg failed
6 years ago
wenkai cdc09b1ce5 fix np.histograms(bins='auto') sometimes calc very small width and very large bucket number, which lead to error/long compute time.
6 years ago
huanghui e7549bd78a Disable ConfusionMulGrad fusion pass
6 years ago
mindspore-ci-bot 4337a32ae6 !722 fix load checkpoint bug
6 years ago
mindspore-ci-bot b56cbf1851 !725 Fix confusionmulgrad fusion pass cannot work
6 years ago
leonwanghui e86ab6ce9c !724 Bump the version to 0.2.0-alpha
6 years ago
jinyaohui 5f18c85ffe modify init_dataset
6 years ago
mindspore-ci-bot b04b879431 !714 fix select wrong kernel
6 years ago
mindspore-ci-bot 16c00622e7 !716 Check topk supported before converting input to attr
6 years ago
chang zherui df71c09a4e modify load ckpt
6 years ago
mindspore-ci-bot bfd2afc00b !657 Fix confusionmulgrad fusion pass cannot work
6 years ago
leonwanghui 1f05fa8210 Bump the version to 0.2.0-alpha
6 years ago
mindspore-ci-bot 3fc0c2e1ff !695 Check topk supported before converting input to attr
6 years ago
chenjianping 290f783f3b fix select wrong kernel
6 years ago
mindspore-ci-bot f52d4a3dbb !710 sync profiling modifies from master to r0.2
6 years ago
caifubi bd1d6d558c sync profiling bp_end point modify from master
6 years ago
mindspore-ci-bot 79d1e46573 !702 add buffer fusion bnupdate eltwise pass
6 years ago
Etone.Chan 4e39354daa add buffer fusion bnupdate eltwise pass
6 years ago
mindspore-ci-bot a04e848627 !650 Match format when kernel selecting using raise or reduce precision
6 years ago
liubuyu 05e001fc84 add model parameters for vgg16 to enable mixed precision
6 years ago
mindspore-ci-bot 16ac0f29de !640 add model parameters for vgg16 to enable mixed precision.
6 years ago
mindspore-ci-bot 4bdb03f92c !633 [MD] adjust mindrecord ut
6 years ago
mindspore-ci-bot ec1b5ada66 !638 modify r0.2 version number
6 years ago
wangnan39@huawei.com 361181f465 modify r0.2 version number
6 years ago
mindspore-ci-bot 1b9bad8469 !635 modify r0.2 version number
6 years ago
wangnan39@huawei.com bf9de88cdb modify r0.2 version number
6 years ago
caojian05 73d4cf77d4 add model parameters for vgg16 to open mixed precision.
6 years ago
liyong 1f222ddb9e fix mindrecord c ut
6 years ago

@ -1,7 +1,7 @@
![MindSpore Logo](docs/MindSpore-logo.png "MindSpore logo")
============================================================
- [What is MindSpore?](#what-is-mindspore)
- [What Is MindSpore?](#what-is-mindspore)
- [Automatic Differentiation](#automatic-differentiation)
- [Automatic Parallel](#automatic-parallel)
- [Installation](#installation)
@ -29,7 +29,7 @@ enrichment of the AI software/hardware application ecosystem.
<img src="docs/MindSpore-architecture.png" alt="MindSpore Architecture" width="600"/>
For more details please check out our [Architecture Guide](https://www.mindspore.cn/docs/en/0.1.0-alpha/architecture.html).
For more details please check out our [Architecture Guide](https://www.mindspore.cn/docs/en/0.2.0-alpha/architecture.html).
### Automatic Differentiation
@ -76,13 +76,36 @@ For installation using `pip`, take `CPU` and `Ubuntu-x86` build version as an ex
1. Download whl from [MindSpore download page](https://www.mindspore.cn/versions/en), and install the package.
```
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.1.0-alpha/MindSpore/cpu/ubuntu-x86/mindspore-0.1.0-cp37-cp37m-linux_x86_64.whl
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.2.0-alpha/MindSpore/cpu/x86_ubuntu/mindspore-0.2.0-cp37-cp37m-linux_x86_64.whl
```
2. Run the following command to verify the install.
```python
import numpy as np
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
class Mul(nn.Cell):
def __init__(self):
super(Mul, self).__init__()
self.mul = P.Mul()
def construct(self, x, y):
return self.mul(x, y)
x = Tensor(np.array([1.0, 2.0, 3.0]).astype(np.float32))
y = Tensor(np.array([4.0, 5.0, 6.0]).astype(np.float32))
mul = Mul()
print(mul(x, y))
```
python -c 'import mindspore'
```
[ 4. 10. 18.]
```
### From Source
@ -96,20 +119,22 @@ currently the containerized build options are supported as follows:
| Hardware Platform | Docker Image Repository | Tag | Description |
| :---------------- | :---------------------- | :-- | :---------- |
| CPU | `mindspore/mindspore-cpu` | `0.1.0-alpha` | Production environment with pre-installed MindSpore `0.1.0-alpha` CPU release. |
| CPU | `mindspore/mindspore-cpu` | `x.y.z` | Production environment with pre-installed MindSpore `x.y.z` CPU release. |
| | | `devel` | Development environment provided to build MindSpore (with `CPU` backend) from the source, refer to https://www.mindspore.cn/install/en for installation details. |
| | | `runtime` | Runtime environment provided to install MindSpore binary package with `CPU` backend. |
| GPU | `mindspore/mindspore-gpu` | `0.1.0-alpha` | Production environment with pre-installed MindSpore `0.1.0-alpha` GPU release. |
| GPU | `mindspore/mindspore-gpu` | `x.y.z` | Production environment with pre-installed MindSpore `x.y.z` GPU release. |
| | | `devel` | Development environment provided to build MindSpore (with `GPU CUDA10.1` backend) from the source, refer to https://www.mindspore.cn/install/en for installation details. |
| | | `runtime` | Runtime environment provided to install MindSpore binary package with `GPU` backend. |
| | | `runtime` | Runtime environment provided to install MindSpore binary package with `GPU CUDA10.1` backend. |
| Ascend | <center>&mdash;</center> | <center>&mdash;</center> | Coming soon. |
> **NOTICE:** For GPU `devel` docker image, it's NOT suggested to directly install the whl package after building from the source, instead we strongly RECOMMEND you transfer and install the whl package inside GPU `runtime` docker image.
* CPU
For `CPU` backend, you can directly pull and run the image using the below command:
For `CPU` backend, you can directly pull and run the latest stable image using the below command:
```
docker pull mindspore/mindspore-cpu:0.1.0-alpha
docker run -it mindspore/mindspore-cpu:0.1.0-alpha python -c 'import mindspore'
docker pull mindspore/mindspore-cpu:0.2.0-alpha
docker run -it mindspore/mindspore-cpu:0.2.0-alpha /bin/bash
```
* GPU
@ -124,20 +149,21 @@ currently the containerized build options are supported as follows:
sudo systemctl restart docker
```
Then you can pull and run the image using the below command:
Then you can pull and run the latest stable image using the below command:
```
docker pull mindspore/mindspore-gpu:0.1.0-alpha
docker run -it --runtime=nvidia --privileged=true mindspore/mindspore-gpu:0.1.0-alpha /bin/bash
docker pull mindspore/mindspore-gpu:0.2.0-alpha
docker run -it --runtime=nvidia --privileged=true mindspore/mindspore-gpu:0.2.0-alpha /bin/bash
```
To test if the docker image works, please execute the python code below and check the output:
```python
import numpy as np
import mindspore.context as context
from mindspore import Tensor
from mindspore.ops import functional as F
import mindspore.context as context
context.set_context(device_target="GPU")
x = Tensor(np.ones([1,3,3,4]).astype(np.float32))
y = Tensor(np.ones([1,3,3,4]).astype(np.float32))
print(F.tensor_add(x, y))
@ -157,11 +183,11 @@ currently the containerized build options are supported as follows:
```
If you want to learn more about the building process of MindSpore docker images,
please check out `docker` folder for the details.
please check out [docker](docker/README.md) repo for the details.
## Quickstart
See the [Quick Start](https://www.mindspore.cn/tutorial/en/0.1.0-alpha/quick_start/quick_start.html)
See the [Quick Start](https://www.mindspore.cn/tutorial/en/0.2.0-alpha/quick_start/quick_start.html)
to implement the image classification.
## Docs

File diff suppressed because one or more lines are too long

@ -3042,6 +3042,60 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS", AND
Why Three Licenses?
The zlib License could have been used instead of the Modified (3-clause) BSD License, and since the IJG License effectively subsumes the distribution conditions of the zlib License, this would have effectively placed libjpeg-turbo binary distributions under the IJG License. However, the IJG License specifically refers to the Independent JPEG Group and does not extend attribution and endorsement protections to other entities. Thus, it was desirable to choose a license that granted us the same protections for new code that were granted to the IJG for code derived from their software.
Software: libtiff 4.1.0
Copyright notice:
Copyright © 2015 Open Microscopy Environment / University of Dundee
Copyright (c) 2004, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 1990-1997 Sam Leffler
Copyright (c) 1991-1997 Silicon Graphics, Inc.
Copyright (c) 1988-1997 Sam Leffler
Copyright (c) 1991-1997 Sam Leffler
Use and Copyright
Copyright (C) 1990, 1995 Frank D. Cringle.
Copyright (c) 1994-1997 Sam Leffler
Copyright (c) 1994-1997 Silicon Graphics, Inc.
Copyright (c) 1997 Greg Ward Larson
Copyright (c) 1997 Silicon Graphics, Inc.
Copyright (c) 2010, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) Joris Van Damme <info@awaresystems.be>
Copyright (c) AWare Systems <http://www.awaresystems.be/>
Copyright (c) 1996-1997 Sam Leffler
Copyright (c) 1996 Pixar
Copyright (c) 1995-1997 Sam Leffler
Copyright (c) 1995-1997 Silicon Graphics, Inc.
Copyright (c) 1988-1996 Sam Leffler
Copyright (c) 1991-1996 Silicon Graphics, Inc.
Copyright (c) 1992-1997 Sam Leffler
Copyright (c) 1992-1997 Silicon Graphics, Inc.
Copyright (c) 2018, Mapbox
Copyright (c) 2017, Planet Labs
Copyright (c) 1990 by Sun Microsystems, Inc.
Copyright 1990 by Digital Equipment Corporation, Maynard, Massachusetts.
Copyright 1991 by Digital Equipment Corporation, Maynard, Massachusetts.
Copyright (c) 2002, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2003 Ross Finlayson
Additions (c) Richard Nolde 2006-2010
Copyright (c) 2003, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2000, Frank Warmerdam
Copyright (c) 1987, 1993, 1994
Copyright (c) 1989, 1993
Copyright (c) 2009 Frank Warmerdam
Copyright (c) 1987, 1993
Copyright (c) 2005 The DragonFly Project. All rights reserved.
Copyright (c) 2003 Citrus Project,
All rights reserved.
Copyright (c) 1990, 1993
Copyright (c) 1996 Mike Johnson
Copyright (c) 1996 BancTec AB
Copyright (c) 2004, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2012, Frank Warmerdam <warmerdam@pobox.com>
Copyright (c) 2019, Even Rouault <even.rouault at spatialys.com>
Copyright (c) 2007, Frank Warmerdam <warmerdam@pobox.com>
Copyright (c) 2019, Thomas Bernard <miniupnp@free.fr>
Copyright (c) 2008, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 1999, Frank Warmerdam
Copyright (c) 1991-1996 Sam Leffler
Copyright (c) 1996 USAF Phillips Laboratory
Software: opencv 4.2.0
Copyright notice:

@ -14,27 +14,27 @@
@rem ============================================================================
@echo off
@title mindspore_build
SET BASEPATH=%CD%
IF NOT EXIST %BASEPATH%/build (
md "build"
)
cd %BASEPATH%/build
SET BUILD_PATH=%CD%
IF NOT EXIST %BUILD_PATH%/mindspore (
md "mindspore"
)
cd %CD%/mindspore
cmake -DCMAKE_BUILD_TYPE=Release -DENABLE_CPU=ON -DENABLE_MINDDATA=ON -DUSE_GLOG=ON -G "CodeBlocks - MinGW Makefiles" ../..
IF NOT %errorlevel% == 0 (
echo "cmake fail."
goto run_fail
)
IF "%1%" == "" (
cmake --build . --target package -- -j6
) ELSE (

@ -433,9 +433,9 @@ build_predict()
cd "${BASEPATH}/predict/output/"
if [[ "$PREDICT_PLATFORM" == "x86_64" ]]; then
tar -cf MSPredict-0.1.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.2.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
elif [[ "$PREDICT_PLATFORM" == "arm64" ]]; then
tar -cf MSPredict-0.1.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.2.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
fi
echo "success to build predict project!"
}

@ -4,14 +4,13 @@ This folder hosts all the `Dockerfile` to build MindSpore container images with
### MindSpore docker build command
* CPU
| Hardware Platform | Version | Build Command |
| :---------------- | :------ | :------------ |
| CPU | `x.y.z` | cd mindspore-cpu/x.y.z && docker build . -t mindspore/mindspore-cpu:x.y.z |
| | `devel` | cd mindspore-cpu/devel && docker build . -t mindspore/mindspore-cpu:devel |
| | `runtime` | cd mindspore-cpu/runtime && docker build . -t mindspore/mindspore-cpu:runtime |
| GPU | `x.y.z` | cd mindspore-gpu/x.y.z && docker build . -t mindspore/mindspore-gpu:x.y.z |
| | `devel` | cd mindspore-gpu/devel && docker build . -t mindspore/mindspore-gpu:devel |
| | `runtime` | cd mindspore-gpu/runtime && docker build . -t mindspore/mindspore-gpu:runtime |
```
cd mindspore-cpu/0.1.0-alpha && docker build . -t mindspore/mindspore-cpu:0.1.0-alpha
```
* GPU
```
cd mindspore-gpu/0.1.0-alpha && docker build . -t mindspore/mindspore-gpu:0.1.0-alpha
```
> **NOTICE:** The `x.y.z` version shown above should be replaced with the real version number.

@ -64,4 +64,4 @@ RUN mkdir -pv /root/.pip \
&& echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf
# Install MindSpore cpu whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.1.0-alpha/MindSpore/cpu/ubuntu-x86/mindspore-0.1.0-cp37-cp37m-linux_x86_64.whl
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.2.0-alpha/MindSpore/cpu/ubuntu-x86/mindspore-0.2.0-cp37-cp37m-linux_x86_64.whl

@ -0,0 +1,67 @@
# Builds an Ubuntu 18.04 image with a source-built Python 3.7.5 and the
# MindSpore 0.2.0 CPU wheel installed via pip.
FROM ubuntu:18.04
# NOTE(review): MAINTAINER is deprecated in the Dockerfile reference;
# `LABEL maintainer=...` is the documented replacement.
MAINTAINER leonwanghui <leon.wanghui@huawei.com>
# Set env
# PYTHON_ROOT_PATH is the install prefix passed to ./configure below.
# /usr/local/bin is put first on PATH because the python/pip symlinks
# created below live there.
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV PATH /usr/local/bin:$PATH
# Install base tools
RUN apt update \
&& DEBIAN_FRONTEND=noninteractive apt install -y \
vim \
wget \
curl \
xz-utils \
net-tools \
openssh-client \
git \
ntpdate \
tzdata \
tcl \
sudo \
bash-completion
# Install compile tools
# NOTE(review): this layer installs packages without its own `apt update`;
# it relies on the package index fetched by the previous RUN layer.
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
gcc \
g++ \
zlibc \
make \
libgmp-dev \
patch \
autoconf \
libtool \
automake \
flex
# Set bash
# Pre-answers the debconf "dash/sh" question with "false" and then
# reconfigures dash; presumably this stops dash from being /bin/sh so that
# bash is used instead -- TODO confirm.
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash
# Install python (v3.7.5)
# Installs the build dependencies, downloads the CPython 3.7.5 source archive,
# builds and installs it under PYTHON_ROOT_PATH, replaces any existing
# /usr/local/bin/python and /usr/local/bin/pip with symlinks to the new
# interpreter and pip, and finally removes the downloaded sources.
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
&& cd /tmp \
&& wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
&& tar -xvf v3.7.5.tar.gz \
&& cd /tmp/cpython-3.7.5 \
&& mkdir -p ${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz
# Set pip source
# Writes /root/.pip/pip.conf so pip uses the Aliyun PyPI mirror.
# NOTE(review): the index URL is plain HTTP and the host is marked as
# trusted, so packages are fetched without TLS -- confirm this is acceptable.
RUN mkdir -pv /root/.pip \
&& echo "[global]" > /root/.pip/pip.conf \
&& echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
&& echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf
# Install MindSpore cpu whl package
# The wheel is the cp37 / linux_x86_64 build, matching the Python 3.7.5
# interpreter installed above; --no-cache-dir keeps pip's download cache out
# of the image layer.
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.2.0-alpha/MindSpore/cpu/x86_ubuntu/mindspore-0.2.0-cp37-cp37m-linux_x86_64.whl

@ -80,4 +80,4 @@ RUN cd /tmp \
&& rm -f /tmp/openmpi-3.1.5.tar.gz
# Install MindSpore cuda-10.1 whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.1.0-alpha/MindSpore/gpu/cuda-10.1/mindspore-0.1.0-cp37-cp37m-linux_x86_64.whl
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.2.0-alpha/MindSpore/gpu/cuda-10.1/mindspore-0.2.0-cp37-cp37m-linux_x86_64.whl

@ -0,0 +1,83 @@
# Builds a CUDA 10.1 / cuDNN 7 runtime image (Ubuntu 18.04) with a
# source-built Python 3.7.5, a source-built OpenMPI 3.1.5 and the
# MindSpore 0.2.0 GPU (cuda-10.1) wheel installed via pip.
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
# NOTE(review): MAINTAINER is deprecated in the Dockerfile reference;
# `LABEL maintainer=...` is the documented replacement.
MAINTAINER leonwanghui <leon.wanghui@huawei.com>
# Set env
# PYTHON_ROOT_PATH and OMPI_ROOT_PATH are the install prefixes passed to the
# respective ./configure calls below. The OpenMPI bin/ and lib/ directories
# are prepended to PATH and LD_LIBRARY_PATH; /usr/local/bin holds the
# python/pip symlinks created below.
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV OMPI_ROOT_PATH /usr/local/openmpi-3.1.5
ENV PATH ${OMPI_ROOT_PATH}/bin:/usr/local/bin:$PATH
ENV LD_LIBRARY_PATH ${OMPI_ROOT_PATH}/lib:$LD_LIBRARY_PATH
# Install base tools
RUN apt update \
&& DEBIAN_FRONTEND=noninteractive apt install -y \
vim \
wget \
curl \
xz-utils \
net-tools \
openssh-client \
git \
ntpdate \
tzdata \
tcl \
sudo \
bash-completion
# Install compile tools
# NCCL runtime and development packages are pinned to 2.4.8-1+cuda10.1.
# NOTE(review): this layer installs packages without its own `apt update`;
# it relies on the package index fetched by the previous RUN layer.
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
gcc \
g++ \
zlibc \
make \
libgmp-dev \
patch \
autoconf \
libtool \
automake \
flex \
libnccl2=2.4.8-1+cuda10.1 \
libnccl-dev=2.4.8-1+cuda10.1
# Set bash
# Pre-answers the debconf "dash/sh" question with "false" and then
# reconfigures dash; presumably this stops dash from being /bin/sh so that
# bash is used instead -- TODO confirm.
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash
# Install python (v3.7.5)
# Installs the build dependencies, downloads the CPython 3.7.5 source archive,
# builds and installs it under PYTHON_ROOT_PATH, replaces any existing
# /usr/local/bin/python and /usr/local/bin/pip with symlinks to the new
# interpreter and pip, and finally removes the downloaded sources.
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
&& cd /tmp \
&& wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
&& tar -xvf v3.7.5.tar.gz \
&& cd /tmp/cpython-3.7.5 \
&& mkdir -p ${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz
# Set pip source
# Writes /root/.pip/pip.conf so pip uses the Aliyun PyPI mirror.
# NOTE(review): the index URL is plain HTTP and the host is marked as
# trusted, so packages are fetched without TLS -- confirm this is acceptable.
RUN mkdir -pv /root/.pip \
&& echo "[global]" > /root/.pip/pip.conf \
&& echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
&& echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf
# Install openmpi (v3.1.5)
# Downloads the OpenMPI 3.1.5 source archive, builds and installs it under
# OMPI_ROOT_PATH, then removes the downloaded sources.
RUN cd /tmp \
&& wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.5.tar.gz \
&& tar -xvf openmpi-3.1.5.tar.gz \
&& cd /tmp/openmpi-3.1.5 \
&& mkdir -p ${OMPI_ROOT_PATH} \
&& ./configure --prefix=${OMPI_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -rf /tmp/openmpi-3.1.5 \
&& rm -f /tmp/openmpi-3.1.5.tar.gz
# Install MindSpore cuda-10.1 whl package
# The wheel is the cp37 / linux_x86_64 build, matching the Python 3.7.5
# interpreter installed above; --no-cache-dir keeps pip's download cache out
# of the image layer.
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.2.0-alpha/MindSpore/gpu/cuda-10.1/mindspore_gpu-0.2.0-cp37-cp37m-linux_x86_64.whl

@ -4,8 +4,8 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the zhwiki dataset from <https://dumps.wikimedia.org/zhwiki> for pre-training. Extract and clean text in the dataset with [WikiExtractor](https://github.com/attardi/wiliextractor). Convert the dataset to TFRecord format and move the files to a specified path.
- Download the CLUE dataset from <https://www.cluebenchmarks.com> for fine-tuning and evaluation.
- Download the zhwiki dataset for pre-training. Extract and clean text in the dataset with [WikiExtractor](https://github.com/attardi/wikiextractor). Convert the dataset to TFRecord format and move the files to a specified path.
- Download the CLUE dataset for fine-tuning and evaluation.
> Notes:
If you are running a fine-tuning or evaluation task, prepare the corresponding checkpoint file.

@ -10,7 +10,7 @@ This is the simple tutorial for training AlexNet in MindSpore.
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the CIFAR-10 dataset at <http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz>. The directory structure is as follows:
- Download the CIFAR-10 dataset. The directory structure is as follows:
```
├─cifar-10-batches-bin

@ -10,7 +10,7 @@ This is the simple and basic tutorial for constructing a network in MindSpore.
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the MNIST dataset at <http://yann.lecun.com/exdb/mnist/>. The directory structure is as follows:
- Download the MNIST dataset. The directory structure is as follows:
```
└─MNIST_Data

@ -0,0 +1,101 @@
# MobileNetV2 Example
## Description
This is an example of training MobileNetV2 with ImageNet2012 dataset in MindSpore.
## Requirements
* Install [MindSpore](https://www.mindspore.cn/install/en).
* Download the ImageNet2012 dataset.
> Unzip the ImageNet2012 dataset to any path you want and the folder structure should be as follows:
> ```
> .
> ├── train # train dataset
> └── val # infer dataset
> ```
## Example structure
``` shell
.
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── eval.py # infer script
├── launch.py # launcher for distributed training
├── lr_generator.py # generate learning rate for each step
├── run_infer.sh # launch inference
├── run_train.sh # launch training
└── train.py # train script
```
## Parameter configuration
Parameters for both training and inference can be set in 'config.py'.
```
"num_classes": 1000, # dataset class num
"image_height": 224, # image height
"image_width": 224, # image width
"batch_size": 256, # training or inference batch size
"epoch_size": 200, # total training epochs, including warmup_epochs
"warmup_epochs": 4, # warmup epochs
"lr": 0.4, # base learning rate
"momentum": 0.9, # momentum
"weight_decay": 4e-5, # weight decay
"loss_scale": 1024, # loss scale
"save_checkpoint": True, # whether save checkpoint
"save_checkpoint_epochs": 1, # the epoch interval between two checkpoints
"keep_checkpoint_max": 200, # only keep the last keep_checkpoint_max checkpoints
"save_checkpoint_path": "./checkpoint" # path to save checkpoint
```
## Running the example
### Train
#### Usage
Usage: sh run_train.sh [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
#### Launch
```
# training example
sh run_train.sh 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet
```
#### Result
Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train/train.log`, like the following.
```
epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100]
epoch time: 140522.500, per step time: 224.836, avg loss: 5.258
epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200]
epoch time: 138331.250, per step time: 221.330, avg loss: 3.917
```
### Infer
#### Usage
Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]
#### Launch
```
# infer example
sh run_infer.sh ~/imagenet ~/train/mobilenet-200_625.ckpt
```
> The checkpoint is produced during the training process.
#### Result
Inference results will be stored in the example path; you can find results like the following in `val.log`.
```
result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt
```

@ -0,0 +1,35 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py and eval.py
"""
from easydict import EasyDict as ed
# Shared hyper-parameters and checkpoint settings, exposed with attribute
# access through EasyDict (e.g. config.batch_size).
config = ed({
"num_classes": 1000,  # dataset class num
"image_height": 224,  # image height
"image_width": 224,  # image width
"batch_size": 256,  # training or inference batch size
"epoch_size": 200,  # total training epochs, including warmup_epochs
"warmup_epochs": 4,  # warmup epochs
"lr": 0.4,  # base learning rate
"momentum": 0.9,  # momentum
"weight_decay": 4e-5,  # weight decay
"loss_scale": 1024,  # loss scale
"save_checkpoint": True,  # whether to save checkpoints
"save_checkpoint_epochs": 1,  # the epoch interval between two checkpoints
"keep_checkpoint_max": 200,  # only keep the last keep_checkpoint_max checkpoints
"save_checkpoint_path": "./checkpoint",  # path to save checkpoints
})

@ -0,0 +1,84 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from config import config
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    Create a train or eval dataset.

    Images are read from an image-folder dataset, decoded, augmented (train)
    or resized and center-cropped (eval), rescaled, normalized and converted
    from HWC to CHW. Labels are cast to int32. The result is shuffled,
    batched (dropping the incomplete last batch) and repeated.

    Sharding is controlled by the ``RANK_SIZE`` and ``RANK_ID`` environment
    variables. When they are not set, single-device values (``RANK_SIZE=1``,
    ``RANK_ID=0``) are assumed instead of failing with a ``TypeError`` from
    ``int(None)``.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32

    Returns:
        dataset

    Raises:
        ValueError: if ``RANK_SIZE`` or ``RANK_ID`` is set to a value that is
            not an integer literal.
    """
    # Fall back to a single-device setup when the launcher did not export
    # the rank variables (e.g. a plain single-card run).
    rank_size = int(os.getenv("RANK_SIZE", "1"))
    rank_id = int(os.getenv("RANK_ID", "0"))

    if rank_size == 1:
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=16, shuffle=True)
    else:
        # Each rank reads only its own shard of the dataset.
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=16, shuffle=True,
                                     num_shards=rank_size, shard_id=rank_id)

    resize_height = config.image_height
    resize_width = config.image_width
    rescale = 1.0 / 255.0
    shift = 0.0
    buffer_size = 1000

    # define map operations shared by train and eval
    decode_op = C.Decode()
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    change_swap_op = C.HWC2CHW()

    # Only build the spatial ops that the selected mode actually uses.
    if do_train:
        resize_crop_op = C.RandomResizedCrop(resize_height, scale=(0.2, 1.0))
        horizontal_flip_op = C.RandomHorizontalFlip()
        trans = [decode_op, resize_crop_op, horizontal_flip_op, rescale_op, normalize_op, change_swap_op]
    else:
        resize_op = C.Resize((256, 256))
        center_crop = C.CenterCrop(resize_width)
        trans = [decode_op, resize_op, center_crop, rescale_op, normalize_op, change_swap_op]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image", operations=trans)
    ds = ds.map(input_columns="label", operations=type_cast_op)

    # apply shuffle operations
    ds = ds.shuffle(buffer_size=buffer_size)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds

@ -0,0 +1,56 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
eval.
"""
import os
import argparse
from dataset import create_dataset
from config import config
from mindspore import context
from mindspore.model_zoo.mobilenet import mobilenet_v2
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
args_opt = parser.parse_args()

# DEVICE_ID is exported by run_infer.sh; default to device 0 so that a direct
# invocation does not die with a TypeError from int(None).
device_id = int(os.getenv('DEVICE_ID', '0'))

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id, save_graphs=False)
context.set_context(enable_task_sink=True)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
    context.set_context(enable_hccl=False)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
    # Build the network with the same class count that train.py uses, so the
    # checkpoint parameters line up with the evaluated network.
    net = mobilenet_v2(num_classes=config.num_classes)

    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size)

    # Without a checkpoint the freshly initialised network is evaluated.
    if args_opt.checkpoint_path:
        param_dict = load_checkpoint(args_opt.checkpoint_path)
        load_param_into_net(net, param_dict)
    net.set_train(False)

    model = Model(net, loss_fn=loss, metrics={'acc'})
    res = model.eval(dataset)
    print("result:", res, "ckpt=", args_opt.checkpoint_path)

@ -0,0 +1,143 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""launch train script"""
import os
import sys
import json
from argparse import ArgumentParser
def parse_args():
    """
    Parse the launcher's command line.

    Options that the launcher does not know are not rejected; they are
    collected and later forwarded unchanged to the training script.

    Args:

    Returns:
        args, the parsed namespace with an extra ``training_script_args``
        attribute holding the list of unrecognised arguments.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training launch "
                                        "helper utilty that will spawn up "
                                        "multiple distributed processes")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node, "
                             "for D training, this is recommended to be set "
                             "to the number of D in your system so that "
                             "each process can be bound to a single D.")
    parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                        help="will use the visible devices sequentially")
    parser.add_argument("--server_id", type=str, default="",
                        help="server ip")
    # Required: without it the launcher has nothing to run and would only
    # fail later with a TypeError when the path is checked.
    parser.add_argument("--training_script", type=str, required=True,
                        help="The full path to the single D training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")
    # rest from the training program
    args, unknown = parser.parse_known_args()
    args.training_script_args = unknown
    return args
def _read_device_ips(conf_path):
    """Return {device_id: device_ip} from the ``address_<id>=<ip>`` lines of an hccn config file."""
    device_ips = {}
    # The context manager closes the file even when a line fails to parse.
    with open(conf_path, 'r') as conf_fp:
        for hccn_item in conf_fp:
            hccn_item = hccn_item.strip()
            if hccn_item.startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip
                print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
    return device_ips


def _build_rank_table(nproc_per_node, visible_devices, device_ips, server_id):
    """Assemble the rank-table dictionary for the first ``nproc_per_node`` visible devices."""
    instance_list = []
    for instance_id in range(nproc_per_node):
        device_id = visible_devices[instance_id]
        instance_list.append({
            'devices': [{
                'device_id': device_id,
                # KeyError here means the device has no address entry in the config.
                'device_ip': device_ips[device_id],
            }],
            'rank_id': str(instance_id),
            'server_id': server_id,
        })

    return {
        'board_id': '0x0000',
        'chip_info': '910',
        'deploy_mode': 'lab',
        'group_count': '1',
        'group_list': [{
            'device_num': str(nproc_per_node),
            'server_num': '1',
            'group_name': '',
            'instance_count': str(nproc_per_node),
            'instance_list': instance_list,
        }],
        'para_plane_nic_location': 'device',
        'para_plane_nic_name': ['eth{}'.format(visible_devices[instance_id])
                                for instance_id in range(nproc_per_node)],
        'para_plane_nic_num': str(nproc_per_node),
        'status': 'completed',
    }


def main():
    """
    Write a rank table for the requested devices and start one training process per device.

    Every process runs in its own ``device<rank>`` directory under the current
    working directory, in the background, with output redirected to
    ``log<rank>.log``. RANK_SIZE, RANK_ID and DEVICE_ID are exported for each
    process; the rank table path is exported only when more than one process
    is launched.
    """
    import shlex  # local import: only needed to quote the spawned command line

    print("start", __file__)
    args = parse_args()
    print(args)
    visible_devices = args.visible_devices.split(',')
    assert os.path.isfile(args.training_script), 'training_script is not a file'
    assert len(visible_devices) >= args.nproc_per_node, 'not enough visible devices'
    print('visible_devices:{}'.format(visible_devices))
    if not args.server_id:
        print('pleaser input server ip!!!')
        # This is a failure, so report a non-zero status (was exit(0)).
        sys.exit(1)
    print('server_id:{}'.format(args.server_id))

    # construct hccn_table
    device_ips = _read_device_ips('/etc/hccn.conf')
    hccn_table = _build_rank_table(args.nproc_per_node, visible_devices, device_ips, args.server_id)
    usable_dev = ''.join(visible_devices[:args.nproc_per_node])

    # save hccn_table to file
    table_fn = os.path.join(os.getcwd(),
                            'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()

    # Each child changes into its own directory first, so a relative script
    # path has to be made absolute before the command is built.
    script = shlex.quote(os.path.abspath(args.training_script))
    quoted_table = shlex.quote(table_fn)
    # Quote forwarded arguments individually so values containing spaces or
    # shell metacharacters reach the training script unchanged.
    script_args = ' '.join(shlex.quote(arg) for arg in args.training_script_args)

    # spawn the processes
    for rank_id in range(0, args.nproc_per_node):
        device_id = visible_devices[rank_id]
        device_dir = shlex.quote(os.path.join(os.getcwd(), 'device{}'.format(rank_id)))
        rank_process = 'export RANK_SIZE={} && export RANK_ID={} && export DEVICE_ID={} && '.format(
            args.nproc_per_node, rank_id, shlex.quote(device_id))
        if args.nproc_per_node > 1:
            rank_process += 'export MINDSPORE_HCCL_CONFIG_PATH={} && '.format(quoted_table)
            rank_process += 'export RANK_TABLE_FILE={} && '.format(quoted_table)
        rank_process += 'rm -rf {dir} && mkdir {dir} && cd {dir} && python {script} '.format(
            dir=device_dir, script=script)
        rank_process += script_args + ' > log{}.log 2>&1 &'.format(rank_id)
        os.system(rank_process)


if __name__ == "__main__":
    main()

@ -0,0 +1,54 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""learning rate generator"""
import math
import numpy as np
def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
    """
    Build the per-step learning rate schedule: linear warmup, then cosine decay.

    Args:
        global_step(int): index of the first step to keep; rates of earlier steps are dropped
        lr_init(float): learning rate at the first warmup step
        lr_end(float): learning rate the cosine phase decays towards
        lr_max(float): learning rate reached at the end of warmup
        warmup_epochs(int): number of warmup epochs
        total_epochs(int): total epoch of training
        steps_per_epoch(int): steps of one epoch

    Returns:
        np.array, float32 learning rates from ``global_step`` to the last step
    """
    n_total = steps_per_epoch * total_epochs
    n_warmup = steps_per_epoch * warmup_epochs

    def _rate_at(step):
        """Learning rate of a single step, never below zero."""
        if step < n_warmup:
            # linear ramp from lr_init towards lr_max
            value = lr_init + (lr_max - lr_init) * step / n_warmup
        else:
            # half-cosine from lr_max down to lr_end
            value = lr_end + \
                (lr_max - lr_end) * \
                (1. + math.cos(math.pi * (step - n_warmup) / (n_total - n_warmup))) / 2.
        return 0.0 if value < 0.0 else value

    schedule = np.array([_rate_at(step) for step in range(n_total)]).astype(np.float32)
    return schedule[global_step:]

@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Start a single-device evaluation in the background.
# The run happens inside ./eval and its output is written to ./eval/infer.log.

if [ $# != 2 ]
then
    echo "Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]"
    exit 1
fi

# Quote the arguments so a path containing spaces is tested as one word.
if [ ! -d "$1" ]
then
    echo "error: DATASET_PATH=$1 is not a directory"
    exit 1
fi

if [ ! -f "$2" ]
then
    echo "error: CHECKPOINT_PATH=$2 is not a file"
    exit 1
fi

# Resolve both paths now: the script changes into ./eval below, after which
# relative paths given by the caller would no longer point at the same place.
DATASET_PATH=$(cd "$1" > /dev/null && pwd)
CHECKPOINT_PATH=$(cd "$(dirname "$2")" > /dev/null && pwd)/$(basename "$2")

BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1

if [ -d "eval" ];
then
    rm -rf ./eval
fi
mkdir ./eval
cd ./eval || exit

# "> file 2>&1" instead of "&>": the usage line says "sh", and "&>" is not
# understood by every sh implementation.
python "${BASEPATH}/eval.py" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --dataset_path="$DATASET_PATH" > infer.log 2>&1 &  # dataset val folder path

@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Start training through launch.py in the background.
# The run happens inside ./train and the launcher output goes to ./train/train.log.

if [ $# != 4 ]
then
    echo "Usage: sh run_train.sh [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]"
    exit 1
fi

# Reject a device count outside 1..8. The two range tests have to be OR-ed:
# a number can never be both below 1 and above 8, so "&&" never fired.
if [ "$1" -lt 1 ] || [ "$1" -gt 8 ]
then
    echo "error: DEVICE_NUM=$1 is not in (1-8)"
    exit 1
fi

if [ ! -d "$4" ]
then
    echo "error: DATASET_PATH=$4 is not a directory"
    exit 1
fi

# Resolve the dataset path now: this script changes into ./train and the
# launcher changes directory again, which would break a relative path.
DATASET_PATH=$(cd "$4" > /dev/null && pwd)

BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH

if [ -d "train" ];
then
    rm -rf ./train
fi
mkdir ./train
cd ./train || exit

# "> file 2>&1" instead of "&>": the usage line says "sh", and "&>" is not
# understood by every sh implementation.
python "${BASEPATH}/launch.py" \
    --nproc_per_node="$1" \
    --visible_devices="$3" \
    --server_id="$2" \
    --training_script="${BASEPATH}/train.py" \
    --dataset_path="$DATASET_PATH" > train.log 2>&1 &  # dataset train folder

@ -0,0 +1,148 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_imagenet."""
import os
import time
import argparse
import random
import numpy as np
from dataset import create_dataset
from lr_generator import get_lr
from config import config
from mindspore import context
from mindspore import Tensor
from mindspore.model_zoo.mobilenet import mobilenet_v2
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback
from mindspore.train.loss_scale_manager import FixedLossScaleManager
import mindspore.dataset.engine as de
from mindspore.communication.management import init
# Seed Python, NumPy and the MindSpore dataset engine with a fixed value.
random.seed(1)
np.random.seed(1)
de.config.set_seed(1)

parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
args_opt = parser.parse_args()

# These variables are exported by launch.py. Default to one process on
# device 0 so a direct invocation does not die with a TypeError from int(None).
device_id = int(os.getenv('DEVICE_ID', '0'))
rank_id = int(os.getenv('RANK_ID', '0'))
rank_size = int(os.getenv('RANK_SIZE', '1'))
run_distribute = rank_size > 1

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id, save_graphs=False)
context.set_context(enable_task_sink=True)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)
class Monitor(Callback):
    """
    Monitor loss and time.

    Prints loss, elapsed time and learning rate after every step, and a
    timing / average-loss summary after every epoch.

    Args:
        lr_init (numpy array): train lr, indexed by global step. Default: None,
            in which case the learning rate is printed as nan.

    Returns:
        None.

    Examples:
        >>> Monitor(lr_init=Tensor([0.05]*100).asnumpy())
    """

    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        # The declared default is None, so len() must not be called on it blindly.
        self.lr_init_len = 0 if lr_init is None else len(lr_init)
        # Created here so step_end / epoch_end work even if the matching
        # *_begin hook has not run yet.
        self.losses = []
        self.epoch_time = time.time()
        self.step_time = time.time()

    def epoch_begin(self, run_context):
        """Reset the loss history and start the epoch timer."""
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        """Print epoch duration, mean step duration and mean loss (times in milliseconds)."""
        cb_params = run_context.original_args()

        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        per_step_mseconds = epoch_mseconds / cb_params.batch_num
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}".format(epoch_mseconds,
                                                                                      per_step_mseconds,
                                                                                      np.mean(self.losses)
                                                                                      ), flush=True)

    def step_begin(self, run_context):
        """Start the step timer."""
        self.step_time = time.time()

    def step_end(self, run_context):
        """Record the step loss and print one progress line."""
        cb_params = run_context.original_args()
        step_mseconds = (time.time() - self.step_time) * 1000
        step_loss = cb_params.net_outputs

        # The network output may be a tuple/list whose first item is the loss tensor.
        if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())

        self.losses.append(step_loss)
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num

        # Look the rate up defensively: a monitor should not abort training
        # because the schedule is missing or shorter than the run.
        lr_index = cb_params.cur_step_num - 1
        if 0 <= lr_index < self.lr_init_len:
            cur_lr = self.lr_init[lr_index]
        else:
            cur_lr = float('nan')

        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format(
            cb_params.cur_epoch_num - 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
            np.mean(self.losses), step_mseconds, cur_lr), flush=True)
if __name__ == '__main__':
    # hccl and the data-parallel context are only configured for multi-device runs
    if not run_distribute:
        context.set_context(enable_hccl=False)
    else:
        context.set_context(enable_hccl=True)
        context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          parameter_broadcast=True, mirror_mean=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()

    epoch_size = config.epoch_size
    net = mobilenet_v2(num_classes=config.num_classes)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')

    parallel_info = "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size)
    print("train args: ", args_opt, "\ncfg: ", config, parallel_info)

    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
                             repeat_num=epoch_size, batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    # schedule starts at step 0 and both warms up from and decays to zero
    lr_schedule = get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=config.lr,
                         warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size)
    lr = Tensor(lr_schedule)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    opt = Momentum(trainable_params, lr, config.momentum, config.weight_decay, config.loss_scale)
    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)

    # Only rank 0 gets callbacks; every other rank trains with callbacks=None.
    cb = None
    if rank_id == 0:
        cb = [Monitor(lr_init=lr.asnumpy())]
        if config.save_checkpoint:
            checkpoint_interval = config.save_checkpoint_epochs * step_size
            config_ck = CheckpointConfig(save_checkpoint_steps=checkpoint_interval,
                                         keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenet", directory=config.save_checkpoint_path, config=config_ck)
            cb.append(ckpt_cb)

    model.train(epoch_size, dataset, callbacks=cb)

@ -0,0 +1,135 @@
# ResNet101 Example
## Description
This is an example of training ResNet101 with ImageNet dataset in MindSpore.
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the dataset ImageNet2012.
> Unzip the ImageNet2012 dataset to any path you want; the folder should include the train and eval datasets as follows:
```
.
└─dataset
    ├─ilsvrc
    └─validation_preprocess
```
## Example structure
```shell
.
├── crossentropy.py # CrossEntropy loss function
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── eval.py # eval net
├── lr_generator.py # generate learning rate
├── run_distribute_train.sh # launch distributed training(8p)
├── run_infer.sh # launch evaluating
├── run_standalone_train.sh # launch standalone training(1p)
└── train.py # train net
```
## Parameter configuration
Parameters for both training and evaluating can be set in config.py.
```
"class_num": 1001, # dataset class number
"batch_size": 32, # batch size of input tensor
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum optimizer
"weight_decay": 1e-4, # weight decay
"epoch_size": 120, # epoch sizes for training
"buffer_size": 1000, # number of queue size in data preprocessing
"image_height": 224, # image height
"image_width": 224, # image width
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_steps": 500, # the step interval between two checkpoints. By default, the last checkpoint will be saved after the last step
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"lr_decay_mode": "cosine", # decay mode for generating learning rate
"label_smooth": 1, # label_smooth
"label_smooth_factor": 0.1, # label_smooth_factor
"lr": 0.1 # base learning rate
```
## Running the example
### Train
#### Usage
```
# distributed training
sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
# standalone training
sh run_standalone_train.sh [DATASET_PATH]
```
#### Launch
```bash
# distributed training example(8p)
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
# standalone training example(1p)
sh run_standalone_train.sh dataset/ilsvrc
```
> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
#### Result
Training results will be stored in the example path, in folders whose names begin with "train" or "train_parallel". There you can find the checkpoint files together with results like the following in the log.
```
# distribute training result(8p)
epoch: 1 step: 5004, loss is 4.805483
epoch: 2 step: 5004, loss is 3.2121816
epoch: 3 step: 5004, loss is 3.429647
epoch: 4 step: 5004, loss is 3.3667371
epoch: 5 step: 5004, loss is 3.1718972
...
epoch: 67 step: 5004, loss is 2.2768745
epoch: 68 step: 5004, loss is 1.7223864
epoch: 69 step: 5004, loss is 2.0665488
epoch: 70 step: 5004, loss is 1.8717369
...
```
### Infer
#### Usage
```
# infer
sh run_infer.sh [VALIDATION_DATASET_PATH] [CHECKPOINT_PATH]
```
#### Launch
```bash
# infer with checkpoint
sh run_infer.sh dataset/validation_preprocess/ train_parallel0/resnet-120_5004.ckpt
```
> checkpoint can be produced in training process.
#### Result
Inference results will be stored in the example path, in a folder named "infer". There you can find results like the following in the log.
```
result: {'top_5_accuracy': 0.9429417413572343, 'top_1_accuracy': 0.7853513124199744} ckpt=train_parallel0/resnet-120_5004.ckpt
```

@ -0,0 +1,39 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py and eval.py
"""
from easydict import EasyDict as ed
# Settings shared by train.py and eval.py, exposed through attribute access.
config = ed(dict(
    class_num=1001,               # dataset class number
    batch_size=32,                # batch size of input tensor
    loss_scale=1024,              # loss scale
    momentum=0.9,                 # momentum optimizer
    weight_decay=1e-4,            # weight decay
    epoch_size=120,               # epoch sizes for training
    buffer_size=1000,             # number of queue size in data preprocessing
    image_height=224,             # image height
    image_width=224,              # image width
    save_checkpoint=True,         # whether save checkpoint or not
    save_checkpoint_steps=500,    # the step interval between two checkpoints
    keep_checkpoint_max=10,       # only keep the last keep_checkpoint_max checkpoints
    save_checkpoint_path="./",    # path to save checkpoint relative to the executed path
    warmup_epochs=0,              # number of warmup epoch
    lr_decay_mode="cosine",       # decay mode for generating learning rate
    label_smooth=1,               # label_smooth
    label_smooth_factor=0.1,      # label_smooth_factor
    lr=0.1,                       # base learning rate
))

@ -0,0 +1,36 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""define loss function for network"""
from mindspore.nn.loss.loss import _Loss
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore import Tensor
from mindspore.common import dtype as mstype
import mindspore.nn as nn
class CrossEntropy(_Loss):
    """
    Cross entropy on smoothed one-hot labels, built on SoftmaxCrossEntropyWithLogits.

    Args:
        smooth_factor (float): the true class gets ``1 - smooth_factor``; the
            remainder is split evenly over the other classes. Default: 0.
        num_classes (int): class count used to compute the off-class value. Default: 1001.
    """

    def __init__(self, smooth_factor=0., num_classes=1001):
        super(CrossEntropy, self).__init__()
        off_class_value = 1.0 * smooth_factor / (num_classes -1)
        self.onehot = P.OneHot()
        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
        self.off_value = Tensor(off_class_value, mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)

    def construct(self, logit, label):
        # one-hot depth is taken from the second dimension of the logits
        depth = F.shape(logit)[1]
        smoothed_label = self.onehot(label, depth, self.on_value, self.off_value)
        # reduce the loss over axis 0
        return self.mean(self.ce(logit, smoothed_label), 0)

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save