Compare commits

...

204 Commits
master ... r0.5

Author SHA1 Message Date
mindspore-ci-bot 2479bed7dd !8165 【轻量级 PR】:update RELEASE.md.
5 years ago
shenwei41 17c1b9199e update RELEASE.md.
5 years ago
mindspore-ci-bot fb93791e47 !6525 【MD】r0.5 Branch: MD5 value update in the file - icu4c.cmake of branch r0.5
6 years ago
mayang 3ae04a25bd MD5 value update in the file icu4c.cmake of branch r0.5
6 years ago
mindspore-ci-bot 1a26789dc6 !6175 update release info for version 0.5.2
6 years ago
yanghaoran 292daa4c57 0.5.2 release info
6 years ago
mindspore-ci-bot a9cce969cf !5892 train alexnet by imagenet
6 years ago
mindspore-ci-bot 0fbc143a53 !5925 googlenet support imagenet dataset on Ascend
6 years ago
wukesong 587bd2dda6 add imaget dataset
6 years ago
mindspore-ci-bot b95334c79a !5922 vgg16 support imagenet dataset on Ascend
6 years ago
caojian05 e4a0b0db34 googlenet support imagenet dataset on Ascend
6 years ago
CaoJian a9ea12dc63 vgg16 support imagenet dataset on Ascend
6 years ago
mindspore-ci-bot 87ae8d7053 !5830 update version v0.5.2
6 years ago
yanghaoran 04a2bc96c6 update version v0.5.2
6 years ago
mindspore-ci-bot 17dfff4155 !5658 fix yolov3_darknet53 bug in r0.5
6 years ago
yangyongjie bd6253d0bf fix yolov3 bug in r0.5
6 years ago
mindspore-ci-bot 286ffa4495 !5539 Copy ub fusion pass from r0.7 to r0.5
6 years ago
mindspore-ci-bot 8e2803944a !5550 fix bug of making file file if not root user
6 years ago
zhouyaqiang 9056ed1dfd fix bug of make file fail if not root user
6 years ago
etone-chan 9ede74b1b2 modify the condition of pattern match in bnupdate + eltwise fusion pass
6 years ago
mindspore-ci-bot 93a50ec44d !5512 fix cpplint
6 years ago
laiyongqiang d8a6aa28a6 fix cpp lint
6 years ago
mindspore-ci-bot 59d459a837 !5468 add atomic clean for every communication op's inputs
6 years ago
laiyongqiang 6316343e08 add atomic clean op for every communication op's input
6 years ago
mindspore-ci-bot 47026a721f !5476 fix pylint warning
6 years ago
root 79fd6690e5 fix pylint warning
6 years ago
mindspore-ci-bot fe5e781ae5 !5345 Add new model_zoo net densenet
6 years ago
root f6a7916ca5 Add model_zoo net Densenet121
6 years ago
mindspore-ci-bot 034453e48f !4733 Synchronize some bugfix of bert to branch r0.5
6 years ago
chenhaozhe 301b01e48b sync some bugfix of bert scripts to branch r0.5
6 years ago
shibeiji d44cf6a031 debug for machine down because of out of memory when global shuffle level was set for large dataset
6 years ago
mindspore-ci-bot 0e3a39c223 !3357 modify device id
6 years ago
changzherui bdc67ee2ca modify device id
6 years ago
mindspore-ci-bot 3f916bddd3 !3340 modify device id
6 years ago
changzherui f689648872 modify device id
6 years ago
mindspore-ci-bot aef097d312 !3293 Add environment variable ASCEND_CUSTOM_FWK_PATH to fit specific ascend software structures
6 years ago
yanghaoran d946b61a88 add environment variable ASCEND_CUSTOM_FWK_PATH to support specific ascend software structutres
6 years ago
mindspore-ci-bot 6ed3347962 !3170 fix avg_pool operator, adding filter input
6 years ago
yanghaoran 86efd33682 fix avg_pool operator, adding filter input
6 years ago
mindspore-ci-bot 370de14557 !3122 update akg commit id to newest in branch r0.5
6 years ago
looop5 c96fa96ac9 update akg commit id to newest in branch r0.5
6 years ago
mindspore-ci-bot 8e7ae18d0e !3044 delete thirdpraty/icu4c/filter.json
6 years ago
qianlong 6458bef341 change icu4c compile way
6 years ago
mindspore-ci-bot 8d32dc74a4 !2932 delete package to_mindrecord
6 years ago
shenwei41 4d3f556f8e delete package to_mindrecord
6 years ago
mindspore-ci-bot 988876c744 !2933 Update sqlite patch
6 years ago
YangLuo 1b5ce7ae50 Update sqlite patch
6 years ago
mindspore-ci-bot e4bed3bf3d !2827 add libtiff notice info to r0.5
6 years ago
xulei2020 3450388471 add libtiff notice info to r0.5
6 years ago
mindspore-ci-bot 16711f3e62 !2797 update link of readme
6 years ago
leiyuning 17281bb339 update readme
6 years ago
mindspore-ci-bot 0295928983 !2778 gpu lstm network modify
6 years ago
wilfChen 4c14c085aa gpu lstm performace
6 years ago
zhangzhenghai 8c8e5d99d0 update serving/ms_service.proto.
6 years ago
mindspore-ci-bot 1aedd6cee1 !2578 Add case for precision of bert network
6 years ago
mindspore-ci-bot cf366aa0e7 !2758 update release notes - sync
6 years ago
jonyguo b5e4174868 update release notes
6 years ago
mindspore-ci-bot 9b547e20f4 !2751 fix mix target device id
6 years ago
kswang d290c07a0f fix mix target device id
6 years ago
duxiutao dcf40c262b add case to guard precision and modify get const tensor value in composite kernel
6 years ago
mindspore-ci-bot ae82f743da !2739 rebase master to r0.5 for quantization aware training
6 years ago
chenzomi 684ecac927 rebase master to r0.5 for quantizaiton aware training
6 years ago
mindspore-ci-bot 256bedf469 !2740 unroll print loss
6 years ago
mindspore-ci-bot 3c324e1031 !2729 Add resnext50 network
6 years ago
mindspore-ci-bot 3b23d8e45a !2727 Add YOLOV3-DarkNet53 to Model Zoo
6 years ago
chenzomi ff042b92ce unroll print loss
6 years ago
yangyongjie 123226c283 Add YOLOV3-DarkNet53 to Model Zoo
6 years ago
mindspore-ci-bot 412e4580d4 !2721 add thirdparty notice for bert/tokenization
6 years ago
mindspore-ci-bot 0201001a41 !2731 remove export lite model api
6 years ago
yangjie159 46da06b3f8 remove export lite model api
6 years ago
zhouyaqiang 66d2690f95 add resnext50 network
6 years ago
yuchaojie a5fd01fd40 add thirdparty notice for bert/tokenization
6 years ago
mindspore-ci-bot f8fa03d732 !2712 Revert "Make assign-node to be before jump-node, ensure child graph can get its inputs"
6 years ago
mindspore-ci-bot 5b46b05b50 !2714 add notice
6 years ago
mindspore-ci-bot bb3e05c317 !2709 [r0.5]Fix bug of paramter is the second input of control depend
6 years ago
mindspore-ci-bot 7891b53fa8 !2704 fix quantization aware training auto create graph bug
6 years ago
chenzomi c831d3eb60 fix quantization aware training auto create graph bug
6 years ago
jinyaohui 5db6852c75 add notice
6 years ago
zhoufeng 30001d68e2 Revert "Make assign-node to be before jump-node, ensure child graph can get its"
6 years ago
chenfei 92aaf297c4 if parameter is the second input of control depend and depend mode is 0,this control relation is invalid
6 years ago
mindspore-ci-bot 2daa1b33c2 !2693 Move resnet_thor from example to model_zoo
6 years ago
mindspore-ci-bot da9ba5e84d !2695 Update akg to r0.5 branch
6 years ago
mindspore-ci-bot 381bbc4db5 !2691 use two condition, false branch caculate error
6 years ago
mindspore-ci-bot bf52346e65 !2678 Add typeid to type conversion scene
6 years ago
mindspore-ci-bot 39d171ce17 !2689 fix cast kernel build in pynative mode
6 years ago
tronzhang 2848ad1f82 update akg to r0.5 branch
6 years ago
panbingao 70bc8ed529 move resnet_thor series from example to model_zoo
6 years ago
hexia 9794b13c10 fix switch input
6 years ago
buxue fe8f47dc45 add typeid to type conversion scene
6 years ago
mindspore-ci-bot 84cdb9f4ed !2646 Fix grad value is wrong when register hook in pynative
6 years ago
mindspore-ci-bot 51dd49c176 !2628 move resnet series from example to model_zoo
6 years ago
limingqi107 7fca26c201 fix cast kernel build in pynative mode
6 years ago
mindspore-ci-bot e70a8c840a !2670 Change comment for akg compilation option
6 years ago
mindspore-ci-bot acbbe52984 !2685 GPU mul support int
6 years ago
mindspore-ci-bot f8b608cb1c !2686 update MindSpore version from r0.3 to r0.5
6 years ago
mindspore-ci-bot 48b4a10f39 !2679 fix issue [controlflow] if Cascad an if, raise error
6 years ago
mindspore-ci-bot 7ec0b35fac !2653 Mass text summarization update.
6 years ago
mindspore-ci-bot 2613d76a96 !2656 fix: tfrecord to mindrecord para check - sync
6 years ago
mindspore-ci-bot 778fdf6e49 !2672 auto paralle for sparse tensor gradient
6 years ago
yanghaoran 5b6a59e13e update version to 0.5
6 years ago
VectorSL 79ec5f7398 gpu mul support int
6 years ago
mindspore-ci-bot 25a34c0d13 !2666 fix python abort bug
6 years ago
mindspore-ci-bot 2108f72cd3 !2682 [quant]The top level add op prefix_name check error r0.5
6 years ago
gengdongjie 6f13315d90 move resnet series from example to model_zoo
6 years ago
Wei Luning f398495a88 fix bug in quant deploy export
6 years ago
mindspore-ci-bot abb7c40315 !2665 handle RecurseCompileGraph when one branch is Assign
6 years ago
mindspore-ci-bot 9b1b34d7ed !2647 Synchronize Ascend software suite 28 Jun 2020
6 years ago
mindspore-ci-bot b5d8134682 !2649 Add group params check method and fix print comment
6 years ago
mindspore-ci-bot 269b514b05 !2650 get monitor sampling interval from json file
6 years ago
mindspore-ci-bot 5a6988bc94 !2663 optimize is all nop node detect in mem reuse
6 years ago
mindspore-ci-bot f6148c7e39 !2657 [CT][MS][Auto-Parallel]Double recursion does not support the gatherv2 operator
6 years ago
lvliang 24c0a8256f fix-grad-value-is-wrong-in-pynative-hook
6 years ago
wenchunjiang b3da4d9b97 fix bug of labelswitch generate task failed when if cascad if
6 years ago
lirongzhen1 c3a9f1455e auto parallel for sparse gradients
6 years ago
tronzhang 6259db4d5d change comment for akg option
6 years ago
jonyguo b3346a98b9 fix: tfrecord to mindrecord parameter check
6 years ago
jinyaohui 79b2fe28d5 fix bug
6 years ago
mindspore-ci-bot 16a75779be !2645 fix vgg16 accuracy lower then 92
6 years ago
Margaret_wangrui 390efd1207 handle RecurseCompileGraph when one branch is Assign
6 years ago
laiyongqiang 4799131e18 optimize is all nop node detect in mem reuse
6 years ago
hongxing eed1881f2d fix GatherV2/GatherV2P
6 years ago
linqingke c4d8c8aec0 Mass text summarization fix bug.
6 years ago
yanghaitao f3423208c4 set monitor sampling interval through json file
6 years ago
yanghaoran f44a0fd3df synchronize lastest Ascend software suite 28 Jun 2020
6 years ago
guohongzilong 9409f83245 fix params KeyError in group params
6 years ago
caojian05 0a261aba50 fix accurancy lower then 92
6 years ago
mindspore-ci-bot 6ef1a731db !2637 add wide&deep standalone training script for gpu in model zoo
6 years ago
mindspore-ci-bot 40a0cd4a57 !2627 Modify long description format of whl package
6 years ago
lizhenyu 3231c4ab13 add wide&deep stanalone training script for gpu in model zoo
6 years ago
mindspore-ci-bot c74b16857a !2546 Fix some mistakes of ConfusionTransposeD vm ops
6 years ago
mindspore-ci-bot 375f2bfa61 !2515 Avoid extra recording of summary when end called
6 years ago
mindspore-ci-bot 231ef6bd74 !2539 1. fix infer value bug 2. tensor init support numpy number
6 years ago
mindspore-ci-bot 80ab1c0ab2 !2538 bugfix(transform): relax the exception of control depend on value node
6 years ago
mindspore-ci-bot f5580ce722 !2617 Decide whether to collect data by dataset sink mode and current step in SummaryCollector
6 years ago
zhoufeng 6f720a6cd8 Modify long description format of whl package
6 years ago
liuwenhao4 89654580dd Fix some mistakes of ConfusionTransposeD vm ops
6 years ago
mindspore-ci-bot 4f377f2ab4 !2596 Make assign-node to be before jump-node, ensure child graph can get its input
6 years ago
mindspore-ci-bot 5488268648 !2613 optimize fastrcnn training script
6 years ago
ougongchang 0934281adc Decide whether to collect data by dataset sink mode and current step in SummaryCollector.
6 years ago
mindspore-ci-bot 1cadea12f0 !2469 add pretrain for lstm & vgg16 and remove lstm/vgg16/googlenet from directory 'mindspore/model_zoo'
6 years ago
yanghaitao1 0c519882b8 optimize fastrcnn training process
6 years ago
mindspore-ci-bot 122a931090 !2577 [AutoPar] copy Master commits to r0.5
6 years ago
mindspore-ci-bot 6d5ea0ee4d !2581 Add ut case test_lamb to r0.5
6 years ago
hongxing 300dd2971c merge master code to r0.5
6 years ago
mindspore-ci-bot ed22908c99 !2597 GraphKernel support akg batchmatmul
6 years ago
dayschan cfe9c35659 GraphKernel support akg batchmatmul
6 years ago
zhoufeng 8628b898e1 Make assign-node to be before jump-node, ensure child graph can get its
6 years ago
mindspore-ci-bot 12a359b9de !2588 fix checkpoint evaliaction.
6 years ago
chenzomi bed6332688 fix checkpoint evaliaction.
6 years ago
Tron Zhang 43d8e6af1d add ut case test_lamb
6 years ago
mindspore-ci-bot 3c48de8262 !2573 fix print file bug
6 years ago
mindspore-ci-bot dd75ebfae3 !2575 dataset: repair bug in GetTensor that access NullPtr
6 years ago
mindspore-ci-bot d90e43a23c !2496 Enhance callback module and strongly check if callbacks is list or not
6 years ago
ms_yan 7d2fe8c279 change GetTensor into GetRow to avoid NullPtr
6 years ago
jinyaohui e893c70164 fix bug
6 years ago
mindspore-ci-bot 9ea10a0022 !2571 add ENABLE_GE
6 years ago
jinyaohui bb17bc4081 add ENABLE_GE
6 years ago
Xian Weizhao 01f228b0d5 relax the exception of control depend on value node
6 years ago
Li Hongzhang 2f33c76d7b warn when values duplicate and set mode to 'eval' to avoid extra recording
6 years ago
mindspore-ci-bot fe1d4ca3bd !2555 checkpoint add model_type
6 years ago
mindspore-ci-bot 2e76c9fb82 !2536 [Control sink]Fix bug of get call real outputs
6 years ago
chenzomi 3b632eac46 checkpoint add model_type
6 years ago
mindspore-ci-bot f1106a18aa !2486 Make sure record the first step data, and catch the ValueError when the loss is not a Scalar
6 years ago
mindspore-ci-bot 0857f43e0e !2516 !2482 fix a bug :serialization.export parameter "file_name" doesn't work
6 years ago
mindspore-ci-bot 9b65782e1b !2522 modify alexnet dataset.py
6 years ago
mindspore-ci-bot 572236bdd7 !2507 fix misspell and check parameters on graphdata
6 years ago
geekun e4b3b72ebf fix infer value bug
6 years ago
mindspore-ci-bot 3f8a7920d5 !2513 fix bug to remove reshape when reshape is depend's input
6 years ago
chenfei 7ede538d6a visit stop if tuple getitem and maketuple of function GetCallRealOutputs
6 years ago
mindspore-ci-bot 49ef6b89dd !2525 Fix some mistakes of TransData vm ops
6 years ago
heleiwang 5f61b83812 fix misspell and check parameters
6 years ago
mindspore-ci-bot 23771def82 !2523 Disable cuda9.2
6 years ago
ougongchang 54af354597 Make sure record the first step data in SummaryCollector, and catch the ValueError when the loss is not a Scalar.
6 years ago
zhoufeng 5520a23f7c Disable cuda9.2, use cuda10.1 as default
6 years ago
liuwenhao4 01789e1aa1 Fix some mistakes of TransData vm ops
6 years ago
wukesong 16544b37d6 modify
6 years ago
mindspore-ci-bot ab83bf18d9 !2509 Graph kernel use control sink
6 years ago
lilei 5d4099704f fix a bug:save .pb file
6 years ago
mindspore-ci-bot 43871e45dd !2493 Add an output to apply_proximal_adagrad op register
6 years ago
laiyongqiang 21770e7b6f fix bug to remove reshape when reshape is depend's input
6 years ago
zhoufeng dd22792344 Graph kernel use control sink
6 years ago
mindspore-ci-bot 166d886501 !2476 fix the summary operator is not work in constant folding scene
6 years ago
Li Hongzhang 9532e53337 enhance callback module and strongly check callbacks is list or not
6 years ago
mindspore-ci-bot f76096333e !2488 delete ENABLE_GE
6 years ago
mindspore-ci-bot 19ff5002ab !2467 check control mode of control depend
6 years ago
mindspore-ci-bot 73839591f5 !2481 add bert inference example in serving
6 years ago
yujianfeng 34407391e6 Add an output to apply_proximal_adagrad op register
6 years ago
jinyaohui 29a2458596 delete ENABLE_GE
6 years ago
caojian05 a88e6ea270 add pretrain for lstm & vgg16 and remove lstm/vgg16/googlenet from directory 'mindspore/model_zoo'
6 years ago
chenfei 0e6752fa6a check control mode of control depend
6 years ago
dinghao 4ddb00b996 add bert example
6 years ago
mindspore-ci-bot 35ab95bfae !2461 Add multiple process for computation of optimizer in cpu
6 years ago
ougongchang 20a164e9cf fix the summary operator is not work in constant folding scene
6 years ago
mindspore-ci-bot 1b52753fd7 !2462 optimize cpu reduce gradient
6 years ago
kswang e74b02f460 optimize cpu reduce sparse gradient
6 years ago
mindspore-ci-bot 56b6191db5 !2449 Fix bug of ascend control parser
6 years ago
mindspore-ci-bot eef762e58a !2456 Fix BackendCommonOptimization order
6 years ago
yujianfeng 794ed3a291 Add multiple process for computation of sparse optimizers
6 years ago
mindspore-ci-bot a420c667c9 !2453 Change the dataset attribute in SummaryCollector
6 years ago
ougongchang 3c08137904 Change the attribute to children, becuase the attribute has beed changed in dataset
6 years ago
mindspore-ci-bot e726680e38 !2444 fix mix target entry
6 years ago
chenfei 144aca43c3 fix bug of control parser
6 years ago
kswang ae3db6d4de fix mix target entry
6 years ago
zhoufeng d4de0c5af1 fix BackendCommonOptimization order
6 years ago

@ -29,7 +29,7 @@ enrichment of the AI software/hardware application ecosystem.
<img src="docs/MindSpore-architecture.png" alt="MindSpore Architecture" width="600"/>
For more details please check out our [Architecture Guide](https://www.mindspore.cn/docs/en/0.3.0-alpha/architecture.html).
For more details please check out our [Architecture Guide](https://www.mindspore.cn/docs/en/r0.5/architecture.html).
### Automatic Differentiation
@ -66,7 +66,6 @@ MindSpore offers build options across multiple backends:
| Ascend910 | Ubuntu-x86 | ✔️ |
| | EulerOS-x86 | ✔️ |
| | EulerOS-aarch64 | ✔️ |
| GPU CUDA 9.2 | Ubuntu-x86 | ✔️ |
| GPU CUDA 10.1 | Ubuntu-x86 | ✔️ |
| CPU | Ubuntu-x86 | ✔️ |
| | Windows-x86 | ✔️ |
@ -76,7 +75,7 @@ For installation using `pip`, take `CPU` and `Ubuntu-x86` build version as an ex
1. Download whl from [MindSpore download page](https://www.mindspore.cn/versions/en), and install the package.
```
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.3.0-alpha/MindSpore/cpu/ubuntu_x86/mindspore-0.3.0-cp37-cp37m-linux_x86_64.whl
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.2-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.2-cp37-cp37m-linux_x86_64.whl
```
2. Run the following command to verify the install.
@ -133,8 +132,8 @@ currently the containerized build options are supported as follows:
For `CPU` backend, you can directly pull and run the latest stable image using the below command:
```
docker pull mindspore/mindspore-cpu:0.3.0-alpha
docker run -it mindspore/mindspore-cpu:0.3.0-alpha /bin/bash
docker pull mindspore/mindspore-cpu:0.5.2-beta
docker run -it mindspore/mindspore-cpu:0.5.2-beta /bin/bash
```
* GPU
@ -151,8 +150,8 @@ currently the containerized build options are supported as follows:
Then you can pull and run the latest stable image using the below command:
```
docker pull mindspore/mindspore-gpu:0.3.0-alpha
docker run -it --runtime=nvidia --privileged=true mindspore/mindspore-gpu:0.3.0-alpha /bin/bash
docker pull mindspore/mindspore-gpu:0.5.2-beta
docker run -it --runtime=nvidia --privileged=true mindspore/mindspore-gpu:0.5.2-beta /bin/bash
```
To test if the docker image works, please execute the python code below and check the output:
@ -187,7 +186,7 @@ please check out [docker](docker/README.md) repo for the details.
## Quickstart
See the [Quick Start](https://www.mindspore.cn/tutorial/en/0.3.0-alpha/quick_start/quick_start.html)
See the [Quick Start](https://www.mindspore.cn/tutorial/en/r0.5/quick_start/quick_start.html)
to implement the image classification.
## Docs

File diff suppressed because one or more lines are too long

@ -3638,6 +3638,61 @@ Copyright (C) 2001-2005, International Business Machines Corporation and others.
Copyright (c) 1996-2016, International Business Machines Corporation
Copyright (C) 1997-2010, International Business Machines
Software: libtiff 4.1.0
Copyright notice:
Copyright © 2015 Open Microscopy Environment / University of Dundee
Copyright (c) 2004, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 1990-1997 Sam Leffler
Copyright (c) 1991-1997 Silicon Graphics, Inc.
Copyright (c) 1988-1997 Sam Leffler
Copyright (c) 1991-1997 Sam Leffler
Use and Copyright
Copyright (C) 1990, 1995 Frank D. Cringle.
Copyright (c) 1994-1997 Sam Leffler
Copyright (c) 1994-1997 Silicon Graphics, Inc.
Copyright (c) 1997 Greg Ward Larson
Copyright (c) 1997 Silicon Graphics, Inc.
Copyright (c) 2010, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) Joris Van Damme <info@awaresystems.be>
Copyright (c) AWare Systems <http://www.awaresystems.be/>
Copyright (c) 1996-1997 Sam Leffler
Copyright (c) 1996 Pixar
Copyright (c) 1995-1997 Sam Leffler
Copyright (c) 1995-1997 Silicon Graphics, Inc.
Copyright (c) 1988-1996 Sam Leffler
Copyright (c) 1991-1996 Silicon Graphics, Inc.
Copyright (c) 1992-1997 Sam Leffler
Copyright (c) 1992-1997 Silicon Graphics, Inc.
Copyright (c) 2018, Mapbox
Copyright (c) 2017, Planet Labs
Copyright (c) 1990 by Sun Microsystems, Inc.
Copyright 1990 by Digital Equipment Corporation, Maynard, Massachusetts.
Copyright 1991 by Digital Equipment Corporation, Maynard, Massachusetts.
Copyright (c) 2002, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2003 Ross Finlayson
Additions (c) Richard Nolde 2006-2010
Copyright (c) 2003, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2000, Frank Warmerdam
Copyright (c) 1987, 1993, 1994
Copyright (c) 1989, 1993
Copyright (c) 2009 Frank Warmerdam
Copyright (c) 1987, 1993
Copyright (c) 2005 The DragonFly Project. All rights reserved.
Copyright (c) 2003 Citrus Project,
All rights reserved.
Copyright (c) 1990, 1993
Copyright (c) 1996 Mike Johnson
Copyright (c) 1996 BancTec AB
Copyright (c) 2004, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2012, Frank Warmerdam <warmerdam@pobox.com>
Copyright (c) 2019, Even Rouault <even.rouault at spatialys.com>
Copyright (c) 2007, Frank Warmerdam <warmerdam@pobox.com>
Copyright (c) 2019, Thomas Bernard <miniupnp@free.fr>
Copyright (c) 2008, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 1999, Frank Warmerdam
Copyright (c) 1991-1996 Sam Leffler
Copyright (c) 1996 USAF Phillips Laboratory
Software: opencv 4.2.0
Copyright notice:
Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
@ -4095,3 +4150,11 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Software: bert eedf5716ce1268e56f0a50264a88cafad334ac61
MindSpore only used file bert/tokenization.py
Copyright notice:
Copyright 2018 The Google AI Language Team Authors.
Apache License, Version 2.0

2
akg

@ -1 +1 @@
Subproject commit c460176523d039c8995f1d71089753725ebc0792
Subproject commit 7c462a5d5acd073dfeff4a49b28e01af55c31c55

@ -50,9 +50,9 @@ usage()
echo " -D Enable dumping of function graph ir, default on"
echo " -z Compile dataset & mindrecord, default on"
echo " -M Enable MPI and NCCL for GPU training, gpu default on"
echo " -V Specify the minimum required cuda version, default CUDA 9.2"
echo " -V Specify the minimum required cuda version, default CUDA 10.1"
echo " -I Compile predict, default off"
echo " -K Compile with AKG, default off"
echo " -K Compile with AKG, default on"
echo " -s Enable serving module, default off"
}
@ -88,7 +88,7 @@ checkopts()
ENABLE_DUMP_IR="on"
COMPILE_MINDDATA="on"
ENABLE_MPI="off"
CUDA_VERSION="9.2"
CUDA_VERSION="10.1"
COMPILE_PREDICT="off"
USE_GLOG="on"
PREDICT_PLATFORM=""
@ -191,6 +191,10 @@ checkopts()
usage
exit 1
fi
if [[ "X$OPTARG" == "X9.2" ]]; then
echo "Unsupported CUDA version 9.2"
exit 1
fi
CUDA_VERSION="$OPTARG"
;;
P)
@ -248,7 +252,7 @@ checkopts()
done
}
checkopts "$@"
echo "---------------- mindspore: build start ----------------"
echo "---------------- mindSpore: build start ----------------"
mkdir -pv "${BUILD_PATH}/package/mindspore/lib"
git submodule update --init graphengine
if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" ]]; then
@ -446,9 +450,9 @@ build_predict()
cd "${BASEPATH}/predict/output/"
if [[ "$PREDICT_PLATFORM" == "x86_64" ]]; then
tar -cf MSPredict-0.3.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.5.2-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
elif [[ "$PREDICT_PLATFORM" == "arm64" ]]; then
tar -cf MSPredict-0.3.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.5.2-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
fi
echo "success to build predict project!"
}

@ -45,7 +45,11 @@ else()
set(ASCEND_PATH /usr/local/Ascend)
endif()
set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common)
set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64)
if (DEFINED ENV{ASCEND_CUSTOM_FWK_PATH})
set(ASCEND_RUNTIME_PATH $ENV{ASCEND_CUSTOM_FWK_PATH}/fwkacllib/lib64)
else ()
set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64)
endif ()
find_library(c_sec libc_sec.so ${ASCEND_DRIVER_PATH})
find_library(slog libslog.so ${ASCEND_DRIVER_PATH})
find_library(mmpa libmmpa.so ${ASCEND_DRIVER_PATH})

@ -8,12 +8,12 @@ else()
VER 67.1
LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
MD5 0c2662a2b0bc80b0eb56495205247c8f
CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-rpath --disable-tests --disable-samples --disable-icuio --disable-extras ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
MD5 fd525fb47d8827b0b7da78b51dd2d93f
CONFIGURE_COMMAND ${CMAKE_SOURCE_DIR}/scripts/build_icu4c.sh
)
include_directories(${icu4c_INC})
add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
add_definitions(-D ENABLE_ICU4C)
endif()
endif()

@ -0,0 +1,67 @@
# MindSpore CPU image: Ubuntu 18.04 + CPython 3.7.5 built from source
# + the MindSpore 0.5.0 CPU wheel.
FROM ubuntu:18.04

# MAINTAINER is deprecated; the same metadata is carried by a label.
LABEL maintainer="leonwanghui <leon.wanghui@huawei.com>"

# Set env
ENV PYTHON_ROOT_PATH=/usr/local/python-3.7.5
ENV PATH=/usr/local/bin:$PATH

# Install base tools
# Every apt layer refreshes the package index itself and removes it afterwards,
# so no layer depends on an index cached by an earlier (possibly stale) layer.
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
        vim \
        wget \
        curl \
        xz-utils \
        net-tools \
        openssh-client \
        git \
        ntpdate \
        tzdata \
        tcl \
        sudo \
        bash-completion \
    && rm -rf /var/lib/apt/lists/*

# Install compile tools
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
        gcc \
        g++ \
        zlibc \
        make \
        libgmp-dev \
        patch \
        autoconf \
        libtool \
        automake \
        flex \
    && rm -rf /var/lib/apt/lists/*

# Set bash
# Pre-answer the dash debconf question with "false" and reconfigure, so that
# /bin/sh stops pointing at dash.
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5)
# Builds CPython from source into PYTHON_ROOT_PATH, then exposes it as
# /usr/local/bin/python and /usr/local/bin/pip. Existing entries are removed
# first so that `ln -s` cannot fail on a name that is already taken.
RUN apt update \
    && apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
        libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
    && rm -rf /var/lib/apt/lists/* \
    && cd /tmp \
    && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
    && tar -xvf v3.7.5.tar.gz \
    && cd /tmp/cpython-3.7.5 \
    && mkdir -p ${PYTHON_ROOT_PATH} \
    && ./configure --prefix=${PYTHON_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -f /usr/local/bin/python \
    && rm -f /usr/local/bin/pip \
    && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
    && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
    && rm -rf /tmp/cpython-3.7.5 \
    && rm -f /tmp/v3.7.5.tar.gz

# Set pip source
# NOTE(review): the index is fetched over plain HTTP with the host marked as
# trusted; consider switching to an HTTPS index URL.
RUN mkdir -pv /root/.pip \
    && echo "[global]" > /root/.pip/pip.conf \
    && echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
    && echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install MindSpore cpu whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.0-cp37-cp37m-linux_x86_64.whl

@ -0,0 +1,67 @@
# MindSpore CPU image: Ubuntu 18.04 + CPython 3.7.5 built from source
# + the MindSpore 0.5.2 CPU wheel.
FROM ubuntu:18.04

# MAINTAINER is deprecated; the same metadata is carried by a label.
LABEL maintainer="leonwanghui <leon.wanghui@huawei.com>"

# Set env
ENV PYTHON_ROOT_PATH=/usr/local/python-3.7.5
ENV PATH=/usr/local/bin:$PATH

# Install base tools
# Every apt layer refreshes the package index itself and removes it afterwards,
# so no layer depends on an index cached by an earlier (possibly stale) layer.
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
        vim \
        wget \
        curl \
        xz-utils \
        net-tools \
        openssh-client \
        git \
        ntpdate \
        tzdata \
        tcl \
        sudo \
        bash-completion \
    && rm -rf /var/lib/apt/lists/*

# Install compile tools
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
        gcc \
        g++ \
        zlibc \
        make \
        libgmp-dev \
        patch \
        autoconf \
        libtool \
        automake \
        flex \
    && rm -rf /var/lib/apt/lists/*

# Set bash
# Pre-answer the dash debconf question with "false" and reconfigure, so that
# /bin/sh stops pointing at dash.
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5)
# Builds CPython from source into PYTHON_ROOT_PATH, then exposes it as
# /usr/local/bin/python and /usr/local/bin/pip. Existing entries are removed
# first so that `ln -s` cannot fail on a name that is already taken.
RUN apt update \
    && apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
        libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
    && rm -rf /var/lib/apt/lists/* \
    && cd /tmp \
    && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
    && tar -xvf v3.7.5.tar.gz \
    && cd /tmp/cpython-3.7.5 \
    && mkdir -p ${PYTHON_ROOT_PATH} \
    && ./configure --prefix=${PYTHON_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -f /usr/local/bin/python \
    && rm -f /usr/local/bin/pip \
    && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
    && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
    && rm -rf /tmp/cpython-3.7.5 \
    && rm -f /tmp/v3.7.5.tar.gz

# Set pip source
# NOTE(review): the index is fetched over plain HTTP with the host marked as
# trusted; consider switching to an HTTPS index URL.
RUN mkdir -pv /root/.pip \
    && echo "[global]" > /root/.pip/pip.conf \
    && echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
    && echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install MindSpore cpu whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.2-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.2-cp37-cp37m-linux_x86_64.whl

@ -0,0 +1,83 @@
# MindSpore GPU image: CUDA 10.1 / cuDNN 7 runtime on Ubuntu 18.04
# + CPython 3.7.5 and OpenMPI 3.1.5 built from source
# + the MindSpore 0.5.0 CUDA 10.1 wheel.
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04

# MAINTAINER is deprecated; the same metadata is carried by a label.
LABEL maintainer="leonwanghui <leon.wanghui@huawei.com>"

# Set env
ENV PYTHON_ROOT_PATH=/usr/local/python-3.7.5
ENV OMPI_ROOT_PATH=/usr/local/openmpi-3.1.5
ENV PATH=${OMPI_ROOT_PATH}/bin:/usr/local/bin:$PATH
ENV LD_LIBRARY_PATH=${OMPI_ROOT_PATH}/lib:$LD_LIBRARY_PATH

# Install base tools
# Every apt layer refreshes the package index itself and removes it afterwards,
# so no layer depends on an index cached by an earlier (possibly stale) layer.
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
        vim \
        wget \
        curl \
        xz-utils \
        net-tools \
        openssh-client \
        git \
        ntpdate \
        tzdata \
        tcl \
        sudo \
        bash-completion \
    && rm -rf /var/lib/apt/lists/*

# Install compile tools
# NCCL runtime and headers are pinned to the 2.4.8 build for CUDA 10.1.
RUN apt update \
    && DEBIAN_FRONTEND=noninteractive apt install -y \
        gcc \
        g++ \
        zlibc \
        make \
        libgmp-dev \
        patch \
        autoconf \
        libtool \
        automake \
        flex \
        libnccl2=2.4.8-1+cuda10.1 \
        libnccl-dev=2.4.8-1+cuda10.1 \
    && rm -rf /var/lib/apt/lists/*

# Set bash
# Pre-answer the dash debconf question with "false" and reconfigure, so that
# /bin/sh stops pointing at dash.
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash

# Install python (v3.7.5)
# Builds CPython from source into PYTHON_ROOT_PATH, then exposes it as
# /usr/local/bin/python and /usr/local/bin/pip. Existing entries are removed
# first so that `ln -s` cannot fail on a name that is already taken.
RUN apt update \
    && apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
        libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
    && rm -rf /var/lib/apt/lists/* \
    && cd /tmp \
    && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
    && tar -xvf v3.7.5.tar.gz \
    && cd /tmp/cpython-3.7.5 \
    && mkdir -p ${PYTHON_ROOT_PATH} \
    && ./configure --prefix=${PYTHON_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -f /usr/local/bin/python \
    && rm -f /usr/local/bin/pip \
    && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
    && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
    && rm -rf /tmp/cpython-3.7.5 \
    && rm -f /tmp/v3.7.5.tar.gz

# Set pip source
# NOTE(review): the index is fetched over plain HTTP with the host marked as
# trusted; consider switching to an HTTPS index URL.
RUN mkdir -pv /root/.pip \
    && echo "[global]" > /root/.pip/pip.conf \
    && echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
    && echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf

# Install openmpi (v3.1.5)
# Built from source into OMPI_ROOT_PATH, which PATH and LD_LIBRARY_PATH above
# already point at; the build tree and tarball are removed afterwards.
RUN cd /tmp \
    && wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.5.tar.gz \
    && tar -xvf openmpi-3.1.5.tar.gz \
    && cd /tmp/openmpi-3.1.5 \
    && mkdir -p ${OMPI_ROOT_PATH} \
    && ./configure --prefix=${OMPI_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -rf /tmp/openmpi-3.1.5 \
    && rm -f /tmp/openmpi-3.1.5.tar.gz

# Install MindSpore cuda-10.1 whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/gpu/ubuntu_x86/cuda-10.1/mindspore_gpu-0.5.0-cp37-cp37m-linux_x86_64.whl

@ -0,0 +1,83 @@
# MindSpore 0.5.2 GPU (CUDA 10.1) runtime image.
# Builds Python 3.7.5 and OpenMPI 3.1.5 from source, then installs the MindSpore GPU wheel.
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
# MAINTAINER is deprecated; the maintainer is recorded as an image label instead.
LABEL maintainer="leonwanghui <leon.wanghui@huawei.com>"
# Set env
ENV PYTHON_ROOT_PATH=/usr/local/python-3.7.5
ENV OMPI_ROOT_PATH=/usr/local/openmpi-3.1.5
ENV PATH=${OMPI_ROOT_PATH}/bin:/usr/local/bin:$PATH
ENV LD_LIBRARY_PATH=${OMPI_ROOT_PATH}/lib:$LD_LIBRARY_PATH
# Install base tools
# Every install step runs `apt-get update` in the same layer, so a cached
# "update" layer can never be combined with a newer package list request.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    vim \
    wget \
    curl \
    xz-utils \
    net-tools \
    openssh-client \
    git \
    ntpdate \
    tzdata \
    tcl \
    sudo \
    bash-completion
# Install compile tools
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    gcc \
    g++ \
    zlibc \
    make \
    libgmp-dev \
    patch \
    autoconf \
    libtool \
    automake \
    flex \
    libnccl2=2.4.8-1+cuda10.1 \
    libnccl-dev=2.4.8-1+cuda10.1
# Set bash
# Answer "no" to "use dash as /bin/sh" so that /bin/sh points to bash.
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash
# Install python (v3.7.5)
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
    libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
    && cd /tmp \
    && wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
    && tar -xvf v3.7.5.tar.gz \
    && cd /tmp/cpython-3.7.5 \
    && mkdir -p ${PYTHON_ROOT_PATH} \
    && ./configure --prefix=${PYTHON_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -f /usr/local/bin/python \
    && rm -f /usr/local/bin/pip \
    && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
    && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
    && rm -rf /tmp/cpython-3.7.5 \
    && rm -f /tmp/v3.7.5.tar.gz
# Set pip source
# NOTE(review): the index is fetched over plain HTTP with the host marked as trusted;
# consider switching to an HTTPS index URL if the mirror offers one.
RUN mkdir -pv /root/.pip \
    && echo "[global]" > /root/.pip/pip.conf \
    && echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
    && echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf
# Install openmpi (v3.1.5)
RUN cd /tmp \
    && wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.5.tar.gz \
    && tar -xvf openmpi-3.1.5.tar.gz \
    && cd /tmp/openmpi-3.1.5 \
    && mkdir -p ${OMPI_ROOT_PATH} \
    && ./configure --prefix=${OMPI_ROOT_PATH} \
    && make -j4 \
    && make install -j4 \
    && rm -rf /tmp/openmpi-3.1.5 \
    && rm -f /tmp/openmpi-3.1.5.tar.gz
# Install MindSpore cuda-10.1 whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.2-beta/MindSpore/gpu/ubuntu_x86/cuda-10.1/mindspore_gpu-0.5.2-cp37-cp37m-linux_x86_64.whl

@ -1,82 +0,0 @@
# Guideline to Convert Training Data CLUERNER2020 to MindRecord For Bert Fine Tuning
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [How to use the example to process CLUERNER2020](#how-to-use-the-example-to-process-cluerner2020)
- [Download CLUERNER2020 and unzip](#download-cluerner2020-and-unzip)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example converts the [CLUERNER2020](https://www.cluebenchmarks.com/introduce.html) training data into MindRecord files, which are then used for BERT fine-tuning.
1. run.sh: entry script that generates the MindRecord files.
2. run_read.sh: entry script that creates a MindDataset from the MindRecord files.
- create_dataset.py: uses MindDataset to read the MindRecord files and generate the dataset.
## How to use the example to process CLUERNER2020
Download CLUERNER2020, convert it to MindRecord, use MindDataset to read MindRecord.
### Download CLUERNER2020 and unzip
1. Download the training data zip.
> [CLUERNER2020 dataset download address](https://www.cluebenchmarks.com/introduce.html) **-> 任务介绍 -> CLUENER 细粒度命名实体识别 -> cluener下载链接**
2. Unzip the training data to the directory example/nlp_to_mindrecord/CLUERNER2020/data/cluener_public.
```
unzip -d {your-mindspore}/example/nlp_to_mindrecord/CLUERNER2020/data/cluener_public cluener_public.zip
```
### Generate MindRecord
1. Run the run.sh script.
```bash
bash run.sh
```
2. Output like this:
```
...
[INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:12.498.235 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/train.mindrecord'], and the list of index files are: ['data/train.mindrecord.db']
...
[INFO] ME(17603,python):2020-04-28-16:56:13.400.175 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.400.863 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.401.534 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.402.179 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.402.702 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
...
[INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:13.431.208 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/dev.mindrecord'], and the list of index files are: ['data/dev.mindrecord.db']
```
3. Generate files like this:
```bash
$ ls output/
dev.mindrecord dev.mindrecord.db README.md train.mindrecord train.mindrecord.db
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh
```
2. Output like this:
```
...
example 1340: input_ids: [ 101 3173 1290 4852 7676 3949 122 3299 123 126 3189 4510 8020 6381 5442 7357 2590 3636 8021 7676 3949 4294 1166 6121 3124 1277 6121 3124 7270 2135 3295 5789 3326 123 126 3189 1355 6134 1093 1325 3173 2399 6590 6791 8024 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: label_ids: [ 0 18 19 20 2 4 0 0 0 0 0 0 0 34 36 26 27 28 0 34 35 35 35 35 35 35 35 35 35 36 26 27 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: input_ids: [ 101 1728 711 4293 3868 1168 2190 2150 3791 934 3633 3428 4638 6237 7025 8024 3297 1400 5310 3362 6206 5023 5401 1744 3297 7770 3791 7368 976 1139 1104 2137 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: label_ids: [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 19 19 19 19 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
...
```

@ -1,36 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import mindspore.dataset as ds
def create_dataset(data_file):
    """Open a MindRecord file with MindDataset and print four columns of every row.

    Args:
        data_file: forwarded unchanged as ``dataset_file`` to ``ds.MindDataset``.

    The dataset is read with 4 parallel workers and shuffling enabled. A progress
    line is printed after every 1000 rows and the total row count at the end.
    """
    # (column name, output template) pairs, in the order they are printed.
    line_templates = (
        ('input_ids', "example {}: input_ids: {}"),
        ('input_mask', "example {}: input_mask: {}"),
        ('segment_ids', "example {}: segment_ids: {}"),
        ('label_ids', "example {}: label_ids: {}"),
    )
    reader = ds.MindDataset(dataset_file=data_file, num_parallel_workers=4, shuffle=True)
    rows_read = 0
    for row in reader.create_dict_iterator():
        for column, template in line_templates:
            print(template.format(rows_read, row[column]))
        rows_read += 1
        if not rows_read % 1000:
            print("read rows: {}".format(rows_read))
    print("total rows: {}".format(rows_read))
if __name__ == '__main__':
    # Read back both MindRecord files under output/, train first and dev second.
    for record_file in ('output/train.mindrecord', 'output/dev.mindrecord'):
        create_dataset(record_file)

@ -1,40 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Remove the MindRecord files (and their index files) left by a previous run.
rm -f output/train.mindrecord* output/dev.mindrecord*

# The third-party source directory and the patch file must both exist before patching.
[ -d "../../../third_party/to_mindrecord/CLUERNER2020" ] || {
    echo "The patch base dir ../../../third_party/to_mindrecord/CLUERNER2020 is not exist."
    exit 1
}
[ -f "../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch" ] || {
    echo "The patch file ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch is not exist."
    exit 1
}

# patch for data_processor_seq.py
# -o writes the patched result to a separate file instead of modifying the input in place.
if ! patch -p0 -d ../../../third_party/to_mindrecord/CLUERNER2020/ -o data_processor_seq_patched.py < ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch; then
    echo "Patch ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq.py failed"
    exit 1
fi

# use patched script
python ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq_patched.py \
    --vocab_file=../../../third_party/to_mindrecord/CLUERNER2020/vocab.txt \
    --label2id_file=../../../third_party/to_mindrecord/CLUERNER2020/label2id.json

@ -1,173 +0,0 @@
# Guideline to Convert Training Data enwiki to MindRecord For Bert Pre Training
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [How to use the example to process enwiki](#how-to-use-the-example-to-process-enwiki)
- [Download enwiki training data](#download-enwiki-training-data)
- [Process the enwiki](#process-the-enwiki)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example converts the [enwiki](https://dumps.wikimedia.org/enwiki) training data into MindRecord files, which are then used for BERT network training.
1. run.sh: entry script that generates the MindRecord files.
2. run_read.sh: entry script that creates a MindDataset from the MindRecord files.
- create_dataset.py: uses MindDataset to read the MindRecord files and generate the dataset.
## How to use the example to process enwiki
Download enwiki data, process it, convert it to MindRecord, use MindDataset to read MindRecord.
### Download enwiki training data
> [enwiki dataset download address](https://dumps.wikimedia.org/enwiki) **-> 20200501 -> enwiki-20200501-pages-articles-multistream.xml.bz2**
### Process the enwiki
1. Please follow the steps in [process enwiki](https://github.com/mlperf/training/tree/master/language_model/tensorflow/bert)
- All rights to the content used in this step belong to the linked website.
### Generate MindRecord
1. Run the run.sh script.
```
bash run.sh input_dir output_dir vocab_file
```
- input_dir: the directory that contains files such as 'part-00251-of-00500'.
- output_dir: the directory in which the output MindRecord files will be stored.
- vocab_file: the vocabulary file, which you can download from another open-source project.
2. The output like this:
```
...
Begin preprocess Wed Jun 10 09:21:23 CST 2020
Begin preprocess input file: /mnt/data/results/part-00000-of-00500
Begin output file: part-00000-of-00500.mindrecord
Total task: 510, processing: 1
Begin preprocess input file: /mnt/data/results/part-00001-of-00500
Begin output file: part-00001-of-00500.mindrecord
Total task: 510, processing: 2
Begin preprocess input file: /mnt/data/results/part-00002-of-00500
Begin output file: part-00002-of-00500.mindrecord
Total task: 510, processing: 3
Begin preprocess input file: /mnt/data/results/part-00003-of-00500
Begin output file: part-00003-of-00500.mindrecord
Total task: 510, processing: 4
Begin preprocess input file: /mnt/data/results/part-00004-of-00500
Begin output file: part-00004-of-00500.mindrecord
Total task: 510, processing: 4
...
```
3. Generate files like this:
```bash
$ ls {your_output_dir}/
part-00000-of-00500.mindrecord part-00000-of-00500.mindrecord.db part-00001-of-00500.mindrecord part-00001-of-00500.mindrecord.db part-00002-of-00500.mindrecord part-00002-of-00500.mindrecord.db ...
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh input_dir
```
- input_dir: the directory which contains mindrecord files.
2. The output like this:
```
...
example 633: input_ids: [ 101 2043 19781 4305 2140 4520 2041 1010 103 2034 2455 2002
7879 2003 1996 2455 1997 103 26378 4160 1012 102 7291 2001
1996 103 1011 2343 1997 6327 1010 3423 1998 103 4262 2005
1996 2118 1997 2329 3996 103 102 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0]
example 633: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 633: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 633: masked_lm_positions: [ 8 17 20 25 33 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
example 633: masked_lm_ids: [ 1996 16137 1012 3580 2451 1012 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
example 633: masked_lm_weights: [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0.]
example 633: next_sentence_labels: [1]
...
```

@ -1,43 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import argparse
import mindspore.dataset as ds
def create_dataset(data_file):
    """Open MindRecord data with MindDataset and print seven columns of every row.

    Args:
        data_file: forwarded unchanged as ``dataset_file`` to ``ds.MindDataset``
            (the command-line entry point passes the list parsed from ``-i``).

    The dataset is read with 4 parallel workers and shuffling enabled. A progress
    line is printed after every 1000 rows and the total row count at the end.
    """
    # (column name, output template) pairs, in the order they are printed.
    line_templates = (
        ('input_ids', "example {}: input_ids: {}"),
        ('input_mask', "example {}: input_mask: {}"),
        ('segment_ids', "example {}: segment_ids: {}"),
        ('masked_lm_positions', "example {}: masked_lm_positions: {}"),
        ('masked_lm_ids', "example {}: masked_lm_ids: {}"),
        ('masked_lm_weights', "example {}: masked_lm_weights: {}"),
        ('next_sentence_labels', "example {}: next_sentence_labels: {}"),
    )
    reader = ds.MindDataset(dataset_file=data_file, num_parallel_workers=4, shuffle=True)
    rows_read = 0
    for row in reader.create_dict_iterator():
        for column, template in line_templates:
            print(template.format(rows_read, row[column]))
        rows_read += 1
        if not rows_read % 1000:
            print("read rows: {}".format(rows_read))
    print("total rows: {}".format(rows_read))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # nargs='+' makes args.input_file a list; the whole list is handed to create_dataset.
    # required=True turns a missing -i into a usage error instead of passing None on.
    parser.add_argument("-i", "--input_file", nargs='+', type=str, required=True,
                        help='Input mindrecord file')
    args = parser.parse_args()
    create_dataset(args.input_file)

@ -1,133 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Usage: run.sh input_dir output_dir vocab_file
if [ $# -ne 3 ]; then
    echo "Usage: $0 input_dir output_dir vocab_file"
    exit 1
fi
# Positional parameters are quoted everywhere: unquoted, an empty argument turns
# `[ ! -d $1 ]` into `[ ! -d ]`, which is false, so the check was silently skipped.
if [ ! -d "$1" ]; then
    echo "The input dir: $1 is not exist."
    exit 1
fi
if [ ! -d "$2" ]; then
    echo "The output dir: $2 is not exist."
    exit 1
fi
if [ ! -f "$3" ]; then
    echo "The vocab file: $3 is not exist."
    exit 1
fi
# Remove stale outputs only after every argument has been validated, so an invalid
# invocation never deletes existing results. `--` guards against a dir starting with '-'.
rm -fr -- "$2"/*.mindrecord*
data_dir=$1
output_dir=$2
vocab_file=$3
# Filled by getdir: file_list[i] is an input file, output_filename[i] its output name.
file_list=()
output_filename=()
file_index=0
# Recursively collect every non-directory entry below directory $1.
#
# For each file found, the global arrays are extended at index file_index:
#   file_list[i]        path of the input file
#   output_filename[i]  "<basename>.mindrecord", with any spaces removed
# and the global counter file_index is incremented.
function getdir() {
    local dir_or_file base_name
    # Glob instead of parsing `ls` output, so names are never word-split.
    for dir_or_file in "$1"/*; do
        # An unmatched glob stays literal; skip it so an empty directory adds nothing.
        [ -e "$dir_or_file" ] || [ -L "$dir_or_file" ] || continue
        if [ -d "$dir_or_file" ]; then
            getdir "$dir_or_file"
        else
            file_list[$file_index]=$dir_or_file
            # Last path component, obtained without a temporary file.
            base_name=${dir_or_file##*/}
            output_filename[$file_index]="${base_name// /}.mindrecord"
            file_index=$((file_index + 1))
        fi
    done
}
# Collect the input files and their output names into file_list / output_filename.
getdir "${data_dir}"
# echo "The input files: "${file_list[@]}
# echo "The output files: "${output_filename[@]}
patch_base_dir="../../../third_party/to_mindrecord/zhwiki"
patch_file="../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch"
if [ ! -d "${patch_base_dir}" ]; then
    echo "The patch base dir ${patch_base_dir} is not exist."
    exit 1
fi
if [ ! -f "${patch_file}" ]; then
    echo "The patch file ${patch_file} is not exist."
    exit 1
fi
# patch for create_pretraining_data.py
# -o writes the patched result to a separate file instead of modifying the input in place.
if ! patch -p0 -d "${patch_base_dir}/" -o create_pretraining_data_patched.py < "${patch_file}"; then
    echo "Patch ${patch_base_dir}/create_pretraining_data.py failed"
    exit 1
fi
# get the cpu core count
num_cpu_core=$(grep -c "processor" /proc/cpuinfo)
# Use about two thirds of the cores, but never fewer than one worker.
max_parallel_jobs=$((num_cpu_core / 3 * 2))
if [ "${max_parallel_jobs}" -lt 1 ]; then
    max_parallel_jobs=1
fi
echo "Begin preprocess $(date)"
# using patched script to generate mindrecord
# pids[i] is the worker started for file_list[i]; it is used to collect exit codes below.
pids=()
for index in "${!file_list[@]}"; do
    echo "Begin preprocess input file: ${file_list[$index]}"
    echo "Begin output file: ${output_filename[$index]}"
    python "${patch_base_dir}/create_pretraining_data_patched.py" \
        --input_file="${file_list[$index]}" \
        --output_file="${output_dir}/${output_filename[$index]}" \
        --partition_number=1 \
        --vocab_file="${vocab_file}" \
        --do_lower_case=True \
        --max_seq_length=512 \
        --max_predictions_per_seq=76 \
        --masked_lm_prob=0.15 \
        --random_seed=12345 \
        --dupe_factor=10 >"/tmp/${output_filename[$index]}.log" 2>&1 &
    pids[$index]=$!
    # Count only this shell's own background jobs; grepping `ps -ef` would also
    # count unrelated processes whose command line happens to match.
    process_count=$(jobs -rp | wc -l)
    echo "Total task: ${#file_list[*]}, processing: ${process_count}"
    # Throttle: do not start another worker until a slot is free.
    while [ "$(jobs -rp | wc -l)" -ge "${max_parallel_jobs}" ]; do
        sleep 2
    done
done
# Report progress until every worker has finished.
while process_num=$(jobs -rp | wc -l); [ "${process_num}" -gt 0 ]; do
    echo "There are still ${process_num} preprocess running ..."
    sleep 2
done
# Collect each worker's exit status so that failures are not reported as success.
failed_count=0
for index in "${!pids[@]}"; do
    if ! wait "${pids[$index]}"; then
        echo "Preprocess input file: ${file_list[$index]} failed, see /tmp/${output_filename[$index]}.log"
        failed_count=$((failed_count + 1))
    fi
done
if [ "${failed_count}" -ne 0 ]; then
    echo "Preprocess failed for ${failed_count} of ${#file_list[*]} files."
    echo "End preprocess $(date)"
    exit 1
fi
echo "Preprocess all the data success."
echo "End preprocess $(date)"

@ -1,113 +0,0 @@
# Guideline to Convert Training Data zhwiki to MindRecord For Bert Pre Training
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [Run simple test](#run-simple-test)
- [How to use the example to process zhwiki](#how-to-use-the-example-to-process-zhwiki)
- [Download zhwiki training data](#download-zhwiki-training-data)
- [Extract the zhwiki](#extract-the-zhwiki)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example converts the [zhwiki](https://dumps.wikimedia.org/zhwiki) training data into MindRecord files, which are then used for BERT network training.
1. run.sh: entry script that generates the MindRecord files.
2. run_read.sh: entry script that creates a MindDataset from the MindRecord files.
- create_dataset.py: uses MindDataset to read the MindRecord files and generate the dataset.
## Run simple test
Follow these steps:
```bash
bash run_simple.sh # generate output/simple.mindrecord* by ../../../third_party/to_mindrecord/zhwiki/sample_text.txt
bash run_read_simple.sh # use MindDataset to read output/simple.mindrecord*
```
## How to use the example to process zhwiki
Download zhwiki data, extract it, convert it to MindRecord, use MindDataset to read MindRecord.
### Download zhwiki training data
> [zhwiki dataset download address](https://dumps.wikimedia.org/zhwiki) **-> 20200401 -> zhwiki-20200401-pages-articles-multistream.xml.bz2**
- put the zhwiki-20200401-pages-articles-multistream.xml.bz2 in {your-mindspore}/example/nlp_to_mindrecord/zhwiki/data directory.
### Extract the zhwiki
1. Download [wikiextractor](https://github.com/attardi/wikiextractor) script to {your-mindspore}/example/nlp_to_mindrecord/zhwiki/data directory.
```
$ ls data/
README.md wikiextractor zhwiki-20200401-pages-articles-multistream.xml.bz2
```
2. Extract the zhwiki.
```bash
python data/wikiextractor/WikiExtractor.py data/zhwiki-20200401-pages-articles-multistream.xml.bz2 --processes 4 --templates data/template --bytes 8M --min_text_length 0 --filter_disambig_pages --output data/extract
```
3. Generate like this:
```
$ ls data/extract
AA AB
```
### Generate MindRecord
1. Run the run.sh script.
```
bash run.sh
```
> Caution: This process may be slow, so please be patient. If your machine does not have enough memory and CPU, it is recommended that you modify the script to generate the MindRecord files step by step.
2. The output like this:
```
patching file create_pretraining_data_patched.py (read from create_pretraining_data.py)
Begin preprocess input file: ./data/extract/AA/wiki_00
Begin output file: AAwiki_00.mindrecord
Total task: 5, processing: 1
Begin preprocess input file: ./data/extract/AA/wiki_01
Begin output file: AAwiki_01.mindrecord
Total task: 5, processing: 2
Begin preprocess input file: ./data/extract/AA/wiki_02
Begin output file: AAwiki_02.mindrecord
Total task: 5, processing: 3
Begin preprocess input file: ./data/extract/AB/wiki_02
Begin output file: ABwiki_02.mindrecord
Total task: 5, processing: 4
...
```
3. Generate files like this:
```bash
$ ls output/
AAwiki_00.mindrecord AAwiki_00.mindrecord.db AAwiki_01.mindrecord AAwiki_01.mindrecord.db AAwiki_02.mindrecord AAwiki_02.mindrecord.db ... ABwiki_00.mindrecord ABwiki_00.mindrecord.db ...
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh
```
2. The output like this:
```
...
example 74: input_ids: [ 101 8168 118 12847 8783 9977 15908 117 8256 9245 11643 8168 8847 8588 11575 8154 8228 143 8384 8376 9197 10241 103 10564 11421 8199 12268 112 161 8228 11541 9586 8436 8174 8363 9864 9702 103 103 119 103 9947 10564 103 8436 8806 11479 103 8912 119 103 103 103 12209 8303 103 8757 8824 117 8256 103 8619 8168 11541 102 11684 8196 103 8228 8847 11523 117 9059 9064 12410 8358 8181 10764 117 11167 11706 9920 148 8332 11390 8936 8205 10951 11997 103 8154 117 103 8670 10467 112 161 10951 13139 12413 117 10288 143 10425 8205 152 10795 8472 8196 103 161 12126 9172 13129 12106 8217 8174 12244 8205 143 103 8461 8277 10628 160 8221 119 102]
example 74: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
example 74: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
example 74: masked_lm_positions: [ 6 22 37 38 40 43 47 50 51 52 55 60 67 76 89 92 98 109 120 0]
example 74: masked_lm_ids: [ 8118 8165 8329 8890 8554 8458 119 8850 8565 10392 8174 11467 10291 8181 8549 12718 13139 112 158 0]
example 74: masked_lm_weights: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
example 74: next_sentence_labels: [0]
...
```

@ -1,43 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import argparse
import mindspore.dataset as ds
def create_dataset(data_file):
    """Open MindRecord data with MindDataset and print seven columns of every row.

    Args:
        data_file: forwarded unchanged as ``dataset_file`` to ``ds.MindDataset``
            (the command-line entry point passes the list parsed from ``-i``).

    The dataset is read with 4 parallel workers and shuffling enabled. A progress
    line is printed after every 1000 rows and the total row count at the end.
    """
    # Column names in print order; each maps to its exact output template.
    columns = (
        'input_ids',
        'input_mask',
        'segment_ids',
        'masked_lm_positions',
        'masked_lm_ids',
        'masked_lm_weights',
        'next_sentence_labels',
    )
    templates = {
        'input_ids': "example {}: input_ids: {}",
        'input_mask': "example {}: input_mask: {}",
        'segment_ids': "example {}: segment_ids: {}",
        'masked_lm_positions': "example {}: masked_lm_positions: {}",
        'masked_lm_ids': "example {}: masked_lm_ids: {}",
        'masked_lm_weights': "example {}: masked_lm_weights: {}",
        'next_sentence_labels': "example {}: next_sentence_labels: {}",
    }
    worker_count = 4
    records = ds.MindDataset(dataset_file=data_file, num_parallel_workers=worker_count, shuffle=True)
    seen = 0
    for sample in records.create_dict_iterator():
        for name in columns:
            print(templates[name].format(seen, sample[name]))
        seen += 1
        if seen % 1000 == 0:
            print("read rows: {}".format(seen))
    print("total rows: {}".format(seen))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # nargs='+' makes args.input_file a list; the whole list is handed to create_dataset.
    # required=True turns a missing -i into a usage error instead of passing None on.
    parser.add_argument("-i", "--input_file", nargs='+', type=str, required=True,
                        help='Input mindrecord file')
    args = parser.parse_args()
    create_dataset(args.input_file)

@ -1,3 +0,0 @@
wikiextractor/
zhwiki-20200401-pages-articles-multistream.xml.bz2
extract/

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save