From a82e99d0c1cd27bfc2ee1e3bb2065be847996e6b Mon Sep 17 00:00:00 2001 From: linqingke Date: Wed, 28 Oct 2020 09:32:17 +0800 Subject: [PATCH] remove generate_hccn_file.py --- model_zoo/official/cv/cnnctc/README.md | 1 - .../scripts/run_distribute_train_ascend.sh | 1 - .../cv/cnnctc/src/generate_hccn_file.py | 88 ------------------- model_zoo/official/cv/psenet/README.md | 23 +++-- .../cv/psenet/scripts/run_distribute_train.sh | 20 +++-- .../cv/psenet/src/generate_hccn_file.py | 85 ------------------ 6 files changed, 23 insertions(+), 195 deletions(-) delete mode 100644 model_zoo/official/cv/cnnctc/src/generate_hccn_file.py delete mode 100644 model_zoo/official/cv/psenet/src/generate_hccn_file.py diff --git a/model_zoo/official/cv/cnnctc/README.md b/model_zoo/official/cv/cnnctc/README.md index 0ec86bd8c5..9ce50b3bab 100644 --- a/model_zoo/official/cv/cnnctc/README.md +++ b/model_zoo/official/cv/cnnctc/README.md @@ -148,7 +148,6 @@ The entire code structure is as following: |---callback.py // loss callback file |---dataset.py // process dataset |---util.py // routine operation - |---generate_hccn_file.py // generate distribute json file |---preprocess_dataset.py // preprocess dataset ``` diff --git a/model_zoo/official/cv/cnnctc/scripts/run_distribute_train_ascend.sh b/model_zoo/official/cv/cnnctc/scripts/run_distribute_train_ascend.sh index 4d9b072be4..5688c449aa 100644 --- a/model_zoo/official/cv/cnnctc/scripts/run_distribute_train_ascend.sh +++ b/model_zoo/official/cv/cnnctc/scripts/run_distribute_train_ascend.sh @@ -31,7 +31,6 @@ echo $PATH1 PATH2=$(get_real_path $2) echo $PATH2 -python ${current_exec_path}/src/generate_hccn_file.py --rank_file=$PATH1 export RANK_TABLE_FILE=$PATH1 export RANK_SIZE=8 ulimit -u unlimited diff --git a/model_zoo/official/cv/cnnctc/src/generate_hccn_file.py b/model_zoo/official/cv/cnnctc/src/generate_hccn_file.py deleted file mode 100644 index 6c0dfef14a..0000000000 --- a/model_zoo/official/cv/cnnctc/src/generate_hccn_file.py +++ 
/dev/null @@ -1,88 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""generate ascend rank file""" - -import os -import socket -import argparse - -parser = argparse.ArgumentParser(description="ascend distribute rank.") -parser.add_argument("--rank_file", type=str, default="scripts/rank_table_8p.json", help="rank_tabel_file_path.") - -def main(rank_table_file): - nproc_per_node = 8 - - visible_devices = ['0', '1', '2', '3', '4', '5', '6', '7'] - - server_id = socket.gethostbyname(socket.gethostname()) - - hccn_configs = open('/etc/hccn.conf', 'r').readlines() - device_ips = {} - for hccn_item in hccn_configs: - hccn_item = hccn_item.strip() - if hccn_item.startswith('address_'): - device_id, device_ip = hccn_item.split('=') - device_id = device_id.split('_')[1] - device_ips[device_id] = device_ip - print('device_id:{}, device_ip:{}'.format(device_id, device_ip)) - - hccn_table = {} - hccn_table['board_id'] = '0x002f' # A+K - # hccn_table['board_id'] = '0x0000' # A+X - - hccn_table['chip_info'] = '910' - hccn_table['deploy_mode'] = 'lab' - hccn_table['group_count'] = '1' - hccn_table['group_list'] = [] - instance_list = [] - for instance_id in range(nproc_per_node): - instance = {} - instance['devices'] = [] - device_id = visible_devices[instance_id] - device_ip = device_ips[device_id] - instance['devices'].append({ - 'device_id': 
device_id, - 'device_ip': device_ip, - }) - instance['rank_id'] = str(instance_id) - instance['server_id'] = server_id - instance_list.append(instance) - hccn_table['group_list'].append({ - 'device_num': str(nproc_per_node), - 'server_num': '1', - 'group_name': '', - 'instance_count': str(nproc_per_node), - 'instance_list': instance_list, - }) - hccn_table['para_plane_nic_location'] = 'device' - hccn_table['para_plane_nic_name'] = [] - for instance_id in range(nproc_per_node): - eth_id = visible_devices[instance_id] - hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) - hccn_table['para_plane_nic_num'] = str(nproc_per_node) - hccn_table['status'] = 'completed' - import json - with open(rank_table_file, 'w') as table_fp: - json.dump(hccn_table, table_fp, indent=4) - -if __name__ == '__main__': - args_opt = parser.parse_args() - rank_table = args_opt.rank_file - if os.path.exists(rank_table): - print('Rank table file exists.') - else: - print('Generating rank table file.') - main(rank_table) - print('Rank table file generated') diff --git a/model_zoo/official/cv/psenet/README.md b/model_zoo/official/cv/psenet/README.md index 80bfff34d2..1ddfb700cd 100644 --- a/model_zoo/official/cv/psenet/README.md +++ b/model_zoo/official/cv/psenet/README.md @@ -55,7 +55,7 @@ A testing set containing about 2000 readable words After installing MindSpore via the official website, you can start training and evaluation as follows: ```python # run distributed training example -sh scripts/run_distribute_train.sh pretrained_model.ckpt +sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt #download opencv library download pyblind11, opencv3.4 @@ -88,7 +88,6 @@ sh scripts/run_eval_ascend.sh └── run_eval_ascend.sh // shell script for evaluation ├── src ├── __init__.py - ├── generate_hccn_file.py // creating rank.json ├── ETSNET ├── __init__.py ├── base.py // convolution and BN operator @@ -127,7 +126,7 @@ Major parameters in train.py and config.py are: ### 
Distributed Training ``` -sh scripts/run_distribute_train.sh pretrained_model.ckpt +sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt ``` The above shell script will run distribute training in the background. You can view the results through the file @@ -166,18 +165,18 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean | Parameters | PSENet | | -------------------------- | ----------------------------------------------------------- | -| Model Version | Inception V1 | +| Model Version | V1 | | Resource | Ascend 910 ;CPU 2.60GHz,192cores;Memory,755G | -| uploaded Date | 09/15/2020 (month/day/year) | -| MindSpore Version | 1.0-alpha | +| uploaded Date | 09/30/2020 (month/day/year) | +| MindSpore Version | 1.0.0 | | Dataset | ICDAR2015 | | Training Parameters | start_lr=0.1; lr_scale=0.1 | | Optimizer | SGD | | Loss Function | LossCallBack | | outputs | probability | | Loss | 0.35 | -| Speed | 1pc: 444 ms/step; 4pcs: 446 ms/step | -| Total time | 1pc: 75.48 h; 4pcs: 18.87 h | +| Speed | 1pc: 444 ms/step; 8pcs: 446 ms/step | +| Total time | 1pc: 75.48 h; 8pcs: 10.01 h | | Parameters (M) | 27.36 | | Checkpoint for Fine tuning | 109.44M (.ckpt file) | | Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/psenet | @@ -187,13 +186,13 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean | Parameters | PSENet | | ------------------- | --------------------------- | -| Model Version | Inception V1 | +| Model Version | V1 | | Resource | Ascend 910 | -| Uploaded Date | 09/15/2020 (month/day/year) | -| MindSpore Version | 1.0-alpha | +| Uploaded Date | 09/30/2020 (month/day/year) | +| MindSpore Version | 1.0.0 | | Dataset | ICDAR2015 | | outputs | probability | -| Accuracy | 1pc: 81%; 4pcs: 81% | +| Accuracy | 1pc: 81%; 8pcs: 81% | ## [How to use](#contents) diff --git a/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh
b/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh index 9c6eea4d2e..432ef30439 100644 --- a/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh +++ b/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh @@ -17,9 +17,9 @@ current_exec_path=$(pwd) echo 'current_exec_path: '${current_exec_path} -if [ $# != 1 ] +if [ $# != 2 ] then - echo "Usage: sh run_distribute_train.sh [PRETRAINED_PATH]" + echo "Usage: sh run_distribute_train.sh [RANK_FILE] [PRETRAINED_PATH]" exit 1 fi @@ -30,20 +30,24 @@ get_real_path(){ echo "$(realpath -m $PWD/$1)" fi } -PATH1=$(get_real_path $1) - +PATH1=$(get_real_path $1) if [ ! -f $PATH1 ] then - echo "error: PRETRAINED_PATH=$PATH1 is not a file" + echo "error: RANK_TABLE_FILE=$PATH1 is not a file" exit 1 fi -python ${current_exec_path}/src/generate_hccn_file.py +PATH2=$(get_real_path $2) +if [ ! -f $PATH2 ] +then + echo "error: PRETRAINED_PATH=$PATH2 is not a file" +exit 1 +fi export DEVICE_NUM=8 export RANK_SIZE=8 -export RANK_TABLE_FILE=${current_exec_path}/rank_table_8p.json +export RANK_TABLE_FILE=$PATH1 for((i=0; i<${DEVICE_NUM}; i++)) do @@ -70,7 +74,7 @@ do cd ${current_exec_path}/device_$i || exit export RANK_ID=$i export DEVICE_ID=$i - python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH1 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 & + python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH2 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 & cd ${current_exec_path} || exit done diff --git a/model_zoo/official/cv/psenet/src/generate_hccn_file.py b/model_zoo/official/cv/psenet/src/generate_hccn_file.py deleted file mode 100644 index 514ca5e74b..0000000000 --- a/model_zoo/official/cv/psenet/src/generate_hccn_file.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - - -import os -import socket - -RANK_TABLE_SAVE_PATH = './rank_table_8p.json' - - -def main(): - nproc_per_node = 4 - - visible_devices = ['0', '1', '2', '3'] - - server_id = socket.gethostbyname(socket.gethostname()) - - hccn_configs = open('/etc/hccn.conf', 'r').readlines() - device_ips = {} - for hccn_item in hccn_configs: - hccn_item = hccn_item.strip() - if hccn_item.startswith('address_'): - device_id, device_ip = hccn_item.split('=') - device_id = device_id.split('_')[1] - device_ips[device_id] = device_ip - print('device_id:{}, device_ip:{}'.format(device_id, device_ip)) - - hccn_table = {} - hccn_table['board_id'] = '0x002f' # A+K - - hccn_table['chip_info'] = '910' - hccn_table['deploy_mode'] = 'lab' - hccn_table['group_count'] = '1' - hccn_table['group_list'] = [] - instance_list = [] - for instance_id in range(nproc_per_node): - instance = {} - instance['devices'] = [] - device_id = visible_devices[instance_id] - device_ip = device_ips[device_id] - instance['devices'].append({ - 'device_id': device_id, - 'device_ip': device_ip, - }) - instance['rank_id'] = str(instance_id) - instance['server_id'] = server_id - instance_list.append(instance) - hccn_table['group_list'].append({ - 'device_num': str(nproc_per_node), - 'server_num': '1', - 'group_name': '', - 'instance_count': str(nproc_per_node), - 'instance_list': instance_list, - }) - hccn_table['para_plane_nic_location'] = 'device' - hccn_table['para_plane_nic_name'] = [] - for instance_id in range(nproc_per_node): - 
eth_id = visible_devices[instance_id] - hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) - hccn_table['para_plane_nic_num'] = str(nproc_per_node) - hccn_table['status'] = 'completed' - import json - with open(RANK_TABLE_SAVE_PATH, 'w') as table_fp: - json.dump(hccn_table, table_fp, indent=4) - - -if __name__ == '__main__': - if os.path.exists(RANK_TABLE_SAVE_PATH): - print('Rank table file exists.') - else: - print('Generating rank table file.') - main() - print('Rank table file generated')