!10290 add fasttext model to model zoo

From: @zhaojichen
Reviewed-by: 
Signed-off-by:
pull/10290/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 1150ae3376

@ -46,6 +46,7 @@ In order to facilitate developers to enjoy the benefits of MindSpore framework,
- [BERT[benchmark]](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/bert/README.md)
- [TinyBERT](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/tinybert/README.md)
- [GNMT V2](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/gnmt_v2/README.md)
- [FastText](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/fasttext/README.md)
- [LSTM](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/lstm/README.md)
- [MASS](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/mass/README.md)
- [Transformer](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/transformer/README.md)

File diff suppressed because it is too large

@ -0,0 +1,118 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""FastText for Evaluation"""
import argparse
import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
import mindspore.ops.operations as P
from mindspore.common.tensor import Tensor
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from src.fasttext_model import FastText
parser = argparse.ArgumentParser(description='fasttext')
parser.add_argument('--data_path', type=str, help='infer dataset path.')
parser.add_argument('--data_name', type=str, required=True, default='ag',
                    help='dataset name, e.g. ag, dbpedia')
parser.add_argument("--model_ckpt", type=str, required=True,
                    help="existing checkpoint file path.")
args = parser.parse_args()

if args.data_name == "ag":
    from src.config import config_ag as config
    target_label1 = ['0', '1', '2', '3']
elif args.data_name == 'dbpedia':
    from src.config import config_db as config
    target_label1 = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13']
elif args.data_name == 'yelp_p':
    from src.config import config_yelpp as config
    target_label1 = ['0', '1']

context.set_context(
    mode=context.GRAPH_MODE,
    save_graphs=False,
    device_target="Ascend")


class FastTextInferCell(nn.Cell):
    """
    Encapsulation class of FastText network infer.

    Args:
        network (nn.Cell): FastText model.

    Returns:
        Tensor, predicted label indices.
    """
    def __init__(self, network):
        super(FastTextInferCell, self).__init__(auto_prefix=False)
        self.network = network
        self.argmax = P.ArgMaxWithValue(axis=1, keep_dims=True)
        self.log_softmax = nn.LogSoftmax(axis=1)

    def construct(self, src_tokens, src_tokens_lengths):
        """construct fasttext infer cell"""
        prediction = self.network(src_tokens, src_tokens_lengths)
        predicted_idx = self.log_softmax(prediction)
        predicted_idx, _ = self.argmax(predicted_idx)
        return predicted_idx


def load_infer_dataset(batch_size, datafile):
    """data loader for infer"""
    ds = de.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])
    type_cast_op = deC.TypeCast(mstype.int32)
    ds = ds.map(operations=type_cast_op, input_columns="src_tokens")
    ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length")
    ds = ds.map(operations=type_cast_op, input_columns="label_idx")
    ds = ds.batch(batch_size=batch_size, drop_remainder=True)
    return ds


def run_fasttext_infer():
    """run infer with FastText"""
    dataset = load_infer_dataset(batch_size=config.batch_size, datafile=args.data_path)
    fasttext_model = FastText(config.vocab_size, config.embedding_dims, config.num_class)
    parameter_dict = load_checkpoint(args.model_ckpt)
    load_param_into_net(fasttext_model, parameter_dict=parameter_dict)
    ft_infer = FastTextInferCell(fasttext_model)
    model = Model(ft_infer)
    predictions = []
    target_sens = []
    for batch in dataset.create_dict_iterator(output_numpy=True, num_epochs=1):
        target_sens.append(batch['label_idx'])
        src_tokens = Tensor(batch['src_tokens'], mstype.int32)
        src_tokens_length = Tensor(batch['src_tokens_length'], mstype.int32)
        predicted_idx = model.predict(src_tokens, src_tokens_length)
        predictions.append(predicted_idx.asnumpy())
    from sklearn.metrics import accuracy_score, classification_report
    target_sens = np.array(target_sens).flatten()
    predictions = np.array(predictions).flatten()
    acc = accuracy_score(target_sens, predictions)
    result_report = classification_report(target_sens, predictions, target_names=target_label1)
    print("********Accuracy: ", acc)
    print(result_report)


if __name__ == '__main__':
    run_fasttext_infer()
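
For reference, a minimal numpy sketch (the logits below are made up) of the post-processing FastTextInferCell performs: a log-softmax over the class dimension followed by argmax. Since log-softmax is monotonic, the predicted index equals argmax over the raw logits.

import numpy as np

logits = np.array([[1.2, -0.3, 0.5, 2.0],
                   [0.1, 3.1, -1.0, 0.2]], dtype=np.float32)  # batch of 2 samples, 4 classes (ag)
log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))  # log-softmax
predicted_idx = log_probs.argmax(axis=1)
print(predicted_idx)  # [3 1], identical to logits.argmax(axis=1)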

@ -0,0 +1,3 @@
spacy
sklearn
en_core_web_lg

@ -0,0 +1,83 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "sh create_dataset.sh SOURCE_DATASET_PATH DATASET_NAME"
echo "for example: sh create_dataset.sh /home/workspace/ag_news_csv ag"
echo "DATASET_NAME including ag, dbpedia, and yelp_p"
echo "It is better to use absolute path."
echo "=============================================================================================================="
ulimit -u unlimited
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
SOURCE_DATASET_PATH=$(get_real_path $1)
DATASET_NAME=$2
export DEVICE_NUM=1
export DEVICE_ID=5
export RANK_ID=0
export RANK_SIZE=1
if [ $DATASET_NAME == 'ag' ];
then
echo "Begin to process ag news data"
if [ -d "ag" ];
then
rm -rf ./ag
fi
mkdir ./ag
cd ./ag || exit
echo "start data preprocess for device $DEVICE_ID"
python ../../src/dataset.py --train_file $SOURCE_DATASET_PATH/train.csv --test_file $SOURCE_DATASET_PATH/test.csv --class_num 4 --max_len 467 --bucket [64,128,467] --test_bucket [467]
cd ..
fi
if [ $DATASET_NAME == 'dbpedia' ];
then
echo "Begin to process dbpedia data"
if [ -d "dbpedia" ];
then
rm -rf ./dbpedia
fi
mkdir ./dbpedia
cd ./dbpedia || exit
echo "start data preprocess for device $DEVICE_ID"
python ../../src/dataset.py --train_file $SOURCE_DATASET_PATH/train.csv --test_file $SOURCE_DATASET_PATH/test.csv --class_num 14 --max_len 3013 --bucket [128,512,3013] --test_bucket [1120]
cd ..
fi
if [ $DATASET_NAME == 'yelp_p' ];
then
echo "Begin to process ag news data"
if [ -d "yelp_p" ];
then
rm -rf ./yelp_p
fi
mkdir ./yelp_p
cd ./yelp_p || exit
echo "start data preprocess for device $DEVICE_ID"
python ../../src/dataset.py --train_file $SOURCE_DATASET_PATH/train.csv --test_file $SOURCE_DATASET_PATH/test.csv --class_num 2 --max_len 2955 --bucket [64,128,256,512,2955] --test_bucket [2955]
cd ..
fi

@ -0,0 +1,66 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "sh run_distributed_train.sh DATASET_PATH RANK_TABLE_PATH"
echo "for example: sh run_distributed_train.sh /home/workspace/ag /home/workspace/rank_table_file.json"
echo "It is better to use absolute path."
echo "=============================================================================================================="
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
DATASET=$(get_real_path $1)
echo $DATASET
DATANAME=$(basename $DATASET)
RANK_TABLE_PATH=$(get_real_path $2)
echo $DATANAME
if [ ! -d $DATASET ]
then
echo "Error: DATA_PATH=$DATASET is not a file"
exit 1
fi
current_exec_path=$(pwd)
echo ${current_exec_path}
export RANK_TABLE_FILE=$RANK_TABLE_PATH
echo $RANK_TABLE_FILE
export RANK_SIZE=8
export DEVICE_NUM=8
for((i=0;i<=7;i++));
do
rm -rf ${current_exec_path}/device$i
mkdir ${current_exec_path}/device$i
cd ${current_exec_path}/device$i
cp ../../*.py ./
cp -r ../../src ./
cp -r ../*.sh ./
export RANK_ID=$i
export DEVICE_ID=$i
echo "start training for rank $i, device $DEVICE_ID"
python ../../train.py --data_path $DATASET --data_name $DATANAME > log_fasttext.log 2>&1 &
cd ${current_exec_path}
done
cd ${current_exec_path}

@ -0,0 +1,54 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "sh run_eval.sh DATASET_PATH DATASET_NAME MODEL_CKPT"
echo "for example: sh run_eval.sh /home/workspace/ag/test*.mindrecord ag device0/ckpt0/fasttext-5-118.ckpt"
echo "It is better to use absolute path."
echo "=============================================================================================================="
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
DATASET=$(get_real_path $1)
echo $DATASET
DATANAME=$2
MODEL_CKPT=$(get_real_path $3)
ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=5
export RANK_ID=0
export RANK_SIZE=1
if [ -d "eval" ];
then
rm -rf ./eval
fi
mkdir ./eval
cp ../*.py ./eval
cp -r ../src ./eval
cp -r ../scripts/*.sh ./eval
cd ./eval || exit
echo "start training for device $DEVICE_ID"
env > env.log
python ../../eval.py --data_path $DATASET --data_name $DATANAME --model_ckpt $MODEL_CKPT> log_fasttext.log 2>&1 &
cd ..

@ -0,0 +1,55 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "sh run_standalone_train.sh DATASET_PATH"
echo "for example: sh run_standalone_train.sh /home/workspace/ag"
echo "It is better to use absolute path."
echo "=============================================================================================================="
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
DATASET=$(get_real_path $1)
echo $DATASET
DATANAME=$(basename $DATASET)
echo $DATANAME
ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1
if [ -d "train" ];
then
rm -rf ./train
fi
mkdir ./train
cp ../*.py ./train
cp -r ../src ./train
cp -r ../scripts/*.sh ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log
python train.py --data_path $DATASET --data_name $DATANAME > log_fasttext.log 2>&1 &
cd ..

@ -0,0 +1,72 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#" :===========================================================================
"""
network config setting, will be used in train.py and eval.py
"""
from easydict import EasyDict as ed
config_yelpp = ed({
    'vocab_size': 6414979,
    'buckets': [64, 128, 256, 512, 2955],
    'batch_size': 128,
    'embedding_dims': 16,
    'num_class': 2,
    'epoch': 5,
    'lr': 0.02,
    'min_lr': 1e-6,
    'decay_steps': 549,
    'warmup_steps': 400000,
    'poly_lr_scheduler_power': 0.5,
    'epoch_count': 1,
    'pretrain_ckpt_dir': None,
    'save_ckpt_steps': 549,
    'keep_ckpt_max': 10,
})

config_db = ed({
    'vocab_size': 6596536,
    'buckets': [128, 512, 3013],
    'batch_size': 128,
    'embedding_dims': 16,
    'num_class': 14,
    'epoch': 5,
    'lr': 0.05,
    'min_lr': 1e-6,
    'decay_steps': 549,
    'warmup_steps': 400000,
    'poly_lr_scheduler_power': 0.5,
    'epoch_count': 1,
    'pretrain_ckpt_dir': None,
    'save_ckpt_steps': 548,
    'keep_ckpt_max': 10,
})

config_ag = ed({
    'vocab_size': 1383812,
    'buckets': [64, 128, 467],
    'batch_size': 128,
    'embedding_dims': 16,
    'num_class': 4,
    'epoch': 5,
    'lr': 0.05,
    'min_lr': 1e-6,
    'decay_steps': 115,
    'warmup_steps': 400000,
    'poly_lr_scheduler_power': 0.5,
    'epoch_count': 1,
    'pretrain_ckpt_dir': None,
    'save_ckpt_steps': 116,
    'keep_ckpt_max': 10,
})

File diff suppressed because it is too large

@ -0,0 +1,70 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""FastText model."""
from mindspore import nn
from mindspore.ops import operations as P
from mindspore.common.initializer import XavierUniform
from mindspore.common import dtype as mstype
class FastText(nn.Cell):
    """
    FastText model.

    Args:
        vocab_size: vocabulary size.
        embedding_dims: the size of each embedding vector.
        num_class: number of labels.
    """
    def __init__(self, vocab_size, embedding_dims, num_class):
        super(FastText, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dims = embedding_dims
        self.num_class = num_class
        self.embedding_func = nn.Embedding(vocab_size=self.vocab_size,
                                           embedding_size=self.embedding_dims,
                                           padding_idx=0, embedding_table='Zeros')
        self.fc = nn.Dense(self.embedding_dims, out_channels=self.num_class,
                           weight_init=XavierUniform(1)).to_float(mstype.float16)
        self.reducesum = P.ReduceSum()
        self.expand_dims = P.ExpandDims()
        self.squeeze = P.Squeeze(axis=1)
        self.cast = P.Cast()
        self.tile = P.Tile()
        self.realdiv = P.RealDiv()
        self.fill = P.Fill()
        self.log_softmax = nn.LogSoftmax(axis=1)

    def construct(self, src_tokens, src_token_length):
        """
        construct network

        Args:
            src_tokens: source sentences
            src_token_length: source sentences length

        Returns:
            Tensor, class logits for each sentence.
        """
        src_tokens = self.embedding_func(src_tokens)
        embedding = self.reducesum(src_tokens, 1)
        length_tiled = self.tile(src_token_length, (1, self.embedding_dims))
        embedding = self.realdiv(embedding, length_tiled)
        embedding = self.cast(embedding, mstype.float16)
        classifier = self.fc(embedding)
        classifier = self.cast(classifier, mstype.float32)
        return classifier
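
For reference, a small numpy sketch of what FastText.construct computes; the vocabulary size, token ids and weights below are made up, only the shapes and the averaging step mirror the model: look up token embeddings, sum them, divide by the real sentence length, then apply a single dense layer to get class logits.

import numpy as np

vocab_size, embedding_dims, num_class = 100, 16, 4
rng = np.random.default_rng(0)
embedding_table = rng.normal(size=(vocab_size, embedding_dims)).astype(np.float32)
embedding_table[0] = 0.0                                 # padding index embeds to zeros (padding_idx=0)
fc_weight = rng.normal(size=(embedding_dims, num_class)).astype(np.float32)

src_tokens = np.array([[5, 7, 42, 0, 0]])                # one padded sentence, bucket length 5
src_token_length = np.array([[3.0]], dtype=np.float32)   # 3 real tokens

embedded = embedding_table[src_tokens]                   # (1, 5, 16)
sentence_vec = embedded.sum(axis=1) / src_token_length   # average over real tokens
logits = sentence_vec @ fc_weight                        # (1, 4) class logits
print(logits.shape)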

@ -0,0 +1,142 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""FastText for train"""
import mindspore.common.dtype as mstype
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.parameter import ParameterTuple
from mindspore.context import ParallelMode
from mindspore import nn
from mindspore.communication.management import get_group_size
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore import context
from src.fasttext_model import FastText
GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0
clip_grad = C.MultitypeFuncGraph("clip_grad")


@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad


class FastTextNetWithLoss(nn.Cell):
    """
    Provide FastText training loss.

    Args:
        vocab_size: vocabulary size.
        embedding_dims: the size of each embedding vector.
        num_class: number of labels.
    """
    def __init__(self, vocab_size, embedding_dims, num_class):
        super(FastTextNetWithLoss, self).__init__()
        self.fasttext = FastText(vocab_size, embedding_dims, num_class)
        self.loss_func = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        self.squeeze = P.Squeeze(axis=1)
        self.print = P.Print()

    def construct(self, src_tokens, src_tokens_lengths, label_idx):
        """
        FastText network with loss.
        """
        predict_score = self.fasttext(src_tokens, src_tokens_lengths)
        label_idx = self.squeeze(label_idx)
        predict_score = self.loss_func(predict_score, label_idx)
        return predict_score


class FastTextTrainOneStepCell(nn.Cell):
    """
    Encapsulation class of fasttext network training.

    Appends an optimizer to the training network. After that, the construct
    function can be called to create the backward graph.

    Args:
        network (Cell): The training network. Note that the loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        sens (Number): The adjust parameter. Default: 1.0.
    """
    def __init__(self, network, optimizer, sens=1.0):
        super(FastTextTrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode is not supported: {}".format(self.parallel_mode))
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.hyper_map = C.HyperMap()
        self.cast = P.Cast()

    def set_sens(self, value):
        self.sens = value

    def construct(self,
                  src_token_text,
                  src_tokens_text_length,
                  label_idx_tag):
        """Defines the computation performed."""
        weights = self.weights
        loss = self.network(src_token_text,
                            src_tokens_text_length,
                            label_idx_tag)
        grads = self.grad(self.network, weights)(src_token_text,
                                                 src_tokens_text_length,
                                                 label_idx_tag,
                                                 self.cast(F.tuple_to_array((self.sens,)),
                                                           mstype.float32))
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        if self.reducer_flag:
            # apply grad reducer on grads
            grads = self.grad_reducer(grads)
        succ = self.optimizer(grads)
        return F.depend(loss, succ)
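
For reference, a small numpy sketch (made-up gradient values) of the two clipping modes _clip_grad supports; the training cell above uses GRADIENT_CLIP_TYPE = 1, i.e. each gradient tensor is clipped by its L2 norm.

import numpy as np

grad = np.array([0.5, -2.0, 1.5], dtype=np.float32)
clip_value = 1.0

clipped_by_value = np.clip(grad, -clip_value, clip_value)                     # clip_type 0: element-wise
norm = float(np.linalg.norm(grad))
clipped_by_norm = grad if norm <= clip_value else grad * (clip_value / norm)  # clip_type 1: by L2 norm
print(clipped_by_value, clipped_by_norm)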

@ -0,0 +1,62 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""FastText data loader"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.c_transforms as deC
def load_dataset(dataset_path,
                 batch_size,
                 epoch_count=1,
                 rank_size=1,
                 rank_id=0,
                 bucket=None,
                 shuffle=True):
    """dataset loader"""
    def batch_per_bucket(bucket_length, input_file):
        if not input_file:
            raise FileNotFoundError("input file parameter must not be empty.")
        input_file = input_file + '/train_dataset_bs_' + str(bucket_length) + '.mindrecord'
        ds = de.MindDataset(input_file,
                            columns_list=['src_tokens', 'src_tokens_length', 'label_idx'],
                            shuffle=shuffle,
                            num_shards=rank_size,
                            shard_id=rank_id,
                            num_parallel_workers=8)
        ori_dataset_size = ds.get_dataset_size()
        print(f"Dataset size: {ori_dataset_size}")
        repeat_count = epoch_count
        type_cast_op = deC.TypeCast(mstype.int32)
        ds = ds.map(operations=type_cast_op, input_columns="src_tokens")
        ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length")
        ds = ds.map(operations=type_cast_op, input_columns="label_idx")
        ds = ds.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
                       output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
        ds = ds.batch(batch_size, drop_remainder=False)
        ds = ds.repeat(repeat_count)
        return ds

    for i, _ in enumerate(bucket):
        bucket_len = bucket[i]
        ds_per = batch_per_bucket(bucket_len, dataset_path)
        if i == 0:
            ds = ds_per
        else:
            ds = ds + ds_per
    ds = ds.shuffle(ds.get_dataset_size())
    ds.channel_name = 'fasttext'
    return ds
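
A usage sketch (the path is an assumption) of how train.py builds the bucketed training set with this loader: one MindDataset per bucket length, concatenated and then shuffled.

from src.load_dataset import load_dataset

# assumes /home/workspace/ag contains train_dataset_bs_64/128/467.mindrecord files
ds = load_dataset(dataset_path='/home/workspace/ag',
                  batch_size=128,
                  epoch_count=1,
                  bucket=[64, 128, 467])
print(ds.get_dataset_size())  # total number of batches across all three buckets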

@ -0,0 +1,54 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Learning rate utilities."""
from math import ceil
import numpy as np
def polynomial_decay_scheduler(lr, min_lr, decay_steps, total_update_num, warmup_steps=1000, power=1.0):
    """
    Implementation of a polynomial decay learning rate scheduler which cycles by default.

    Args:
        lr (float): Initial learning rate.
        warmup_steps (int): Warmup steps.
        decay_steps (int): Decay steps.
        total_update_num (int): Total update steps.
        min_lr (float): Minimum learning rate.
        power (float): Power factor.

    Returns:
        np.ndarray, learning rate of each step.
    """
    lrs = np.zeros(shape=total_update_num, dtype=np.float32)
    if decay_steps <= 0:
        raise ValueError("`decay_steps` must be larger than 0.")

    _start_step = 0
    if 0 < warmup_steps < total_update_num:
        warmup_end_lr = lr
        warmup_init_lr = 0 if warmup_steps > 0 else warmup_end_lr
        lrs[:warmup_steps] = np.linspace(warmup_init_lr, warmup_end_lr, warmup_steps)
        _start_step = warmup_steps

    for step in range(_start_step, total_update_num):
        _step = step - _start_step
        ratio = ceil(_step / decay_steps)
        ratio = 1 if ratio < 1 else ratio
        _decay_steps = decay_steps * ratio
        lrs[step] = (lr - min_lr) * pow(1 - _step / _decay_steps, power) + min_lr
    return lrs
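
A usage sketch with made-up step counts, showing the shape of the schedule this function returns: a linear warmup up to lr, then a polynomial decay whose horizon grows by decay_steps each cycle.

from src.lr_schedule import polynomial_decay_scheduler

lrs = polynomial_decay_scheduler(lr=0.05, min_lr=1e-6, decay_steps=115,
                                 total_update_num=575, warmup_steps=100, power=0.5)
print(lrs[0], lrs[99], lrs[100])
# ~0 at step 0, 0.05 at the end of warmup (step 99), then cyclic polynomial decay toward min_lr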

@ -0,0 +1,202 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""FastText for train"""
import os
import time
import argparse
from mindspore import context
from mindspore.nn.optim import Adam
from mindspore.common import set_seed
from mindspore.train.model import Model
import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor
from mindspore.context import ParallelMode
from mindspore.train.callback import Callback, TimeMonitor
from mindspore.communication import management as MultiAscend
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.load_dataset import load_dataset
from src.lr_schedule import polynomial_decay_scheduler
from src.fasttext_train import FastTextTrainOneStepCell, FastTextNetWithLoss
parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, required=True, help='FastText input data file path.')
parser.add_argument('--data_name', type=str, required=True, default='ag', help='dataset name, e.g. ag, dbpedia')
args = parser.parse_args()

if args.data_name == "ag":
    from src.config import config_ag as config
elif args.data_name == 'dbpedia':
    from src.config import config_db as config
elif args.data_name == 'yelp_p':
    from src.config import config_yelpp as config


def get_ms_timestamp():
    t = time.time()
    return int(round(t * 1000))


set_seed(5)
time_stamp_init = False
time_stamp_first = 0
rank_id = os.getenv('DEVICE_ID')
context.set_context(
    mode=context.GRAPH_MODE,
    save_graphs=False,
    device_target="Ascend")


class LossCallBack(Callback):
    """
    Monitor the loss in training.
    If the loss is NaN or INF, training is terminated.

    Note:
        If per_print_times is 0, the loss is not printed.

    Args:
        per_print_times (int): Print the loss every `per_print_times` steps. Default: 1.
    """
    def __init__(self, per_print_times=1, rank_ids=0):
        super(LossCallBack, self).__init__()
        if not isinstance(per_print_times, int) or per_print_times < 0:
            raise ValueError("print_step must be int and >= 0.")
        self._per_print_times = per_print_times
        self.rank_id = rank_ids
        global time_stamp_init, time_stamp_first
        if not time_stamp_init:
            time_stamp_first = get_ms_timestamp()
            time_stamp_init = True

    def step_end(self, run_context):
        """Monitor the loss in training."""
        global time_stamp_first
        time_stamp_current = get_ms_timestamp()
        cb_params = run_context.original_args()
        print("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
                                                                     cb_params.cur_epoch_num,
                                                                     cb_params.cur_step_num,
                                                                     str(cb_params.net_outputs)))
        with open("./loss_{}.log".format(self.rank_id), "a+") as f:
            f.write("time: {}, epoch: {}, step: {}, loss: {}".format(
                time_stamp_current - time_stamp_first,
                cb_params.cur_epoch_num,
                cb_params.cur_step_num,
                str(cb_params.net_outputs.asnumpy())))
            f.write('\n')


def _build_training_pipeline(pre_dataset):
    """
    Build training pipeline.

    Args:
        pre_dataset: preprocessed dataset
    """
    net_with_loss = FastTextNetWithLoss(config.vocab_size, config.embedding_dims, config.num_class)
    net_with_loss.init_parameters_data()
    if config.pretrain_ckpt_dir:
        parameter_dict = load_checkpoint(config.pretrain_ckpt_dir)
        load_param_into_net(net_with_loss, parameter_dict)
    if pre_dataset is None:
        raise ValueError("pre-process dataset must be provided")

    # get learning rate
    update_steps = config.epoch * pre_dataset.get_dataset_size()
    decay_steps = pre_dataset.get_dataset_size()
    rank_size = os.getenv("RANK_SIZE")
    if rank_size is not None and not rank_size.isdigit():
        raise ValueError("RANK_SIZE must be an integer")
    if rank_size is not None and int(rank_size) > 1:
        base_lr = config.lr
    else:
        base_lr = config.lr / 10
    print("+++++++++++Total update steps ", update_steps)
    lr = Tensor(polynomial_decay_scheduler(lr=base_lr,
                                           min_lr=config.min_lr,
                                           decay_steps=decay_steps,
                                           total_update_num=update_steps,
                                           warmup_steps=config.warmup_steps,
                                           power=config.poly_lr_scheduler_power), dtype=mstype.float32)
    optimizer = Adam(net_with_loss.trainable_params(), lr, beta1=0.9, beta2=0.999)

    net_with_grads = FastTextTrainOneStepCell(net_with_loss, optimizer=optimizer)
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(rank_ids=rank_id)
    dataset_size = pre_dataset.get_dataset_size()
    time_monitor = TimeMonitor(data_size=dataset_size)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=decay_steps,
                                   keep_checkpoint_max=config.keep_ckpt_max)
    callbacks = [time_monitor, loss_monitor]
    if rank_size is None or int(rank_size) == 1:
        ckpt_callback = ModelCheckpoint(prefix='fasttext',
                                        directory=os.path.join('./', 'ckpt_{}'.format(os.getenv("DEVICE_ID"))),
                                        config=ckpt_config)
        callbacks.append(ckpt_callback)
    if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        ckpt_callback = ModelCheckpoint(prefix='fasttext',
                                        directory=os.path.join('./', 'ckpt_{}'.format(os.getenv("DEVICE_ID"))),
                                        config=ckpt_config)
        callbacks.append(ckpt_callback)
    print("Preparing for training....")
    epoch_size = pre_dataset.get_repeat_count()
    print("Epoch size ", epoch_size)
    if os.getenv("RANK_SIZE") is not None and int(os.getenv("RANK_SIZE")) > 1:
        print(f" | Rank {MultiAscend.get_rank()} calling model.train.")
    model.train(epoch=config.epoch, train_dataset=pre_dataset, callbacks=callbacks, dataset_sink_mode=False)


def train_single(input_file_path):
    """
    Train model on a single device.

    Args:
        input_file_path: preprocessed dataset path
    """
    print("Starting training on single device.")
    preprocessed_data = load_dataset(dataset_path=input_file_path,
                                     batch_size=config.batch_size,
                                     epoch_count=config.epoch_count,
                                     bucket=config.buckets)
    _build_training_pipeline(preprocessed_data)


def set_parallel_env():
    context.reset_auto_parallel_context()
    MultiAscend.init()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      device_num=MultiAscend.get_group_size(),
                                      gradients_mean=True)


def train_parallel(input_file_path):
    """
    Train model on multiple devices.

    Args:
        input_file_path: preprocessed dataset path
    """
    set_parallel_env()
    print("Starting training on multiple devices. |~ _ ~| |~ _ ~| |~ _ ~| |~ _ ~|")
    preprocessed_data = load_dataset(dataset_path=input_file_path,
                                     batch_size=config.batch_size,
                                     epoch_count=config.epoch_count,
                                     rank_size=MultiAscend.get_group_size(),
                                     rank_id=MultiAscend.get_rank(),
                                     bucket=config.buckets,
                                     shuffle=False)
    _build_training_pipeline(preprocessed_data)


if __name__ == "__main__":
    _rank_size = os.getenv("RANK_SIZE")
    if _rank_size is not None and int(_rank_size) > 1:
        train_parallel(args.data_path)
    else:
        train_single(args.data_path)