!11773 Add WaveNet to Model Zoo

From: @wanyiming Reviewed-by: Signed-off-by:
4 years ago · b82df95b43
parent 96f007ebb4 acd40e37e2
commit b82df95b43
15 changed files with 2584 additions and 0 deletions
--- a/model_zoo/research/audio/wavenet/README.md
+++ b/model_zoo/research/audio/wavenet/README.md
--- a/model_zoo/research/audio/wavenet/evaluate.py
+++ b/model_zoo/research/audio/wavenet/evaluate.py
--- a/model_zoo/research/audio/wavenet/export.py
+++ b/model_zoo/research/audio/wavenet/export.py
@ -0,0 +1,95 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """export mindir."""
 import json
 from os.path import join
 import argparse
 from warnings import warn
 from hparams import hparams, hparams_debug_string
 from mindspore import context, Tensor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
 from wavenet_vocoder import WaveNet
 from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
 import numpy as np
 from src.loss import PredictNet
 # Command-line interface of the MINDIR export script.
 parser = argparse.ArgumentParser(description='Export WaveNet to MINDIR')
 parser.add_argument('--preset', type=str, default='', help='Path of preset parameters (json).')
 # The hyper-parameter snapshot written below goes into this directory; the option was
 # previously read (args.checkpoint_dir) without ever being declared.
 parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
                     help='Directory where the hparams.json snapshot is written [default: %(default)s].')
 parser.add_argument('--speaker_id', type=str, default='',
                     help=' Use specific speaker of data in case for multi-speaker datasets.')
 parser.add_argument('--pretrain_ckpt', type=str, default='', help='Pretrained checkpoint path')
 args = parser.parse_args()
 if __name__ == '__main__':
    # Local import: only this entry point needs it (used to create the output directory).
    import os

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None

    # --preset defaults to '' (never None); only try to read it when a path was really given,
    # otherwise open('') would fail and the built-in hparams could never be used.
    if args.preset:
        with open(args.preset) as f:
            hparams.parse_json(f.read())
    assert hparams.name == "wavenet_vocoder"
    print(hparams_debug_string())
    fs = hparams.sample_rate

    # Keep a copy of the effective hyper-parameters next to the checkpoints.
    os.makedirs(args.checkpoint_dir, exist_ok=True)
    output_json_path = join(args.checkpoint_dir, "hparams.json")
    with open(output_json_path, "w") as f:
        json.dump(hparams.values(), f, indent=2)

    # Sanity checks on the hyper-parameter combination.
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    # The upsample network needs to know the conditioning layout as well.
    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )

    # Wrap the model so that the exported graph takes (x, c, g) and returns the prediction.
    Net = PredictNet(model)
    Net.set_train(False)
    receptive_field = model.receptive_field
    print("Receptive field (samples / ms): {} / {}".format(receptive_field, receptive_field / fs * 1000))

    param_dict = load_checkpoint(args.pretrain_ckpt)
    load_param_into_net(model, param_dict)
    print('Successfully loading the pre-trained model')

    # Random dummy inputs that only fix the exported graph's input shapes.
    # NOTE(review): the literal sizes (256 channels, 80 conditioning channels, 10240 steps)
    # presumably mirror the default preset -- confirm against hparams before exporting others.
    x = np.array(np.random.random((2, 256, 10240)), dtype=np.float32)
    c = np.array(np.random.random((2, 80, 44)), dtype=np.float32)
    g = np.array([0, 0], dtype=np.int64)
    export(Net, Tensor(x), Tensor(c), Tensor(g), file_name="WaveNet", file_format='MINDIR')
--- a/model_zoo/research/audio/wavenet/src/init.py
+++ b/model_zoo/research/audio/wavenet/src/init.py
@ -0,0 +1,14 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
--- a/model_zoo/research/audio/wavenet/src/callback.py
+++ b/model_zoo/research/audio/wavenet/src/callback.py
@ -0,0 +1,103 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Callbacks used to monitor WaveNet training.
 """
 import time
 from mindspore.train.callback import Callback
 from mindspore import Tensor
 import numpy as np
 class TimeMonitor(Callback):
    """
    Callback that prints the wall-clock duration of every epoch and every step.

    All durations are printed in milliseconds.

    Args:
        data_size (int): number of steps in one epoch; used to derive the per-step average.
    """
    def __init__(self, data_size):
        super().__init__()
        self.data_size = data_size

    def epoch_begin(self, run_context):
        """Remember when the epoch started."""
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        """Print the epoch duration and the average duration of one step."""
        elapsed_ms = 1000 * (time.time() - self.epoch_time)
        average_ms = elapsed_ms / self.data_size
        print("epoch time: {0}, per step time: {1}".format(elapsed_ms, average_ms), flush=True)

    def step_begin(self, run_context):
        """Remember when the step started."""
        self.step_time = time.time()

    def step_end(self, run_context):
        """Print the duration of the step that just finished."""
        step_mseconds = 1000 * (time.time() - self.step_time)
        print(f"step time {step_mseconds}", flush=True)
 class Monitor(Callback):
    """
    Callback that logs loss, elapsed time and learning rate during training.

    Durations are measured with ``time.time()`` differences and printed as-is, i.e. in seconds.

    Args:
        lr_init (Tensor or numpy.ndarray, optional): per-step learning rates, indexed by the
            zero-based global step. When omitted, the learning rate is logged as ``nan``.

    Examples:
        >>> Monitor(lr_init=Tensor([0.05] * 100))
    """
    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        # len(None) used to raise TypeError although None is the declared default.
        self.lr_init_len = 0 if lr_init is None else len(lr_init)
        # Created here as well so step_end/epoch_end work even if the matching *_begin hook
        # was never invoked.
        self.losses = []
        self.epoch_time = time.time()
        self.step_time = time.time()

    def _current_lr(self, step_index):
        """Return the scheduled learning rate for a zero-based global step as a float."""
        if self.lr_init is None or self.lr_init_len == 0:
            return float("nan")
        # Clamp to the last entry so that logging never raises IndexError when training runs
        # longer than the schedule that was handed in.
        value = self.lr_init[min(step_index, self.lr_init_len - 1)]
        # A Tensor schedule yields Tensor elements; a numpy schedule yields numpy scalars.
        if isinstance(value, Tensor):
            value = value.asnumpy()
        return float(value)

    def epoch_begin(self, run_context):
        """Reset the per-epoch loss history and start the epoch timer."""
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        """Print epoch duration, average step duration and average loss of the epoch."""
        cb_params = run_context.original_args()
        epoch_seconds = time.time() - self.epoch_time
        per_step_seconds = epoch_seconds / cb_params.batch_num
        # Avoid numpy's "mean of empty slice" warning when no step was recorded.
        avg_loss = np.mean(self.losses) if self.losses else float("nan")
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.6f}".format(epoch_seconds,
                                                                                      per_step_seconds,
                                                                                      avg_loss))

    def step_begin(self, run_context):
        """Start the step timer."""
        self.step_time = time.time()

    def step_end(self, run_context):
        """Record the loss of the finished step and print a progress line."""
        cb_params = run_context.original_args()
        step_seconds = time.time() - self.step_time

        # The network may return a bare loss or a tuple/list whose first element is the loss.
        step_loss = cb_params.net_outputs
        if isinstance(step_loss, (tuple, list)) and step_loss and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())
        self.losses.append(step_loss)

        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num
        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.6f}/{:5.6f}], time:[{:5.3f}], lr:[{:.9f}]".format(
            cb_params.cur_epoch_num -
            1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
            np.mean(self.losses), step_seconds, self._current_lr(cb_params.cur_step_num - 1)))
--- a/model_zoo/research/audio/wavenet/src/dataset.py
+++ b/model_zoo/research/audio/wavenet/src/dataset.py
--- a/model_zoo/research/audio/wavenet/src/loss.py
+++ b/model_zoo/research/audio/wavenet/src/loss.py
@ -0,0 +1,238 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """loss function definition"""
 import os
 import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt
 from mindspore import nn, Tensor
 from mindspore.ops import operations as P
 from nnmnkwii import preprocessing as P1
 from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw
 from wavenet_vocoder.mixture import discretized_mix_logistic_loss
 from wavenet_vocoder.mixture import mix_gaussian_loss
 from train_pytorch import to_categorical
 from tqdm import tqdm
 import audio
 import librosa
 import librosa.display
 matplotlib.use('Agg')
 def sequence_mask(sequence_length, max_len=None):
    """
    Build a 0/1 mask that marks the valid (non-padded) positions of each sequence.

    Args:
        sequence_length (Tensor): per-sequence lengths, converted with ``asnumpy()``.
        max_len (int, optional): width of the mask. Defaults to the largest length.

    Returns:
        Tensor: float32 mask with a trailing singleton axis; an entry is 1.0 where the
        position index is smaller than the sequence length and 0.0 otherwise.
    """
    lengths = sequence_length.asnumpy()
    if max_len is None:
        max_len = np.max(lengths)
    n_sequences = lengths.shape[0]

    # Position indices 0 .. max_len - 1, repeated once per sequence.
    positions = np.arange(max_len, dtype=np.int32)
    position_grid = np.repeat(positions[np.newaxis, :], n_sequences, axis=0)
    # Every length repeated along the position axis.
    length_grid = np.tile(lengths[:, np.newaxis], (1, max_len))

    valid = (position_grid < length_grid).astype(np.float32)
    return Tensor(valid[..., np.newaxis])
 class MaskedCrossEntropyLoss(nn.Cell):
    """
    Cross-entropy loss for the mulaw-quantize input type.

    Wraps ``nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')``. Despite the
    class name, no mask is taken or applied here.
    """
    def __init__(self):
        super().__init__()
        self.criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    def construct(self, inputs, target):
        """Return the mean sparse softmax cross-entropy between ``inputs`` and ``target``."""
        return self.criterion(inputs, target)
 class DiscretizedMixturelogisticLoss(nn.Cell):
    """
    Discretized mixture-of-logistics loss, averaged over the unmasked positions.

    Args:
        hparams: hyper-parameter object; ``quantize_channels`` and ``log_scale_min`` are read.
    """
    def __init__(self, hparams):
        super(DiscretizedMixturelogisticLoss, self).__init__()
        self.quantize_channels = hparams.quantize_channels
        self.log_scale_min = hparams.log_scale_min
        # reduce=False: the per-element losses are reduced in construct().
        self.discretized_mix_logistic_loss = discretized_mix_logistic_loss(num_classes=hparams.quantize_channels,
                                                                           log_scale_min=hparams.log_scale_min,
                                                                           reduce=False)
        self.reduce_sum_op = P.ReduceSum()
        self.reduce_mean_op = P.ReduceMean()

    def construct(self, inputs, target, mask=None):
        """
        Args:
            inputs (Tensor): Predicted distribution parameters.
            target (Tensor): Target.
            mask (Tensor, optional): Weights of the positions that count towards the loss.
                When omitted, every position counts equally.

        Returns:
            Tensor: Loss tensor.
        """
        losses = self.discretized_mix_logistic_loss(inputs, target)
        # mask defaults to None; multiplying by None used to fail, so fall back to a plain mean.
        if mask is None:
            return self.reduce_mean_op(losses)
        return self.reduce_sum_op(losses * mask) / self.reduce_sum_op(mask)
 class MixtureGaussianLoss(nn.Cell):
    """
    Mixture-of-Gaussians loss, averaged over the unmasked positions.

    Args:
        hparams: hyper-parameter object; ``quantize_channels`` and ``log_scale_min`` are read.
    """
    def __init__(self, hparams):
        super(MixtureGaussianLoss, self).__init__()
        self.quantize_channels = hparams.quantize_channels
        self.log_scale_min = hparams.log_scale_min
        # reduce=False: the per-element losses are reduced in construct().
        self.mix_gaussian_loss = mix_gaussian_loss(log_scale_min=hparams.log_scale_min, reduce=False)
        self.reduce_sum_op = P.ReduceSum()
        self.reduce_mean_op = P.ReduceMean()

    def construct(self, inputs, target, mask=None):
        """
        Args:
            inputs (Tensor): Predicted distribution
            target (Tensor): Target
            mask (Tensor, optional): Weights of the positions that count towards the loss.
                When omitted, every position counts equally.

        Returns:
            Tensor: Loss tensor
        """
        losses = self.mix_gaussian_loss(inputs, target)
        # mask defaults to None; multiplying by None used to fail, so fall back to a plain mean.
        if mask is None:
            return self.reduce_mean_op(losses)
        return self.reduce_sum_op(losses * mask) / self.reduce_sum_op(mask)
 def save_waveplot(path, y_hat, y_target, sample_rate):
    """
    Save a PNG with the target waveform (top) and the predicted waveform (bottom).

    Args:
        path (str): output file path.
        y_hat: predicted waveform.
        y_target: reference waveform.
        sample_rate (int): sampling rate handed to the plotting routine.
    """
    plt.figure(figsize=(16, 6))
    # Row 1 shows the target, row 2 the prediction.
    for row, signal in enumerate((y_target, y_hat), start=1):
        plt.subplot(2, 1, row)
        librosa.display.waveplot(signal, sr=sample_rate)
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
 def eval_model(hparams, global_step, model, x, y, c, g, input_lengths, eval_dir):
    """
    Function for model evaluation. This function is used for debugging in this project.

    One random item of the batch is synthesized with ``model.incremental_forward`` and the
    predicted waveform, the target waveform and a comparison plot are written to ``eval_dir``.

    Args:
        hparams: hyper-parameter object (input_type, quantize_channels, sample_rate, ... are read).
        global_step (int): step number used in the output file names.
        model: network exposing ``set_train`` and ``incremental_forward``.
        x: not used by this function.
        y (Tensor): batch of targets; one item is picked and flattened.
        c (Tensor or None): local conditioning features of the batch.
        g (Tensor or None): global conditioning features of the batch.
        input_lengths (Tensor): valid length of each item of the batch.
        eval_dir (str): output directory, created when missing.
    """
    # NOTE: the model is switched to eval mode here and is not switched back afterwards.
    model.set_train(False)
    # Pick one random item of the batch and trim its target to the valid length.
    idx = np.random.randint(0, len(y))
    length = input_lengths.asnumpy()[idx]
    y_target = np.reshape(y.asnumpy()[idx], (-1))
    y_target = y_target[:length]
    if c is not None:
        expand_op = P.ExpandDims()
        if hparams.upsample_conditional_features:
            # Conditioning is at frame rate here: keep length // hop_size frames plus the
            # padding frames (cin_pad on each side), then restore the batch axis.
            c = expand_op(c[idx, :, :int(length // audio.get_hop_size() + hparams.cin_pad * 2)], 0)
        else:
            c = expand_op(c[idx, :, :length], 0)
        # NOTE(review): dim()/size() are called on the sliced tensor; confirm that the Tensor
        # type in use provides them and that size() really yields the shape the message announces.
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))
    # Dummy silence
    # The value 0 (silence) is encoded the same way as the training input.
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P1.mulaw_quantize(0, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        initial_value = P1.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    # (C,)
    # First input of the generation loop, reshaped to (1, 1, channels).
    if is_mulaw_quantize(hparams.input_type):
        initial_input = to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Tensor(np.reshape(initial_input, (1, 1, hparams.quantize_channels)))
    else:
        initial_input = np.ones((1, 1, 1)) * initial_value
        initial_input = Tensor(initial_input)
    # Run the model in fast eval mode
    y_hat = model.incremental_forward(initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm,
                                      log_scale_min=hparams.log_scale_min)
    # Undo the input encoding so that prediction and target are plain waveforms again.
    if is_mulaw_quantize(hparams.input_type):
        y_hat = np.reshape(np.argmax(y_hat, 1), (-1))
        y_hat = P1.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
        y_target = P1.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = P1.inv_mulaw(np.reshape(y_hat, (-1)), hparams.quantize_channels)
        y_target = P1.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = np.reshape(y_hat, (-1))
    # Save audio
    # NOTE(review): relies on librosa.output.write_wav -- confirm the installed librosa
    # version still ships the librosa.output module.
    os.makedirs(eval_dir, exist_ok=True)
    path = os.path.join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = os.path.join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)
    # Save figure
    path = os.path.join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target, hparams.sample_rate)
 class PredictNet(nn.Cell):
    """
    Inference wrapper around a network.

    Forwards ``(x, c, g)`` to the wrapped network with a constant ``False`` as fourth
    positional argument and returns the network output unchanged.

    Args:
        network (Cell): the network to wrap.
    """
    def __init__(self, network):
        super().__init__(auto_prefix=False)
        self.network = network

    def construct(self, x, c, g):
        """Return the wrapped network's output for ``x`` with conditioning ``c`` and ``g``."""
        return self.network(x, c, g, False)
 class NetWithLossClass(nn.Cell):
    """
    Couples a network with the loss that matches the configured input type.

    Args:
        network (Cell): Pre-defined WaveNet.
        hparams: Parameters; ``input_type`` and ``output_distribution`` select the criterion.

    Returns:
        Tensor, loss tensor.

    Raises:
        RuntimeError: if the input type is not mulaw-quantize and ``output_distribution`` is
            neither "Logistic" nor "Normal".
    """
    def __init__(self, network, hparams):
        super().__init__(auto_prefix=False)
        self.network = network
        self.hparams = hparams
        self.ReduceMean_false = P.ReduceMean(keep_dims=False)
        self.expand_op = P.ExpandDims()
        self.transpose_op = P.Transpose()
        self.reshape_op = P.Reshape()
        self.is_mulaw_quant = is_mulaw_quantize(hparams.input_type)

        # Pick the criterion first and register it once it is known to be valid.
        distribution = hparams.output_distribution
        if self.is_mulaw_quant:
            criterion = MaskedCrossEntropyLoss()
        elif distribution == "Logistic":
            criterion = DiscretizedMixturelogisticLoss(hparams)
        elif distribution == "Normal":
            criterion = MixtureGaussianLoss(hparams)
        else:
            raise RuntimeError(
                "Not supported output distribution type: {}".format(hparams.output_distribution))
        self.criterion = criterion

    def construct(self, x, y, c, g, input_lengths, mask):
        """
        Compute the loss of predicting ``y`` one step ahead.

        The last prediction step and the first target step are dropped so that each remaining
        prediction is compared with the following target step. ``input_lengths`` is not used,
        and ``mask`` is only used by the non-quantized criteria.
        """
        prediction = self.network(x, c, g, False)
        if not self.is_mulaw_quant:
            return self.criterion(prediction[:, :, :-1], y[:, 1:, :], mask[:, 1:, :])
        # Move the class axis last and flatten the leading axes for the sparse cross-entropy.
        logits = self.transpose_op(prediction[:, :, :-1], (0, 2, 1))
        logits = self.reshape_op(logits, (-1, logits.shape[-1]))
        labels = self.reshape_op(y[:, 1:, 0], (-1,))
        return self.criterion(logits, labels)
--- a/model_zoo/research/audio/wavenet/src/lr_generator.py
+++ b/model_zoo/research/audio/wavenet/src/lr_generator.py
@ -0,0 +1,41 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """learning rate generator"""
 import numpy as np
 def get_lr(init_lr, total_epoch, step_per_epoch,
           anneal_rate=0.5,
           anneal_interval=200000):
    """
    Generate a step-wise annealed learning-rate schedule.

    The rate starts at ``init_lr`` and is multiplied by ``anneal_rate`` once every
    ``anneal_interval`` steps.

    Args:
        init_lr (float): Initial learning rate
        total_epoch (int): Total epoch
        step_per_epoch (int): Step per epoch
        anneal_rate (float): multiplicative decay applied at each interval
        anneal_interval (int): number of steps between two decays

    Returns:
        ndarray: float32 array with one learning rate per training step
    """
    n_steps = total_epoch * step_per_epoch
    schedule = [init_lr * anneal_rate ** (step // anneal_interval) for step in range(n_steps)]
    return np.array(schedule).astype(np.float32)
--- a/model_zoo/research/audio/wavenet/train.py
+++ b/model_zoo/research/audio/wavenet/train.py
@ -0,0 +1,135 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """Train WaveNet."""
 import os
 from os.path import join
 import json
 import argparse
 from warnings import warn
 from hparams import hparams, hparams_debug_string
 from mindspore import context, Tensor
 from mindspore.context import ParallelMode
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.nn.optim import Adam
 from mindspore.nn import TrainOneStepCell
 from mindspore.train import Model
 from src.lr_generator import get_lr
 from src.dataset import get_data_loaders
 from src.loss import NetWithLossClass
 from src.callback import Monitor
 from wavenet_vocoder import WaveNet
 from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
 # Command-line interface of the training script.
 parser = argparse.ArgumentParser(description='TTS training')
 parser.add_argument('--data_path', type=str, required=True, default='',
                     help='Directory contains preprocessed features.')
 parser.add_argument('--preset', type=str, required=True, default='', help='Path of preset parameters (json).')
 # The help text used to advertise "checkpoints" as default although the real default differs;
 # let argparse substitute the actual value instead.
 parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
                     help='Directory where to save model checkpoints [default: %(default)s].')
 parser.add_argument('--checkpoint', type=str, default='', help='Restore model from checkpoint path if given.')
 parser.add_argument('--speaker_id', type=str, default='',
                     help=' Use specific speaker of data in case for multi-speaker datasets.')
 parser.add_argument('--is_distributed', action="store_true", default=False, help='Distributed training')
 args = parser.parse_args()
 if __name__ == '__main__':
    # Execution context: data-parallel over all devices when distributed, single device otherwise.
    if args.is_distributed:
        init('nccl')
        rank_id = get_rank()
        group_size = get_group_size()
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=group_size, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
        rank_id = 0
        group_size = 1

    # '' means "no speaker filter".
    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None

    # argparse never yields None for --preset, so test for a non-empty path instead.
    if args.preset:
        with open(args.preset) as f:
            hparams.parse_json(f.read())
    assert hparams.name == "wavenet_vocoder"
    print(hparams_debug_string())
    fs = hparams.sample_rate

    # Keep a copy of the effective hyper-parameters next to the checkpoints.
    os.makedirs(args.checkpoint_dir, exist_ok=True)
    output_json_path = join(args.checkpoint_dir, "hparams.json")
    with open(output_json_path, "w") as f:
        json.dump(hparams.values(), f, indent=2)

    # Pass the parsed speaker id (int or None); the raw string was passed before, leaving the
    # parsed value unused.
    # NOTE(review): get_data_loaders is not visible from here -- confirm it expects int/None.
    data_loaders = get_data_loaders(args.data_path, speaker_id, hparams=hparams, rank_id=rank_id,
                                    group_size=group_size)
    step_size_per_epoch = data_loaders.get_dataset_size()

    # Sanity checks on the hyper-parameter combination.
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    # The upsample network needs to know the conditioning layout as well.
    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )
    loss_net = NetWithLossClass(model, hparams)

    # One learning-rate value per training step.
    lr = get_lr(hparams.optimizer_params["lr"], hparams.nepochs, step_size_per_epoch)
    lr = Tensor(lr)

    # Restore from --checkpoint when given. The code used to read the non-existent attribute
    # args.pre_trained_model_path, which raised AttributeError on every resume.
    if args.checkpoint != '':
        param_dict = load_checkpoint(args.checkpoint)
        load_param_into_net(model, param_dict)
        print('Successfully loading the pre-trained model')

    weights = model.trainable_params()
    # NOTE(review): the optimizer is given loss_scale=1024 while TrainOneStepCell is built
    # without a matching sens value -- confirm this pairing is intended.
    optimizer = Adam(weights, learning_rate=lr, loss_scale=1024.)
    train_net = TrainOneStepCell(loss_net, optimizer)
    trainer = Model(train_net)

    lr_cb = Monitor(lr)
    callback_list = [lr_cb]

    # Each rank writes into its own sub-directory when training is distributed.
    if args.is_distributed:
        ckpt_path = os.path.join(args.checkpoint_dir, 'ckpt_' + str(rank_id) + '/')
    else:
        ckpt_path = args.checkpoint_dir
    # One checkpoint per epoch, keeping the ten most recent ones.
    config_ck = CheckpointConfig(save_checkpoint_steps=step_size_per_epoch, keep_checkpoint_max=10)
    ckpt_cb = ModelCheckpoint(prefix='wavenet', directory=ckpt_path, config=config_ck)
    callback_list.append(ckpt_cb)
    trainer.train(hparams.nepochs, data_loaders, callbacks=callback_list)
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/init.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/init.py
@ -0,0 +1,17 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """init"""
 from __future__ import with_statement, print_function, absolute_import
 from .wavenet import WaveNet
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/conv.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/conv.py
@ -0,0 +1,176 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """Extended Conv1D."""
 import math
 from mindspore import nn, Tensor
 from mindspore.ops import operations as P
 import mindspore.common.dtype as mstype
 import numpy as np
 class Conv1d(nn.Conv1d):
    """
    Extended nn.Conv1d to adapt to incremental dilated convolutions.
    During training, initial Conv1D is used and during evaluation, incremental_forward is called.
    To improve the inference speed, tensor will be converted as numpy and the following calculation is based on numpy.
    These operation will be replaced with MindSpore ops in the future. Currently, some operation is not supported by
    MindSpore and a mixed use of numpy and MindSpore will take a long time.

    Incremental inference keeps a sliding window of past inputs in ``input_buffer`` and applies
    the convolution to it as one matrix product with the flattened ("linearized") kernel.
    Call ``clear_buffer`` before starting a new sequence.
    """
    def __init__(self, *args, **kwargs):
        """Forward all arguments to nn.Conv1d and create the incremental-inference state."""
        super(Conv1d, self).__init__(*args, **kwargs)
        self.clear_buffer()
        # Cache of the flattened kernel, filled by _get_linearized_weight().
        self._linearized_weight = None
        self.transpose_op = P.Transpose()
        self.reshape_op = P.Reshape()
        self.squeeze_op = P.Squeeze(-2)
        self.zeros = P.Zeros()
        self.concat_op = P.Concat(axis=1)
        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()
        # Weight/bias in the form used by the incremental path, filled on first use.
        # NOTE(review): both incremental variants share these two caches but store different
        # types in them (ndarray vs. MindSpore objects); mixing the variants on one instance
        # looks unsafe -- confirm callers stick to one variant.
        self.get_weight = None
        self.get_bias = None
    def incremental_forward(self, inputs, is_numpy=True):
        """Run one incremental step with the numpy variant (default) or the MindSpore-op variant."""
        if is_numpy:
            return self.incremental_forward_numpy(inputs)
        return self.incremental_forward_pynative(inputs)
    def incremental_forward_pynative(self, inputs):
        """
        Incremental forward built from MindSpore ops.
        Args:
            inputs: B x T x C
        Returns:
            Output of the Reshape op with shape (B, 1, -1)
        Raises:
            RuntimeError: if the cell is in training mode.
        """
        # input: (B, T, C)
        if self.training:
            raise RuntimeError('incremental_forward only supports eval mode')
        if self.get_weight is None:
            self.get_weight = self._get_linearized_weight()
        # NOTE(review): get_bias is filled here, but the bias add below reads self.bias directly.
        if self.get_bias is None and self.bias is not None:
            self.get_bias = self.bias
        # Note mindspore uses Conv2D to construct Conv1D
        kw = self.kernel_size[1]
        dilation = self.dilation[1]
        bsz = inputs.shape[0]  # input: bsz x len x dim
        if kw > 1:
            if self.input_buffer is None:
                # The window spans kw taps spaced `dilation` apart: kw + (kw - 1) * (dilation - 1) steps.
                init_buffer = self.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), mstype.float32)
                self.input_buffer = self.concat_op((init_buffer[:, 1:, :], inputs[:, 0:1, :]))
            else:
                # shift buffer
                # NOTE(review): the first time step (0:1) is appended here while the numpy
                # variant appends the last one (-1); they only agree when T == 1 -- confirm.
                self.input_buffer = self.concat_op((self.input_buffer[:, 1:, :], inputs[:, 0:1, :]))
            inputs = self.input_buffer
            if dilation > 1:
                # Keep only the taps the dilated kernel actually looks at.
                inputs = inputs[:, 0::dilation, :]
        # MatMul was created with transpose_b=True, so this is inputs_flat @ weight.T.
        output = self.matmul(self.reshape_op(inputs, (bsz, -1)), self.get_weight)
        if self.bias is not None:
            output = self.bias_add(output, self.bias)
        return self.reshape_op(output, (bsz, 1, -1))
    def incremental_forward_numpy(self, inputs):
        """
        Incremental forward computed with numpy.
        Args:
            inputs: B x T x C
        Returns:
            ndarray of shape (B, 1, -1)
        Raises:
            RuntimeError: if the cell is in training mode.
        """
        # input: (B, T, C)
        if self.training:
            raise RuntimeError('incremental_forward only supports eval mode')
        # Convert weight and bias to numpy once and reuse them on later steps.
        if self.get_weight is None:
            weight = self._get_linearized_weight()
            self.get_weight = weight.asnumpy()
        if self.get_bias is None and self.bias is not None:
            bias = self.bias
            self.get_bias = bias.asnumpy()
        # Note mindspore uses Conv2D to construct Conv1D
        kw = self.kernel_size[1]
        dilation = self.dilation[1]
        bsz = inputs.shape[0]  # input: bsz x len x dim
        if kw > 1:
            if self.input_buffer is None:
                # The window spans kw taps spaced `dilation` apart: kw + (kw - 1) * (dilation - 1) steps.
                self.input_buffer = np.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), dtype=np.float32)
            else:
                # shift buffer
                self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :]
            # append next
            self.input_buffer[:, -1, :] = inputs[:, -1, :]
            inputs = self.input_buffer
            if dilation > 1:
                # Keep only the taps the dilated kernel actually looks at.
                inputs = inputs[:, 0::dilation, :]
        output = inputs.reshape(bsz, -1).dot(self.get_weight.T)
        if self.bias is not None:
            output = output + np.expand_dims(self.get_bias, 0)
        return np.reshape(output, (bsz, 1, -1))
    def clear_buffer(self):
        """Drop the sliding input window; the next incremental step starts from zeros."""
        self.input_buffer = None
    def _get_linearized_weight(self):
        """
        Return the kernel reshaped to (out_channels, -1), computing and caching it on first use.
        """
        # The squeeze runs on every call, even when the cached value below is returned.
        weight = self.squeeze_op(self.weight)
        if self._linearized_weight is None:
            # Note mindspore uses Conv2D to construct Conv1D
            kw = self.kernel_size[1]
            # Bring the kernel-width axis in front of the input-channel axis before flattening.
            if weight.shape == (self.out_channels, self.in_channels, kw):
                weight = self.transpose_op(weight, (0, 2, 1))
            else:
                weight = self.transpose_op(weight, (2, 0, 1))
            self._linearized_weight = self.reshape_op(weight, (self.out_channels, -1))
        return self._linearized_weight
    def _clear_linearized_weight(self, *args):
        """Invalidate the cached flattened kernel (get_weight/get_bias are left untouched)."""
        self._linearized_weight = None
    def _initialize_weights(self):
        """
        Re-initialize weights: normal(0, std) kernels and zero biases for Conv1d cells,
        ones/zeros for BatchNorm2d gamma/beta.
        """
        self.init_parameters_data()
        std_mul = 4.0
        for _, m in self.cells_and_names():
            if isinstance(m, nn.Conv1d):
                # NOTE(review): the fan-in uses self.in_channels rather than m.in_channels.
                std = math.sqrt((std_mul * 0.1) / (m.kernel_size[1] * self.in_channels))
                m.weight.set_data(Tensor(np.random.normal(0, std, m.weight.data.shape).astype("float32")))
                if m.bias is not None:
                    m.bias.set_data(
                        Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
            elif isinstance(m, nn.BatchNorm2d):
                m.gamma.set_data(
                    Tensor(np.ones(m.gamma.data.shape, dtype="float32")))
                m.beta.set_data(
                    Tensor(np.zeros(m.beta.data.shape, dtype="float32")))
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/mixture.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/mixture.py
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/modules.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/modules.py
@ -0,0 +1,213 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """
 modules for wavenet
 """
 from __future__ import with_statement, print_function, absolute_import
 import math
 import numpy as np
 from wavenet_vocoder import conv
 from mindspore import nn
 from mindspore.ops import operations as P
 def Conv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Build a conv.Conv1d layer; `dropout` is accepted but not used, extra keywords are forwarded."""
    return conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs)
 def Conv1d1x1(in_channels, out_channels, has_bias=True):
    """Build a Conv1d with kernel size 1, explicit zero padding and dilation 1."""
    options = dict(kernel_size=1, pad_mode='pad', padding=0, dilation=1, has_bias=has_bias)
    return Conv1d(in_channels, out_channels, **options)
 def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01):
    """Build an nn.Embedding layer; `std` is accepted but not used."""
    return nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
 def _conv1x1_forward(conv_, x, is_incremental, is_numpy=True):
    """
    Conv1x1 forward
    """
    if is_incremental:
        x = conv_.incremental_forward(x, is_numpy=is_numpy)
    else:
        x = conv_(x)
    return x
 class ResidualConv1dGLU(nn.Cell):
    """Residual dilated conv1d with gated activation units

    Args:
        residual_channels (int): Residual input / output channels
        gate_channels (int): Gated activation channels.
        kernel_size (int): Kernel size
        skip_out_channels (int): Skip connection channels. If None, it will set to the same as residual_channels.
        bias (bool): Whether the dilated, residual-output and skip convolutions use a bias.
        dropout (float): Dropout rate.
        dilation (int): Dilation factor.
        cin_channels (int): Local conditioning channels. If not positive, local conditioning is disabled.
        gin_channels (int): Global conditioning channels. If not positive, global conditioning is disabled.
        padding (int): Padding for convolution layers. If None, padding value will be computed according to dilation
        and kernel_size.
        causal (bool): If True, default padding is (kernel_size - 1) * dilation and the convolution output is
        truncated to the input length in `construct`.
    """
    def __init__(self, residual_channels=None, gate_channels=None, kernel_size=None, skip_out_channels=None, bias=True,
                 dropout=1 - 0.95, dilation=1, cin_channels=-1, gin_channels=-1, padding=None, causal=True):
        super(ResidualConv1dGLU, self).__init__()
        self.dropout = dropout
        self.dropout_op = nn.Dropout(keep_prob=1. - self.dropout)
        # incremental (tensor) path splits on the last axis, training path on axis 1
        self.eval_split_op = P.Split(axis=-1, output_num=2)
        self.train_split_op = P.Split(axis=1, output_num=2)
        self.tanh = P.Tanh()
        self.sigmoid = P.Sigmoid()
        self.mul = P.Mul()
        self.add = P.TensorAdd()
        if skip_out_channels is None:
            skip_out_channels = residual_channels
        if padding is None:
            if causal:
                padding = (kernel_size - 1) * dilation
            else:
                padding = (kernel_size - 1) // 2 * dilation
        self.causal = causal
        self.conv = Conv1d(residual_channels, gate_channels, kernel_size, pad_mode='pad',
                           padding=padding, dilation=dilation, has_bias=bias)
        # local conditioning
        if cin_channels > 0:
            self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, has_bias=False)
        else:
            self.conv1x1c = None
        # global conditioning
        if gin_channels > 0:
            self.conv1x1g = Conv1d(gin_channels, gate_channels, has_bias=False, kernel_size=1, dilation=1)
        else:
            self.conv1x1g = None
        # the gate halves the channel count: tanh(a) * sigmoid(b)
        gate_out_channels = gate_channels // 2
        self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, has_bias=bias)
        self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, has_bias=bias)
        self.factor = math.sqrt(0.5)
    def construct(self, x, c=None, g=None):
        """
        Args:
            x(Tensor): One-hot audio signal, the shape is B x C x T
            c(Tensor): local conditional feature, the shape is B x cin_channels x T
            g(Tensor): global conditional feature, not used currently
        Returns:
            tuple: (residual output, skip output)
        """
        residual = x
        x = self.dropout_op(x)
        x = self.conv(x)
        # remove future time steps
        x = x[:, :, :residual.shape[-1]] if self.causal else x
        split_op = self.train_split_op
        a, b = split_op(x)
        # local conditioning
        if c is not None:
            c = _conv1x1_forward(self.conv1x1c, c, is_incremental=False)
            ca, cb = split_op(c)
            a, b = a + ca, b + cb
        # global conditioning
        if g is not None:
            g = _conv1x1_forward(self.conv1x1g, g, is_incremental=False)
            # use the same split op as the other branches; `self.split` is never defined
            ga, gb = split_op(g)
            a, b = a + ga, b + gb
        x = self.mul(self.tanh(a), self.sigmoid(b))
        # For skip connection
        s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=False)
        # For residual connection
        x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=False)
        x = self.add(x, residual) * self.factor
        return x, s
    def sigmoid_numpy(self, x):
        """Logistic sigmoid computed with numpy."""
        return 1. / (1 + np.exp(-x))
    def incremental_forward(self, x, c=None, g=None, is_numpy=True):
        """
        Incremental forward. Used for inference stage
        Args:
            x (Tensor): One-hot audio signal, the shape is B x C x T
            c (Tensor): local conditional feature, the shape is B x cin_channels x T
            g (Tensor): global conditional feature, not used currently
            is_numpy (bool): If True, splitting and gating use numpy; otherwise the MindSpore ops are used.
        Returns:
            tuple: (residual output, skip output)
        """
        residual = x
        x = self.conv.incremental_forward(x, is_numpy=is_numpy)
        if is_numpy:
            a, b = np.split(x, indices_or_sections=2, axis=-1)
        else:
            a, b = self.eval_split_op(x)
        # local conditioning
        if c is not None:
            c = _conv1x1_forward(self.conv1x1c, c, is_incremental=True, is_numpy=is_numpy)
            if is_numpy:
                ca, cb = np.split(c, indices_or_sections=2, axis=-1)
            else:
                ca, cb = self.eval_split_op(c)
            a, b = a + ca, b + cb
        # global conditioning
        if g is not None:
            g = _conv1x1_forward(self.conv1x1g, g, is_incremental=True, is_numpy=is_numpy)
            if is_numpy:
                ga, gb = np.split(g, indices_or_sections=2, axis=-1)
            else:
                # split the projected global feature (g), not the local one (c)
                ga, gb = self.eval_split_op(g)
            a, b = a + ga, b + gb
        if is_numpy:
            x = np.tanh(a) * self.sigmoid_numpy(b)
        else:
            x = self.mul(self.tanh(a), self.sigmoid(b))
        # For skip connection
        s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=True, is_numpy=is_numpy)
        # For residual connection
        x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=True, is_numpy=is_numpy)
        x = (x + residual) * self.factor
        return x, s
    def clear_buffer(self):
        """clear buffer"""
        for c in [self.conv, self.conv1x1_out, self.conv1x1_skip,
                  self.conv1x1c, self.conv1x1g]:
            if c is not None:
                c.clear_buffer()
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/upsample.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/upsample.py
@ -0,0 +1,118 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """
 Upsampling
 """
 from __future__ import with_statement, print_function, absolute_import
 import numpy as np
 from mindspore import nn
 from mindspore.ops import operations as P
 class Resize(nn.Cell):
    """
    Scale the last two axes of a 4-D tensor with nearest-neighbour resizing.

    Args:
        x_scale: Multiplier applied to the size of the last axis.
        y_scale: Multiplier applied to the size of the second-to-last axis.
        mode (str): Stored on the cell; `construct` always uses ResizeNearestNeighbor.
    """
    def __init__(self, x_scale, y_scale, mode="nearest"):
        super(Resize, self).__init__()
        self.x_scale = x_scale
        self.y_scale = y_scale
        self.mode = mode
    def construct(self, x):
        """Resize `x` to (y_scale * H, x_scale * W) over its last two axes."""
        _, _, rows, cols = x.shape
        target_size = (self.y_scale * rows, self.x_scale * cols)
        resize = P.ResizeNearestNeighbor(target_size)
        return resize(x)
 def _get_activation(upsample_activation):
    """Return the attribute of `nn` whose name is `upsample_activation`."""
    return getattr(nn, upsample_activation)
 class UpsampleNetwork(nn.Cell):
    """
    Upsample conditioning features with alternating Resize and Conv2d layers.

    One (Resize, Conv2d) pair is created per entry of `upsample_scales`; each
    Resize multiplies the last axis by that scale and leaves the other axis alone.

    Args:
        upsample_scales (list): Scale factor of each upsampling stage.
        mode (str): Passed through to every Resize layer.
        freq_axis_kernel_size (int): Kernel size of each Conv2d along the first kernel axis.
        cin_pad (int): Only used to compute `self.indent`, which `construct` does not read.
        cin_channels (int): Accepted but not used by this cell.
    """
    def __init__(self, upsample_scales, mode="nearest",
                 freq_axis_kernel_size=1, cin_pad=0, cin_channels=80):
        super(UpsampleNetwork, self).__init__()
        self.expand_op = P.ExpandDims()
        self.squeeze_op = P.Squeeze(1)
        layers = []
        overall_scale = np.prod(upsample_scales)
        self.indent = cin_pad * overall_scale
        for scale in upsample_scales:
            pad_freq = (freq_axis_kernel_size - 1) // 2
            kernel = (freq_axis_kernel_size, scale * 2 + 1)
            # four-element padding: pad_freq twice, then scale twice
            pad = (pad_freq, pad_freq, scale, scale)
            layers.append(Resize(scale, 1, mode))
            layers.append(nn.Conv2d(1, 1, kernel_size=kernel, has_bias=False, pad_mode='pad', padding=pad))
            # NOTE: no activation layer is appended after the convolution.
        self.up_layers = nn.CellList(layers)
    def construct(self, c):
        """
        Args:
            c (Tensor): Local conditioning feature
        Returns:
            Tensor: Upsampling feature
        """
        # B x 1 x C x T
        feature = self.expand_op(c, 1)
        for layer in self.up_layers:
            feature = layer(feature)
        # B x C x T; the output is not trimmed by self.indent
        return self.squeeze_op(feature)
 class ConvInUpsampleNetwork(nn.Cell):
    """Conv1d over the conditioning features followed by an UpsampleNetwork.

    Args:
        upsample_scales (list): Upsample_scales list, forwarded to UpsampleNetwork.
        mode (str): Resize mode, forwarded to UpsampleNetwork.
        freq_axis_kernel_size (int): Freq-axis kernel_size for the convolution layers after resize.
        cin_pad (int): The input convolution uses kernel size 2 * cin_pad + 1 with zero padding;
        UpsampleNetwork itself is always built with cin_pad=0.
        cin_channels (int): Local conditioning channels (input and output channels of the input convolution).
    """
    def __init__(self, upsample_scales, mode="nearest",
                 freq_axis_kernel_size=1, cin_pad=0,
                 cin_channels=80):
        super(ConvInUpsampleNetwork, self).__init__()
        kernel_width = 2 * cin_pad + 1
        self.conv_in = nn.Conv1d(cin_channels, cin_channels, kernel_size=kernel_width,
                                 has_bias=False, pad_mode='pad', padding=0)
        self.upsample = UpsampleNetwork(upsample_scales, mode, freq_axis_kernel_size,
                                        cin_pad=0, cin_channels=cin_channels)
    def construct(self, c):
        """Apply the input convolution, then upsample the result."""
        return self.upsample(self.conv_in(c))
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/wavenet.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/wavenet.py