wavenet

4 years ago · acd40e37e2
parent cfe8d6f32a
commit acd40e37e2
15 changed files with 2584 additions and 0 deletions
--- a/model_zoo/research/audio/wavenet/README.md
+++ b/model_zoo/research/audio/wavenet/README.md
--- a/model_zoo/research/audio/wavenet/evaluate.py
+++ b/model_zoo/research/audio/wavenet/evaluate.py
--- a/model_zoo/research/audio/wavenet/export.py
+++ b/model_zoo/research/audio/wavenet/export.py
@ -0,0 +1,95 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""export mindir."""
+import json
+from os.path import join
+import argparse
+from warnings import warn
+from hparams import hparams, hparams_debug_string
+from mindspore import context, Tensor
+from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
+from wavenet_vocoder import WaveNet
+from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
+import numpy as np
+from src.loss import PredictNet
+
+# Command-line interface of the export script.
+parser = argparse.ArgumentParser(description='Export WaveNet to MindIR')
+parser.add_argument('--preset', type=str, default='', help='Path of preset parameters (json).')
+# The script writes hparams.json into this directory; the option was previously
+# missing, so reading args.checkpoint_dir raised AttributeError.
+parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
+                    help='Directory where the resolved hparams.json is written.')
+parser.add_argument('--speaker_id', type=str, default='',
+                    help=' Use specific speaker of data in case for multi-speaker datasets.')
+parser.add_argument('--pretrain_ckpt', type=str, default='', help='Pretrained checkpoint path')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    # Local import: at module level this file only imports `join` from os.path.
+    from os import makedirs
+
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
+
+    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None
+    # --preset defaults to '' (never None); an empty value means "keep the
+    # built-in hparams" instead of trying to open a file named ''.
+    if args.preset:
+        with open(args.preset) as f:
+            hparams.parse_json(f.read())
+
+    assert hparams.name == "wavenet_vocoder"
+    print(hparams_debug_string())
+
+    fs = hparams.sample_rate
+    # Dump the hyper-parameters actually used for this export.
+    makedirs(args.checkpoint_dir, exist_ok=True)
+    output_json_path = join(args.checkpoint_dir, "hparams.json")
+    with open(output_json_path, "w") as f:
+        json.dump(hparams.values(), f, indent=2)
+
+    if is_mulaw_quantize(hparams.input_type):
+        if hparams.out_channels != hparams.quantize_channels:
+            raise RuntimeError(
+                "out_channels must equal to quantize_channels if input_type is 'mulaw-quantize'")
+    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
+        s = "Upsample conv layers were specified while local conditioning disabled. "
+        s += "Notice that upsample conv layers will never be used."
+        warn(s)
+
+    # The upsampling network needs to know the conditioning layout as well.
+    upsample_params = hparams.upsample_params
+    upsample_params["cin_channels"] = hparams.cin_channels
+    upsample_params["cin_pad"] = hparams.cin_pad
+    model = WaveNet(
+        out_channels=hparams.out_channels,
+        layers=hparams.layers,
+        stacks=hparams.stacks,
+        residual_channels=hparams.residual_channels,
+        gate_channels=hparams.gate_channels,
+        skip_out_channels=hparams.skip_out_channels,
+        cin_channels=hparams.cin_channels,
+        gin_channels=hparams.gin_channels,
+        n_speakers=hparams.n_speakers,
+        dropout=hparams.dropout,
+        kernel_size=hparams.kernel_size,
+        cin_pad=hparams.cin_pad,
+        upsample_conditional_features=hparams.upsample_conditional_features,
+        upsample_params=upsample_params,
+        scalar_input=is_scalar_input(hparams.input_type),
+        output_distribution=hparams.output_distribution,
+    )
+
+    # Wrap the model so that the exported graph takes (x, c, g) only.
+    Net = PredictNet(model)
+    Net.set_train(False)
+    receptive_field = model.receptive_field
+    print("Receptive field (samples / ms): {} / {}".format(receptive_field, receptive_field / fs * 1000))
+    param_dict = load_checkpoint(args.pretrain_ckpt)
+    load_param_into_net(model, param_dict)
+    print('Successfully loading the pre-trained model')
+
+    # Random placeholder inputs: they only fix the shapes/dtypes of the exported graph.
+    x = np.array(np.random.random((2, 256, 10240)), dtype=np.float32)
+    c = np.array(np.random.random((2, 80, 44)), dtype=np.float32)
+    g = np.array([0, 0], dtype=np.int64)
+
+    export(Net, Tensor(x), Tensor(c), Tensor(g), file_name="WaveNet", file_format='MINDIR')
--- a/model_zoo/research/audio/wavenet/src/init.py
+++ b/model_zoo/research/audio/wavenet/src/init.py
@ -0,0 +1,14 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
--- a/model_zoo/research/audio/wavenet/src/callback.py
+++ b/model_zoo/research/audio/wavenet/src/callback.py
@ -0,0 +1,103 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Defined callbacks for WaveNet training.
+"""
+import time
+from mindspore.train.callback import Callback
+from mindspore import Tensor
+import numpy as np
+
+
+class TimeMonitor(Callback):
+    """
+    Callback that prints the wall-clock cost of every step and every epoch.
+
+    Args:
+        data_size (int): number of steps in one epoch, used to derive the
+            average per-step time at the end of an epoch.
+    """
+
+    def __init__(self, data_size):
+        super().__init__()
+        self.data_size = data_size
+
+    def epoch_begin(self, run_context):
+        # Timestamp taken when the epoch starts.
+        self.epoch_time = time.time()
+
+    def epoch_end(self, run_context):
+        # Elapsed time is converted from seconds to milliseconds.
+        elapsed_ms = (time.time() - self.epoch_time) * 1000
+        avg_step_ms = elapsed_ms / self.data_size
+        print("epoch time: {0}, per step time: {1}".format(elapsed_ms, avg_step_ms), flush=True)
+
+    def step_begin(self, run_context):
+        # Timestamp taken when the step starts.
+        self.step_time = time.time()
+
+    def step_end(self, run_context):
+        step_mseconds = (time.time() - self.step_time) * 1000
+        print(f"step time {step_mseconds}", flush=True)
+
+
+class Monitor(Callback):
+    """
+    Monitor loss, time and learning rate during training.
+
+    Args:
+        lr_init (Tensor or numpy array, optional): per-step learning rates,
+            indexed by the global step. When omitted (or when the step runs
+            past its end) the learning rate is reported as nan.
+
+    Returns:
+        None
+
+    Examples:
+        >>> Monitor(lr_init=Tensor([0.05] * 100))
+    """
+
+    def __init__(self, lr_init=None):
+        super(Monitor, self).__init__()
+        self.lr_init = lr_init
+        # len(None) used to raise TypeError for the default argument.
+        self.lr_init_len = 0 if lr_init is None else len(lr_init)
+        self.losses = []
+        self.epoch_time = time.time()
+        self.step_time = time.time()
+
+    def epoch_begin(self, run_context):
+        # Losses are averaged per epoch, so start from an empty list.
+        self.losses = []
+        self.epoch_time = time.time()
+
+    def epoch_end(self, run_context):
+        """Print the epoch duration, the average step duration and the average loss."""
+        cb_params = run_context.original_args()
+
+        # Durations are reported in seconds (no millisecond conversion is applied).
+        epoch_seconds = (time.time() - self.epoch_time)
+        per_step_seconds = epoch_seconds / cb_params.batch_num
+        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.6f}".format(epoch_seconds,
+                                                                                      per_step_seconds,
+                                                                                      np.mean(self.losses)))
+
+    def step_begin(self, run_context):
+        self.step_time = time.time()
+
+    def step_end(self, run_context):
+        """Record the step loss and print progress, loss, time and learning rate."""
+        cb_params = run_context.original_args()
+        step_seconds = (time.time() - self.step_time)
+        step_loss = cb_params.net_outputs
+
+        # The network may return (loss, ...) or the loss tensor itself.
+        if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor):
+            step_loss = step_loss[0]
+        if isinstance(step_loss, Tensor):
+            step_loss = np.mean(step_loss.asnumpy())
+
+        self.losses.append(step_loss)
+        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num
+
+        # Look up the learning rate of this global step. Elements of a Tensor
+        # schedule need asnumpy(); elements of a numpy schedule do not have it.
+        lr_index = cb_params.cur_step_num - 1
+        if 0 <= lr_index < self.lr_init_len:
+            cur_lr = self.lr_init[lr_index]
+            if isinstance(cur_lr, Tensor):
+                cur_lr = cur_lr.asnumpy()
+            cur_lr = float(cur_lr)
+        else:
+            cur_lr = float("nan")
+
+        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.6f}/{:5.6f}], time:[{:5.3f}], lr:[{:.9f}]".format(
+            cb_params.cur_epoch_num -
+            1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
+            np.mean(self.losses), step_seconds, cur_lr))
--- a/model_zoo/research/audio/wavenet/src/dataset.py
+++ b/model_zoo/research/audio/wavenet/src/dataset.py
--- a/model_zoo/research/audio/wavenet/src/loss.py
+++ b/model_zoo/research/audio/wavenet/src/loss.py
@ -0,0 +1,238 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""loss function definition"""
+import os
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+
+from mindspore import nn, Tensor
+from mindspore.ops import operations as P
+from nnmnkwii import preprocessing as P1
+
+from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw
+from wavenet_vocoder.mixture import discretized_mix_logistic_loss
+from wavenet_vocoder.mixture import mix_gaussian_loss
+from train_pytorch import to_categorical
+from tqdm import tqdm
+import audio
+import librosa
+import librosa.display
+matplotlib.use('Agg')
+
+def sequence_mask(sequence_length, max_len=None):
+    """
+    Build a float mask that is 1.0 inside each sequence and 0.0 in the padding.
+
+    Args:
+        sequence_length (Tensor): per-sample lengths (converted with asnumpy()).
+        max_len (int, optional): mask width; defaults to the largest length.
+
+    Returns:
+        Tensor: float32 mask with a trailing singleton axis.
+    """
+    lengths = sequence_length.asnumpy()
+    if max_len is None:
+        max_len = np.max(lengths)
+    n_batch = lengths.shape[0]
+    # Position indices 0..max_len-1 repeated for every sample.
+    positions = np.tile(np.arange(max_len, dtype=np.int32)[np.newaxis, :], (n_batch, 1))
+    # Each sample's length repeated along the time axis.
+    limits = np.tile(lengths[:, np.newaxis], (1, max_len))
+    mask = (positions < limits).astype(np.float32)
+    return Tensor(mask[..., np.newaxis])
+
+class MaskedCrossEntropyLoss(nn.Cell):
+    """Mean softmax cross-entropy over sparse (class-index) targets."""
+    def __init__(self):
+        super().__init__()
+        self.criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
+
+    def construct(self, inputs, target):
+        # The wrapped criterion already reduces with 'mean'.
+        return self.criterion(inputs, target)
+
+
+class DiscretizedMixturelogisticLoss(nn.Cell):
+    """
+    Discretized mixture-of-logistics loss with optional masking.
+
+    Args:
+        hparams: hyper-parameters providing `quantize_channels` and `log_scale_min`.
+    """
+    def __init__(self, hparams):
+        super(DiscretizedMixturelogisticLoss, self).__init__()
+        self.quantize_channels = hparams.quantize_channels
+        self.log_scale_min = hparams.log_scale_min
+        # reduce=False: the element-wise losses are reduced in construct().
+        self.discretized_mix_logistic_loss = discretized_mix_logistic_loss(num_classes=hparams.quantize_channels,
+                                                                           log_scale_min=hparams.log_scale_min,
+                                                                           reduce=False)
+        self.reduce_sum_op = P.ReduceSum()
+        self.reduce_mean_op = P.ReduceMean()
+
+    def construct(self, inputs, target, mask=None):
+        """
+        Args:
+            inputs (Tensor): Predicted distribution
+            target (Tensor): Target
+            mask (Tensor, optional): Mask; when omitted the plain mean is returned.
+
+        Returns:
+            Tensor: Loss tensor
+        """
+        losses = self.discretized_mix_logistic_loss(inputs, target)
+        # mask defaults to None, and `losses * None` used to fail.
+        if mask is None:
+            return self.reduce_mean_op(losses)
+        # Masked mean: sum of the kept losses over the number of kept positions.
+        return self.reduce_sum_op(losses * mask) / self.reduce_sum_op(mask)
+
+
+class MixtureGaussianLoss(nn.Cell):
+    """
+    Mixture-of-Gaussians loss with optional masking.
+
+    Args:
+        hparams: hyper-parameters providing `quantize_channels` and `log_scale_min`.
+    """
+    def __init__(self, hparams):
+        super(MixtureGaussianLoss, self).__init__()
+        self.quantize_channels = hparams.quantize_channels
+        self.log_scale_min = hparams.log_scale_min
+        # reduce=False: the element-wise losses are reduced in construct().
+        self.mix_gaussian_loss = mix_gaussian_loss(log_scale_min=hparams.log_scale_min, reduce=False)
+        self.reduce_sum_op = P.ReduceSum()
+        self.reduce_mean_op = P.ReduceMean()
+
+    def construct(self, inputs, target, mask=None):
+        """
+
+        Args:
+            inputs (Tensor): Predicted distribution
+            target (Tensor): Target
+            mask (Tensor, optional): Mask; when omitted the plain mean is returned.
+
+        Returns:
+            Tensor: Loss tensor
+
+        """
+        losses = self.mix_gaussian_loss(inputs, target)
+        # mask defaults to None, and `losses * None` used to fail.
+        if mask is None:
+            return self.reduce_mean_op(losses)
+        # Masked mean: sum of the kept losses over the number of kept positions.
+        return self.reduce_sum_op(losses * mask) / self.reduce_sum_op(mask)
+
+
+def save_waveplot(path, y_hat, y_target, sample_rate):
+    """Save a two-row PNG figure: target waveform on top, prediction below."""
+    plt.figure(figsize=(16, 6))
+    # Row 1 holds the target, row 2 the prediction.
+    for row, waveform in enumerate((y_target, y_hat), start=1):
+        plt.subplot(2, 1, row)
+        librosa.display.waveplot(waveform, sr=sample_rate)
+    plt.tight_layout()
+    plt.savefig(path, format="png")
+    plt.close()
+
+
+def eval_model(hparams, global_step, model, x, y, c, g, input_lengths, eval_dir):
+    """
+    Function for model evaluation. This function is used for debugging in this project.
+    """
+    # Overview: pick one random sample of the batch, generate audio for it with
+    # model.incremental_forward, and write predicted/target wav files plus a
+    # waveform plot into eval_dir. The `x` argument is not used in this function.
+
+    model.set_train(False)
+    # Random sample index; the target is flattened and cut to its own length.
+    idx = np.random.randint(0, len(y))
+    length = input_lengths.asnumpy()[idx]
+    y_target = np.reshape(y.asnumpy()[idx], (-1))
+    y_target = y_target[:length]
+
+    if c is not None:
+        expand_op = P.ExpandDims()
+        # Keep the chosen sample only and restore a leading batch axis of size 1.
+        if hparams.upsample_conditional_features:
+            # Frame count: length // hop size, plus cin_pad twice
+            # (presumably padding on both sides - TODO confirm).
+            c = expand_op(c[idx, :, :int(length // audio.get_hop_size() + hparams.cin_pad * 2)], 0)
+        else:
+            c = expand_op(c[idx, :, :length], 0)
+        # NOTE(review): dim()/size() look like the PyTorch tensor API; confirm the
+        # MindSpore Tensor in use provides them and that size() really is a shape.
+        assert c.dim() == 3
+        print("Shape of local conditioning features: {}".format(c.size()))
+
+    if g is not None:
+        g = g[idx]
+        print("Shape of global conditioning features: {}".format(g.size()))
+
+    # Dummy silence
+    # The zero sample is encoded in the same domain as the model input.
+    if is_mulaw_quantize(hparams.input_type):
+        initial_value = P1.mulaw_quantize(0, hparams.quantize_channels - 1)
+    elif is_mulaw(hparams.input_type):
+        initial_value = P1.mulaw(0.0, hparams.quantize_channels)
+    else:
+        initial_value = 0.0
+
+    # (C,)
+    if is_mulaw_quantize(hparams.input_type):
+        # Quantized input: categorical vector reshaped to (1, 1, quantize_channels).
+        initial_input = to_categorical(
+            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
+        initial_input = Tensor(np.reshape(initial_input, (1, 1, hparams.quantize_channels)))
+
+    else:
+        # Scalar input: a single value of shape (1, 1, 1).
+        initial_input = np.ones((1, 1, 1)) * initial_value
+        initial_input = Tensor(initial_input)
+
+    # Run the model in fast eval mode
+    y_hat = model.incremental_forward(initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm,
+                                      log_scale_min=hparams.log_scale_min)
+
+    # Convert prediction (and target where needed) back to waveform samples.
+    # The numpy calls below assume y_hat is array-like - TODO confirm.
+    if is_mulaw_quantize(hparams.input_type):
+        y_hat = np.reshape(np.argmax(y_hat, 1), (-1))
+        y_hat = P1.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
+        y_target = P1.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
+    elif is_mulaw(hparams.input_type):
+        y_hat = P1.inv_mulaw(np.reshape(y_hat, (-1)), hparams.quantize_channels)
+        y_target = P1.inv_mulaw(y_target, hparams.quantize_channels)
+    else:
+        y_hat = np.reshape(y_hat, (-1))
+
+    # Save audio
+    # NOTE(review): librosa.output.write_wav is only available in older librosa
+    # releases - confirm the pinned librosa version.
+    os.makedirs(eval_dir, exist_ok=True)
+    path = os.path.join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
+    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
+
+    path = os.path.join(eval_dir, "step{:09d}_target.wav".format(global_step))
+    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)
+
+    # Save figure
+    path = os.path.join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
+    save_waveplot(path, y_hat, y_target, hparams.sample_rate)
+
+
+class PredictNet(nn.Cell):
+    """
+    Inference wrapper that calls the wrapped network with its 4th argument fixed to False.
+
+    Args:
+        network (Cell): Pre-defined WaveNet.
+    """
+
+    def __init__(self, network):
+        super().__init__(auto_prefix=False)
+        self.network = network
+
+    def construct(self, x, c, g):
+        # Forward (x, c, g) unchanged; the trailing False is the fixed 4th argument.
+        return self.network(x, c, g, False)
+
+
+class NetWithLossClass(nn.Cell):
+    """
+    Couples WaveNet with the loss selected by the hyper-parameters.
+
+    Args:
+        network (Cell): Pre-defined WaveNet.
+        hparams (optional): Parameters.
+
+    Returns:
+        Tensor, loss tensor.
+    """
+    def __init__(self, network, hparams):
+        super().__init__(auto_prefix=False)
+        self.network = network
+        self.hparams = hparams
+        self.ReduceMean_false = P.ReduceMean(keep_dims=False)
+        self.expand_op = P.ExpandDims()
+        self.transpose_op = P.Transpose()
+        self.reshape_op = P.Reshape()
+        self.is_mulaw_quant = is_mulaw_quantize(hparams.input_type)
+
+        # Quantized input -> cross entropy; otherwise the loss follows the output distribution.
+        distribution = hparams.output_distribution
+        if self.is_mulaw_quant:
+            self.criterion = MaskedCrossEntropyLoss()
+        elif distribution == "Logistic":
+            self.criterion = DiscretizedMixturelogisticLoss(hparams)
+        elif distribution == "Normal":
+            self.criterion = MixtureGaussianLoss(hparams)
+        else:
+            self.criterion = None
+            raise RuntimeError(
+                "Not supported output distribution type: {}".format(hparams.output_distribution))
+
+    def construct(self, x, y, c, g, input_lengths, mask):
+        # Prediction at position t is compared with the target at position t + 1,
+        # hence the [:-1] / [1:] slices below.
+        y_hat = self.network(x, c, g, False)
+        if self.is_mulaw_quant:
+            # Move the last axis of y_hat to the middle, then flatten to one row per position.
+            logits = self.transpose_op(y_hat[:, :, :-1], (0, 2, 1))
+            logits = self.reshape_op(logits, (-1, logits.shape[-1]))
+            labels = self.reshape_op(y[:, 1:, 0], (-1,))
+            return self.criterion(logits, labels)
+        return self.criterion(y_hat[:, :, :-1], y[:, 1:, :], mask[:, 1:, :])
--- a/model_zoo/research/audio/wavenet/src/lr_generator.py
+++ b/model_zoo/research/audio/wavenet/src/lr_generator.py
@ -0,0 +1,41 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""learning rate generator"""
+import numpy as np
+
+
+def get_lr(init_lr, total_epoch, step_per_epoch,
+           anneal_rate=0.5,
+           anneal_interval=200000):
+    """
+    Build a step-wise annealed learning-rate schedule.
+
+    The rate starts at `init_lr` and is multiplied by `anneal_rate` once every
+    `anneal_interval` steps.
+
+    Args:
+        init_lr (float): Initial learning rate
+        total_epoch (int): Total epoch
+        step_per_epoch (int): Step per epoch
+        anneal_rate (float): anneal rate
+        anneal_interval (int): anneal interval
+
+    Returns:
+        ndarray: float32 learning rate for each of the
+        total_epoch * step_per_epoch steps
+
+    """
+    n_steps = total_epoch * step_per_epoch
+    schedule = [init_lr * anneal_rate ** (step // anneal_interval) for step in range(n_steps)]
+    return np.array(schedule).astype(np.float32)
--- a/model_zoo/research/audio/wavenet/train.py
+++ b/model_zoo/research/audio/wavenet/train.py
@ -0,0 +1,135 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""train wavenet."""
+import os
+from os.path import join
+import json
+import argparse
+from warnings import warn
+from hparams import hparams, hparams_debug_string
+
+from mindspore import context, Tensor
+from mindspore.context import ParallelMode
+from mindspore.communication.management import init, get_rank, get_group_size
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore.nn.optim import Adam
+from mindspore.nn import TrainOneStepCell
+from mindspore.train import Model
+from src.lr_generator import get_lr
+from src.dataset import get_data_loaders
+from src.loss import NetWithLossClass
+from src.callback import Monitor
+from wavenet_vocoder import WaveNet
+from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
+
+# Command-line interface of the training script.
+parser = argparse.ArgumentParser(description='TTS training')
+parser.add_argument('--data_path', type=str, required=True, default='',
+                    help='Directory contains preprocessed features.')
+parser.add_argument('--preset', type=str, required=True, default='', help='Path of preset parameters (json).')
+parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
+                    help='Directory where to save model checkpoints [default: checkpoints].')
+parser.add_argument('--checkpoint', type=str, default='', help='Restore model from checkpoint path if given.')
+parser.add_argument('--speaker_id', type=str, default='',
+                    help=' Use specific speaker of data in case for multi-speaker datasets.')
+parser.add_argument('--is_distributed', action="store_true", default=False, help='Distributed training')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    # Device / parallel setup: one process per GPU in distributed mode.
+    if args.is_distributed:
+        init('nccl')
+        rank_id = get_rank()
+        group_size = get_group_size()
+        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
+        context.reset_auto_parallel_context()
+        context.set_auto_parallel_context(device_num=group_size, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          gradients_mean=True)
+    else:
+        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
+        rank_id = 0
+        group_size = 1
+
+    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None
+    # argparse never yields None for --preset; skip only an empty value.
+    if args.preset:
+        with open(args.preset) as f:
+            hparams.parse_json(f.read())
+
+    assert hparams.name == "wavenet_vocoder"
+    print(hparams_debug_string())
+    fs = hparams.sample_rate
+    os.makedirs(args.checkpoint_dir, exist_ok=True)
+
+    # Keep a copy of the hyper-parameters next to the checkpoints.
+    output_json_path = join(args.checkpoint_dir, "hparams.json")
+    with open(output_json_path, "w") as f:
+        json.dump(hparams.values(), f, indent=2)
+
+    # NOTE(review): the raw string args.speaker_id is passed here, not the parsed
+    # `speaker_id` (int or None) - confirm which one get_data_loaders expects.
+    data_loaders = get_data_loaders(args.data_path, args.speaker_id, hparams=hparams, rank_id=rank_id,
+                                    group_size=group_size)
+    step_size_per_epoch = data_loaders.get_dataset_size()
+
+    if is_mulaw_quantize(hparams.input_type):
+        if hparams.out_channels != hparams.quantize_channels:
+            raise RuntimeError(
+                "out_channels must equal to quantize_channels if input_type is 'mulaw-quantize'")
+    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
+        s = "Upsample conv layers were specified while local conditioning disabled. "
+        s += "Notice that upsample conv layers will never be used."
+        warn(s)
+
+    # The upsampling network needs to know the conditioning layout as well.
+    upsample_params = hparams.upsample_params
+    upsample_params["cin_channels"] = hparams.cin_channels
+    upsample_params["cin_pad"] = hparams.cin_pad
+    model = WaveNet(
+        out_channels=hparams.out_channels,
+        layers=hparams.layers,
+        stacks=hparams.stacks,
+        residual_channels=hparams.residual_channels,
+        gate_channels=hparams.gate_channels,
+        skip_out_channels=hparams.skip_out_channels,
+        cin_channels=hparams.cin_channels,
+        gin_channels=hparams.gin_channels,
+        n_speakers=hparams.n_speakers,
+        dropout=hparams.dropout,
+        kernel_size=hparams.kernel_size,
+        cin_pad=hparams.cin_pad,
+        upsample_conditional_features=hparams.upsample_conditional_features,
+        upsample_params=upsample_params,
+        scalar_input=is_scalar_input(hparams.input_type),
+        output_distribution=hparams.output_distribution,
+    )
+    loss_net = NetWithLossClass(model, hparams)
+    # One learning-rate value per training step.
+    lr = get_lr(hparams.optimizer_params["lr"], hparams.nepochs, step_size_per_epoch)
+    lr = Tensor(lr)
+
+    if args.checkpoint != '':
+        # The option is named --checkpoint; args.pre_trained_model_path does not
+        # exist and used to raise AttributeError here.
+        param_dict = load_checkpoint(args.checkpoint)
+        load_param_into_net(model, param_dict)
+        print('Successfully loading the pre-trained model')
+
+    weights = model.trainable_params()
+    optimizer = Adam(weights, learning_rate=lr, loss_scale=1024.)
+    train_net = TrainOneStepCell(loss_net, optimizer)
+
+    model = Model(train_net)
+    lr_cb = Monitor(lr)
+    callback_list = [lr_cb]
+    # Every rank saves into its own sub-directory in distributed mode.
+    if args.is_distributed:
+        ckpt_path = os.path.join(args.checkpoint_dir, 'ckpt_' + str(rank_id) + '/')
+    else:
+        ckpt_path = args.checkpoint_dir
+    # Save once per epoch and keep the 10 most recent checkpoints.
+    config_ck = CheckpointConfig(save_checkpoint_steps=step_size_per_epoch, keep_checkpoint_max=10)
+    ckpt_cb = ModelCheckpoint(prefix='wavenet', directory=ckpt_path, config=config_ck)
+    callback_list.append(ckpt_cb)
+    model.train(hparams.nepochs, data_loaders, callbacks=callback_list)
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/init.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/init.py
@ -0,0 +1,17 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""init"""
+from __future__ import with_statement, print_function, absolute_import
+from .wavenet import WaveNet
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/conv.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/conv.py
@ -0,0 +1,176 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Extended Conv1D."""
+
+import math
+from mindspore import nn, Tensor
+from mindspore.ops import operations as P
+import mindspore.common.dtype as mstype
+import numpy as np
+
+class Conv1d(nn.Conv1d):
+    """
+    Extended nn.Conv1d to adapt to incremental dilated convolutions.
+    During training, initial Conv1D is used and during evaluation, incremental_forward is called.
+    To improve the inference speed, tensor will be converted as numpy and the following calculation is based on numpy.
+    These operation will be replaced with MindSpore ops in the future. Currently, some operation is not supported by
+    MindSpore and a mixed use of numpy and MindSpore will take a long time.
+
+    NOTE: `get_weight`, `get_bias` and `input_buffer` are shared by the numpy and the
+    pynative path, so one instance should stick to a single path between resets.
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(Conv1d, self).__init__(*args, **kwargs)
+        self.clear_buffer()
+        self._linearized_weight = None
+        self.transpose_op = P.Transpose()
+        self.reshape_op = P.Reshape()
+        self.squeeze_op = P.Squeeze(-2)
+        self.zeros = P.Zeros()
+        self.concat_op = P.Concat(axis=1)
+        self.matmul = P.MatMul(transpose_b=True)
+        self.bias_add = P.BiasAdd()
+        # Caches filled lazily by the incremental forward passes.
+        self.get_weight = None
+        self.get_bias = None
+
+    def incremental_forward(self, inputs, is_numpy=True):
+        """Run one incremental step with the numpy path (default) or the pynative path."""
+        if is_numpy:
+            return self.incremental_forward_numpy(inputs)
+        return self.incremental_forward_pynative(inputs)
+
+    def incremental_forward_pynative(self, inputs):
+        """
+        Incremental forward based on MindSpore ops.
+
+        Args:
+            inputs: B x T x C
+
+        Returns:
+            Tensor, reshaped to (B, 1, -1)
+
+        Raises:
+            RuntimeError: if the cell is in training mode.
+
+        """
+        # input: (B, T, C)
+        if self.training:
+            raise RuntimeError('incremental_forward only supports eval mode')
+
+        if self.get_weight is None:
+            self.get_weight = self._get_linearized_weight()
+
+        if self.get_bias is None and self.bias is not None:
+            self.get_bias = self.bias
+
+        # Note mindspore uses Conv2D to construct Conv1D
+        kw = self.kernel_size[1]
+        dilation = self.dilation[1]
+
+        bsz = inputs.shape[0]  # input: bsz x len x dim
+        if kw > 1:
+            if self.input_buffer is None:
+                # The buffer spans the dilated kernel: kw + (kw - 1) * (dilation - 1) steps.
+                init_buffer = self.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), mstype.float32)
+                self.input_buffer = self.concat_op((init_buffer[:, 1:, :], inputs[:, 0:1, :]))
+            else:
+                # shift buffer
+                self.input_buffer = self.concat_op((self.input_buffer[:, 1:, :], inputs[:, 0:1, :]))
+            inputs = self.input_buffer
+            if dilation > 1:
+                # Keep only the time steps the dilated kernel actually looks at.
+                inputs = inputs[:, 0::dilation, :]
+
+        output = self.matmul(self.reshape_op(inputs, (bsz, -1)), self.get_weight)
+        if self.bias is not None:
+            output = self.bias_add(output, self.bias)
+        return self.reshape_op(output, (bsz, 1, -1))
+
+    def incremental_forward_numpy(self, inputs):
+        """
+        Incremental forward based on numpy.
+
+        Args:
+            inputs: B x T x C
+
+        Returns:
+            ndarray, reshaped to (B, 1, -1)
+
+        Raises:
+            RuntimeError: if the cell is in training mode.
+
+        """
+        # input: (B, T, C)
+        if self.training:
+            raise RuntimeError('incremental_forward only supports eval mode')
+
+        if self.get_weight is None:
+            weight = self._get_linearized_weight()
+            self.get_weight = weight.asnumpy()
+
+        if self.get_bias is None and self.bias is not None:
+            bias = self.bias
+            self.get_bias = bias.asnumpy()
+
+        # Note mindspore uses Conv2D to construct Conv1D
+        kw = self.kernel_size[1]
+        dilation = self.dilation[1]
+
+        bsz = inputs.shape[0]  # input: bsz x len x dim
+        if kw > 1:
+            if self.input_buffer is None:
+                # The buffer spans the dilated kernel: kw + (kw - 1) * (dilation - 1) steps.
+                self.input_buffer = np.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), dtype=np.float32)
+            else:
+                # shift buffer
+                self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :]
+            # append next
+            self.input_buffer[:, -1, :] = inputs[:, -1, :]
+            inputs = self.input_buffer
+            if dilation > 1:
+                # Keep only the time steps the dilated kernel actually looks at.
+                inputs = inputs[:, 0::dilation, :]
+        output = inputs.reshape(bsz, -1).dot(self.get_weight.T)
+        if self.bias is not None:
+            output = output + np.expand_dims(self.get_bias, 0)
+        return np.reshape(output, (bsz, 1, -1))
+
+    def clear_buffer(self):
+        """Drop the buffered input history so the next step starts from zeros."""
+        self.input_buffer = None
+
+    def _get_linearized_weight(self):
+        """
+        Return the kernel flattened to (out_channels, -1); the result is cached.
+        """
+        if self._linearized_weight is None:
+            # Squeeze only on a cache miss; it used to run on every call.
+            weight = self.squeeze_op(self.weight)
+            # Note mindspore uses Conv2D to construct Conv1D
+            kw = self.kernel_size[1]
+            if weight.shape == (self.out_channels, self.in_channels, kw):
+                weight = self.transpose_op(weight, (0, 2, 1))
+            else:
+                weight = self.transpose_op(weight, (2, 0, 1))
+            self._linearized_weight = self.reshape_op(weight, (self.out_channels, -1))
+        return self._linearized_weight
+
+    def _clear_linearized_weight(self, *args):
+        """Invalidate every cached copy of the kernel and the bias."""
+        self._linearized_weight = None
+        # The incremental paths read these caches, not _linearized_weight, so they
+        # must be dropped too or stale weights keep being used.
+        self.get_weight = None
+        self.get_bias = None
+
+    def _initialize_weights(self):
+        """
+        weight initialization
+        """
+        self.init_parameters_data()
+        std_mul = 4.0
+        for _, m in self.cells_and_names():
+            if isinstance(m, nn.Conv1d):
+                # Normal(0, std) weights, zero bias.
+                std = math.sqrt((std_mul * 0.1) / (m.kernel_size[1] * self.in_channels))
+                m.weight.set_data(Tensor(np.random.normal(0, std, m.weight.data.shape).astype("float32")))
+                if m.bias is not None:
+                    m.bias.set_data(
+                        Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
+            elif isinstance(m, nn.BatchNorm2d):
+                # gamma = 1, beta = 0.
+                m.gamma.set_data(
+                    Tensor(np.ones(m.gamma.data.shape, dtype="float32")))
+                m.beta.set_data(
+                    Tensor(np.zeros(m.beta.data.shape, dtype="float32")))
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/mixture.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/mixture.py
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/modules.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/modules.py
@ -0,0 +1,213 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+modules for wavenet
+"""
+from __future__ import with_statement, print_function, absolute_import
+import math
+import numpy as np
+from wavenet_vocoder import conv
+from mindspore import nn
+from mindspore.ops import operations as P
+
+
+def Conv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
+    m = conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs)
+    return m
+
+
+def Conv1d1x1(in_channels, out_channels, has_bias=True):
+    return Conv1d(in_channels, out_channels, kernel_size=1, pad_mode='pad', padding=0, dilation=1, has_bias=has_bias)
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01):
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    return m
+
+
+def _conv1x1_forward(conv_, x, is_incremental, is_numpy=True):
+    """
+    Conv1x1 forward
+    """
+    if is_incremental:
+        x = conv_.incremental_forward(x, is_numpy=is_numpy)
+    else:
+        x = conv_(x)
+    return x
+
+
+class ResidualConv1dGLU(nn.Cell):
+    """Residual dilated conv1d with gated activation units
+
+    Args:
+        residual_channels (int): Residual input / output channels
+        gate_channels (int): Gated activation channels.
+        kernel_size (int): Kernel size
+        skip_out_channels (int): Skip connection channels. If None, it will set to the same as residual_channels.
+        cin_channels (int): Local conditioning channels. If given negative value, local conditioning is disabled.
+        gin_channels (int): Global conditioning channels. If given negative value, global conditioning is disabled.
+        dropout (float): Dropout rate.
+        padding (int): Padding for convolution layers. If None, padding value will be computed according to dilation
+        and kernel_size.
+        dilation (int): Dilation factor.
+
+    """
+
+    def __init__(self, residual_channels=None, gate_channels=None, kernel_size=None, skip_out_channels=None, bias=True,
+                 dropout=1 - 0.95, dilation=1, cin_channels=-1, gin_channels=-1, padding=None, causal=True):
+        super(ResidualConv1dGLU, self).__init__()
+        self.dropout = dropout
+        self.dropout_op = nn.Dropout(keep_prob=1. - self.dropout)
+        self.eval_split_op = P.Split(axis=-1, output_num=2)
+        self.train_split_op = P.Split(axis=1, output_num=2)
+        self.tanh = P.Tanh()
+        self.sigmoid = P.Sigmoid()
+        self.mul = P.Mul()
+        self.add = P.TensorAdd()
+
+        if skip_out_channels is None:
+            skip_out_channels = residual_channels
+        if padding is None:
+            if causal:
+                padding = (kernel_size - 1) * dilation
+            else:
+                padding = (kernel_size - 1) // 2 * dilation
+        self.causal = causal
+
+        self.conv = Conv1d(residual_channels, gate_channels, kernel_size, pad_mode='pad',
+                           padding=padding, dilation=dilation, has_bias=bias)
+
+        # local conditioning
+        if cin_channels > 0:
+            self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, has_bias=False)
+        else:
+            self.conv1x1c = None
+
+        # global conditioning
+        if gin_channels > 0:
+            self.conv1x1g = Conv1d(gin_channels, gate_channels, has_bias=False, kernel_size=1, dilation=1)
+        else:
+            self.conv1x1g = None
+
+        gate_out_channels = gate_channels // 2
+        self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, has_bias=bias)
+        self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, has_bias=bias)
+        self.factor = math.sqrt(0.5)
+
+    def construct(self, x, c=None, g=None):
+        """
+
+        Args:
+            x(Tensor): One-hot audio signal, the shape is B x C x T
+            c(Tensor): local conditional feature, the shape is B x cin_channels x T
+            g(Tensor): global conditional feature, not used currently
+
+        Returns:
+            Tensor: Output tensor
+
+        """
+
+        residual = x
+        x = self.dropout_op(x)
+        x = self.conv(x)
+        # remove future time steps
+        x = x[:, :, :residual.shape[-1]] if self.causal else x
+        split_op = self.train_split_op
+
+        a, b = split_op(x)
+
+        # local conditioning
+        if c is not None:
+            c = _conv1x1_forward(self.conv1x1c, c, is_incremental=False)
+            ca, cb = split_op(c)
+            a, b = a + ca, b + cb
+
+        # global conditioning
+        if g is not None:
+            g = _conv1x1_forward(self.conv1x1g, g, is_incremental=False)
+            ga, gb = self.split(g)
+            a, b = a + ga, b + gb
+
+        x = self.mul(self.tanh(a), self.sigmoid(b))
+
+        # For skip connection
+        s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=False)
+
+        # For residual connection
+        x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=False)
+
+        x = self.add(x, residual) * self.factor
+        return x, s
+
+    def sigmoid_numpy(self, x):
+        return 1. / (1 + np.exp(-x))
+
+    def incremental_forward(self, x, c=None, g=None, is_numpy=True):
+        """
+        Incremental forward. Used for inference stage
+
+        Args:
+            x (Tensor): One-hot audio signal, the shape is B x C x T
+            c (Tensor): local conditional feature, the shape is B x cin_channels x T
+            g (Tensor): global conditional feature, not used currently
+
+        Returns:
+            ndarray
+        """
+        residual = x
+        x = self.conv.incremental_forward(x, is_numpy=is_numpy)
+        if is_numpy:
+            a, b = np.split(x, indices_or_sections=2, axis=-1)
+        else:
+            a, b = self.eval_split_op(x)
+
+        # local conditioning
+        if c is not None:
+            c = _conv1x1_forward(self.conv1x1c, c, is_incremental=True, is_numpy=is_numpy)
+            if is_numpy:
+                ca, cb = np.split(c, indices_or_sections=2, axis=-1)
+            else:
+                ca, cb = self.eval_split_op(c)
+            a, b = a + ca, b + cb
+
+        # global conditioning
+        if g is not None:
+            g = _conv1x1_forward(self.conv1x1g, g, is_incremental=True, is_numpy=is_numpy)
+            if is_numpy:
+                ga, gb = np.split(g, indices_or_sections=2, axis=-1)
+            else:
+                ga, gb = self.eval_split_op(c)
+            a, b = a + ga, b + gb
+
+        if is_numpy:
+            x = np.tanh(a) * self.sigmoid_numpy(b)
+        else:
+            x = self.mul(self.tanh(a), self.sigmoid(b))
+
+        # For skip connection
+        s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=True, is_numpy=is_numpy)
+
+        # For residual connection
+        x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=True, is_numpy=is_numpy)
+
+        x = (x + residual) * self.factor
+        return x, s
+
+    def clear_buffer(self):
+        """clear buffer"""
+        for c in [self.conv, self.conv1x1_out, self.conv1x1_skip,
+                  self.conv1x1c, self.conv1x1g]:
+            if c is not None:
+                c.clear_buffer()
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/upsample.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/upsample.py
@ -0,0 +1,118 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Upsampling
+
+"""
+from __future__ import with_statement, print_function, absolute_import
+import numpy as np
+from mindspore import nn
+from mindspore.ops import operations as P
+
+
+class Resize(nn.Cell):
+    """
+    Resize input Tensor
+    """
+
+    def __init__(self, x_scale, y_scale, mode="nearest"):
+        super(Resize, self).__init__()
+        self.x_scale = x_scale
+        self.y_scale = y_scale
+        self.mode = mode
+
+    def construct(self, x):
+        _, _, h, w = x.shape
+        interpolate_op = P.ResizeNearestNeighbor((self.y_scale * h, self.x_scale * w))
+        return interpolate_op(x)
+
+
+def _get_activation(upsample_activation):
+    """get activation"""
+    nonlinear = getattr(nn, upsample_activation)
+    return nonlinear
+
+
+class UpsampleNetwork(nn.Cell):
+    """UpsampleNetwork"""
+    def __init__(self, upsample_scales, mode="nearest",
+                 freq_axis_kernel_size=1, cin_pad=0, cin_channels=80):
+        super(UpsampleNetwork, self).__init__()
+        self.expand_op = P.ExpandDims()
+        self.squeeze_op = P.Squeeze(1)
+        up_layers = []
+        total_scale = np.prod(upsample_scales)
+        self.indent = cin_pad * total_scale
+        for scale in upsample_scales:
+            freq_axis_padding = (freq_axis_kernel_size - 1) // 2
+            k_size = (freq_axis_kernel_size, scale * 2 + 1)
+            # padding = (freq_axis_padding, scale)
+            padding = (freq_axis_padding, freq_axis_padding, scale, scale)
+            stretch = Resize(scale, 1, mode)
+            conv = nn.Conv2d(1, 1, kernel_size=k_size, has_bias=False, pad_mode='pad', padding=padding)
+            up_layers.append(stretch)
+            up_layers.append(conv)
+            # if upsample_activation != "none":
+            #     nonlinear = _get_activation(upsample_activation)
+            #     up_layers.append(nonlinear(**upsample_activation_params))
+        self.up_layers = nn.CellList(up_layers)
+
+    def construct(self, c):
+        """
+
+        Args:
+            c (Tensor): Local conditioning feature
+
+        Returns:
+            Tensor: Upsampling feature
+
+        """
+        # B x 1 x C x T
+        c = self.expand_op(c, 1)
+        for f in self.up_layers:
+            c = f(c)
+        # B x C x T
+        c = self.squeeze_op(c)
+
+        # if self.indent > 0:
+        #     c = c[:, :, self.indent:-self.indent]
+        return c
+
+
+class ConvInUpsampleNetwork(nn.Cell):
+    """Upsample Network
+
+    Args:
+        upsample_scales (list): Upsample_scales list.
+        upsample_activation (str): Upsample_activation.
+        mode (str): Resize mode, default is NearestNeighbor.
+        cin_channels (int): Local conditioning channels.
+        freq_axis_kernel_size (int): Freq-axis kernel_size for the convolution layers after resize.
+
+    """
+
+    def __init__(self, upsample_scales, mode="nearest",
+                 freq_axis_kernel_size=1, cin_pad=0,
+                 cin_channels=80):
+        super(ConvInUpsampleNetwork, self).__init__()
+        ks = 2 * cin_pad + 1
+        self.conv_in = nn.Conv1d(cin_channels, cin_channels, kernel_size=ks, has_bias=False, pad_mode='pad', padding=0)
+        self.upsample = UpsampleNetwork(upsample_scales, mode, freq_axis_kernel_size, cin_pad=0,
+                                        cin_channels=cin_channels)
+
+    def construct(self, c):
+        c = self.conv_in(c)
+        c_up = self.upsample(c)
+        return c_up
--- a/model_zoo/research/audio/wavenet/wavenet_vocoder/wavenet.py
+++ b/model_zoo/research/audio/wavenet/wavenet_vocoder/wavenet.py