pull/11773/head
wanyiming 4 years ago
parent cfe8d6f32a
commit acd40e37e2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,95 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""export mindir."""
import json
from os.path import join
import argparse
from warnings import warn
from hparams import hparams, hparams_debug_string
from mindspore import context, Tensor
from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
from wavenet_vocoder import WaveNet
from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
import numpy as np
from src.loss import PredictNet
# Command-line interface of the MINDIR export script.
parser = argparse.ArgumentParser(description='TTS training')
parser.add_argument('--preset', type=str, default='', help='Path of preset parameters (json).')
# The script dumps the effective hyper-parameters to <checkpoint_dir>/hparams.json, so the
# directory has to be a declared option (it was read below without ever being defined).
parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
                    help='Directory where hparams.json is written [default: ./checkpoints_test].')
parser.add_argument('--speaker_id', type=str, default='',
                    help=' Use specific speaker of data in case for multi-speaker datasets.')
parser.add_argument('--pretrain_ckpt', type=str, default='', help='Pretrained checkpoint path')
args = parser.parse_args()
if __name__ == '__main__':
    # Local import: only `join` is imported from os.path at module level.
    import os

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
    # Parsed for validation only; the export itself does not use the speaker id.
    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None

    # --preset defaults to '', so test for a non-empty path instead of `is not None`
    # (open('') would raise FileNotFoundError).
    if args.preset:
        with open(args.preset) as f:
            hparams.parse_json(f.read())

    assert hparams.name == "wavenet_vocoder"
    print(hparams_debug_string())
    fs = hparams.sample_rate

    # Snapshot the hyper-parameters that were used for this export.
    os.makedirs(args.checkpoint_dir, exist_ok=True)
    output_json_path = join(args.checkpoint_dir, "hparams.json")
    with open(output_json_path, "w") as f:
        json.dump(hparams.values(), f, indent=2)

    # Sanity checks on the hyper-parameter combination.
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )

    # Wrap the model for inference and switch it to eval mode.
    Net = PredictNet(model)
    Net.set_train(False)
    receptive_field = model.receptive_field
    print("Receptive field (samples / ms): {} / {}".format(receptive_field, receptive_field / fs * 1000))

    param_dict = load_checkpoint(args.pretrain_ckpt)
    load_param_into_net(model, param_dict)
    print('Successfully loading the pre-trained model')

    # Dummy inputs that fix the exported graph's input shapes.
    # NOTE(review): 256 / 80 are hard-coded; presumably they must match the preset's
    # quantize_channels / cin_channels - confirm against the preset in use.
    x = np.array(np.random.random((2, 256, 10240)), dtype=np.float32)
    c = np.array(np.random.random((2, 80, 44)), dtype=np.float32)
    g = np.array([0, 0], dtype=np.int64)
    export(Net, Tensor(x), Tensor(c), Tensor(g), file_name="WaveNet", file_format='MINDIR')

@ -0,0 +1,14 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

@ -0,0 +1,103 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Callbacks for monitoring time and loss during WaveNet training.
"""
import time
from mindspore.train.callback import Callback
from mindspore import Tensor
import numpy as np
class TimeMonitor(Callback):
    """
    Callback that prints wall-clock timings per step and per epoch.

    Args:
        data_size (int): number of steps in one epoch; used to derive the per-step time.
    """

    def __init__(self, data_size):
        super().__init__()
        self.data_size = data_size

    def epoch_begin(self, run_context):
        # Remember when the epoch started.
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        # Elapsed time in milliseconds, averaged over the configured step count.
        elapsed_ms = 1000 * (time.time() - self.epoch_time)
        print("epoch time: {0}, per step time: {1}".format(elapsed_ms, elapsed_ms / self.data_size),
              flush=True)

    def step_begin(self, run_context):
        # Remember when the step started.
        self.step_time = time.time()

    def step_end(self, run_context):
        step_mseconds = 1000 * (time.time() - self.step_time)
        print(f"step time {step_mseconds}", flush=True)
class Monitor(Callback):
    """
    Monitor loss, time and learning rate.

    Args:
        lr_init (Tensor or numpy array, optional): per-step learning rates, indexed by the
            0-based global step. When omitted, the learning rate is reported as ``nan``.

    Returns:
        None

    Examples:
        >>> Monitor(lr_init=Tensor([0.05] * 100))
    """

    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        # `lr_init` defaults to None, so len() must not be called unconditionally.
        self.lr_init_len = 0 if lr_init is None else len(lr_init)
        self.losses = []

    def epoch_begin(self, run_context):
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        # NOTE: despite the variable names these are seconds (no *1000 scaling is applied).
        epoch_mseconds = (time.time() - self.epoch_time)
        per_step_mseconds = epoch_mseconds / cb_params.batch_num
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.6f}".format(epoch_mseconds,
                                                                                      per_step_mseconds,
                                                                                      np.mean(self.losses)))

    def step_begin(self, run_context):
        self.step_time = time.time()

    def _current_lr(self, cur_step_num):
        """Return the learning rate for the 1-based step `cur_step_num` as a Python float."""
        if self.lr_init_len == 0:
            return float('nan')
        # Clamp to the last entry so logging cannot raise IndexError past the schedule's end.
        value = self.lr_init[min(cur_step_num - 1, self.lr_init_len - 1)]
        # Tensor entries expose asnumpy(); numpy entries are used as they are.
        if hasattr(value, 'asnumpy'):
            value = value.asnumpy()
        return float(value)

    def step_end(self, run_context):
        """Record the step loss and print loss / time / learning rate for the step."""
        cb_params = run_context.original_args()
        step_mseconds = (time.time() - self.step_time)
        step_loss = cb_params.net_outputs
        # The network may return a tuple/list whose first element is the loss.
        if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())
        self.losses.append(step_loss)
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num
        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.6f}/{:5.6f}], time:[{:5.3f}], lr:[{:.9f}]".format(
            cb_params.cur_epoch_num -
            1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
            np.mean(self.losses), step_mseconds, self._current_lr(cb_params.cur_step_num)))

File diff suppressed because it is too large Load Diff

@ -0,0 +1,238 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""loss function definition"""
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mindspore import nn, Tensor
from mindspore.ops import operations as P
from nnmnkwii import preprocessing as P1
from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw
from wavenet_vocoder.mixture import discretized_mix_logistic_loss
from wavenet_vocoder.mixture import mix_gaussian_loss
from train_pytorch import to_categorical
from tqdm import tqdm
import audio
import librosa
import librosa.display
# Select the non-interactive Agg backend so figures can be written to files without a display.
# NOTE(review): this runs after `matplotlib.pyplot` has already been imported in this module -
# confirm the backend switch still takes effect with the matplotlib version in use.
matplotlib.use('Agg')
def sequence_mask(sequence_length, max_len=None):
    """
    Build a float mask that is 1 for valid positions and 0 for padding.

    Args:
        sequence_length: object exposing ``asnumpy()`` that yields the per-item lengths
            (presumably a 1-D MindSpore Tensor - confirm against callers).
        max_len (int, optional): mask width; defaults to the largest length.

    Returns:
        Tensor: float32 mask with a trailing singleton axis.
    """
    lengths = sequence_length.asnumpy()
    if max_len is None:
        max_len = np.max(lengths)
    n_items = lengths.shape[0]

    # Positions 0 .. max_len-1, repeated once per batch item.
    positions = np.linspace(0, max_len - 1, max_len, dtype=np.int32)
    position_grid = np.tile(positions[np.newaxis, :], (n_items, 1))
    # Each item's length, repeated across all positions.
    length_grid = np.tile(lengths[:, np.newaxis], (1, max_len))

    valid = (position_grid < length_grid).astype(np.float32)
    return Tensor(valid[..., np.newaxis])
class MaskedCrossEntropyLoss(nn.Cell):
    """Sparse softmax cross-entropy with mean reduction (no mask is applied in this cell)."""

    def __init__(self):
        super().__init__()
        self.criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    def construct(self, inputs, target):
        # Delegate straight to the wrapped criterion.
        return self.criterion(inputs, target)
class DiscretizedMixturelogisticLoss(nn.Cell):
    """
    Discretized mixture-of-logistics loss, averaged over the unmasked positions.

    Args:
        hparams: hyper-parameters; `quantize_channels` and `log_scale_min` are read.
    """

    def __init__(self, hparams):
        super(DiscretizedMixturelogisticLoss, self).__init__()
        self.quantize_channels = hparams.quantize_channels
        self.log_scale_min = hparams.log_scale_min
        self.discretized_mix_logistic_loss = discretized_mix_logistic_loss(num_classes=hparams.quantize_channels,
                                                                           log_scale_min=hparams.log_scale_min,
                                                                           reduce=False)
        self.reduce_sum_op = P.ReduceSum()
        self.reduce_mean_op = P.ReduceMean()

    def construct(self, inputs, target, mask=None):
        """
        Args:
            inputs (Tensor): Predicted distribution
            target (Tensor): Target
            mask (Tensor, optional): Mask weighting each element; when omitted the plain mean is returned

        Returns:
            Tensor: Loss tensor
        """
        losses = self.discretized_mix_logistic_loss(inputs, target)
        # The signature advertises mask=None, so that case must not reach `losses * mask`.
        if mask is None:
            return self.reduce_mean_op(losses)
        # Masked mean: sum of the kept losses divided by the mask's total weight.
        return self.reduce_sum_op(losses * mask) / self.reduce_sum_op(mask)
class MixtureGaussianLoss(nn.Cell):
    """
    Mixture-of-Gaussians loss, averaged over the unmasked positions.

    Args:
        hparams: hyper-parameters; `quantize_channels` and `log_scale_min` are read.
    """

    def __init__(self, hparams):
        super(MixtureGaussianLoss, self).__init__()
        self.quantize_channels = hparams.quantize_channels
        self.log_scale_min = hparams.log_scale_min
        self.mix_gaussian_loss = mix_gaussian_loss(log_scale_min=hparams.log_scale_min, reduce=False)
        self.reduce_sum_op = P.ReduceSum()
        self.reduce_mean_op = P.ReduceMean()

    def construct(self, inputs, target, mask=None):
        """
        Args:
            inputs (Tensor): Predicted distribution
            target (Tensor): Target
            mask (Tensor, optional): Mask weighting each element; when omitted the plain mean is returned

        Returns:
            Tensor: Loss tensor
        """
        losses = self.mix_gaussian_loss(inputs, target)
        # The signature advertises mask=None, so that case must not reach `losses * mask`.
        if mask is None:
            return self.reduce_mean_op(losses)
        # Masked mean: sum of the kept losses divided by the mask's total weight.
        return self.reduce_sum_op(losses * mask) / self.reduce_sum_op(mask)
def save_waveplot(path, y_hat, y_target, sample_rate):
    """
    Save a two-row PNG: the target waveform on top, the predicted waveform below.

    Args:
        path (str): output file path.
        y_hat: predicted waveform.
        y_target: reference waveform.
        sample_rate (int): sampling rate passed to the plotting routine.
    """
    plt.figure(figsize=(16, 6))
    # Row 1 shows the target, row 2 the prediction.
    for row, signal in enumerate((y_target, y_hat), start=1):
        plt.subplot(2, 1, row)
        librosa.display.waveplot(signal, sr=sample_rate)
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
def eval_model(hparams, global_step, model, x, y, c, g, input_lengths, eval_dir):
    """
    Synthesize one randomly picked batch item and dump audio plus a plot (debugging aid).

    Args:
        hparams: hyper-parameters (input type, quantize channels, sample rate, ...).
        global_step (int): step number embedded in the output file names.
        model: network exposing `set_train` and `incremental_forward`.
        x: unused here.
        y: batch of targets; must expose `asnumpy()`.
        c: local conditioning features for the batch, or None.
        g: global conditioning features for the batch, or None.
        input_lengths: per-item lengths; must expose `asnumpy()`.
        eval_dir (str): output directory, created when missing.
    """
    model.set_train(False)

    # Choose one item of the batch and trim its target to the true length.
    pick = np.random.randint(0, len(y))
    length = input_lengths.asnumpy()[pick]
    y_target = np.reshape(y.asnumpy()[pick], (-1))[:length]

    if c is not None:
        add_batch_axis = P.ExpandDims()
        if hparams.upsample_conditional_features:
            # Conditioning is at frame rate: keep the frames covering `length` samples plus padding.
            n_frames = int(length // audio.get_hop_size() + hparams.cin_pad * 2)
            c = add_batch_axis(c[pick, :, :n_frames], 0)
        else:
            c = add_batch_axis(c[pick, :, :length], 0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        g = g[pick]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence used as the first input sample.
    quantized = is_mulaw_quantize(hparams.input_type)
    if quantized:
        initial_value = P1.mulaw_quantize(0, hparams.quantize_channels - 1)
        # One-hot vector reshaped to (1, 1, quantize_channels).
        one_hot = to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Tensor(np.reshape(one_hot, (1, 1, hparams.quantize_channels)))
    else:
        if is_mulaw(hparams.input_type):
            initial_value = P1.mulaw(0.0, hparams.quantize_channels)
        else:
            initial_value = 0.0
        initial_input = Tensor(np.ones((1, 1, 1)) * initial_value)

    # Run the model in fast eval mode
    y_hat = model.incremental_forward(initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm,
                                      log_scale_min=hparams.log_scale_min)

    # Map prediction and target back to the waveform domain.
    if quantized:
        y_hat = np.reshape(np.argmax(y_hat, 1), (-1))
        y_hat = P1.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
        y_target = P1.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = P1.inv_mulaw(np.reshape(y_hat, (-1)), hparams.quantize_channels)
        y_target = P1.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = np.reshape(y_hat, (-1))

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    predicted_path = os.path.join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(predicted_path, y_hat, sr=hparams.sample_rate)
    target_path = os.path.join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(target_path, y_target, sr=hparams.sample_rate)

    # Save figure
    figure_path = os.path.join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(figure_path, y_hat, y_target, hparams.sample_rate)
class PredictNet(nn.Cell):
    """
    Inference wrapper around a network.

    Args:
        network (Cell): network called as ``network(x, c, g, False)``.

    Returns:
        The wrapped network's output.
    """

    def __init__(self, network):
        super().__init__(auto_prefix=False)
        self.network = network

    def construct(self, x, c, g):
        # The fourth positional argument is always False here.
        # NOTE(review): presumably the network's softmax flag - confirm against WaveNet.construct.
        return self.network(x, c, g, False)
class NetWithLossClass(nn.Cell):
    """
    Couples a network with the loss that matches the configured input/output type.

    Args:
        network (Cell): Pre-defined WaveNet.
        hparams: Parameters; `input_type` and `output_distribution` select the criterion.

    Returns:
        Tensor, loss tensor.

    Raises:
        RuntimeError: if the input is not mu-law quantized and `output_distribution`
            is neither "Logistic" nor "Normal".
    """

    def __init__(self, network, hparams):
        super().__init__(auto_prefix=False)
        self.network = network
        self.hparams = hparams
        self.ReduceMean_false = P.ReduceMean(keep_dims=False)
        self.expand_op = P.ExpandDims()
        self.transpose_op = P.Transpose()
        self.reshape_op = P.Reshape()
        self.is_mulaw_quant = is_mulaw_quantize(hparams.input_type)

        # Quantized input -> cross entropy; otherwise pick the mixture loss by distribution.
        if self.is_mulaw_quant:
            self.criterion = MaskedCrossEntropyLoss()
        elif hparams.output_distribution == "Logistic":
            self.criterion = DiscretizedMixturelogisticLoss(hparams)
        elif hparams.output_distribution == "Normal":
            self.criterion = MixtureGaussianLoss(hparams)
        else:
            self.criterion = None
            raise RuntimeError(
                "Not supported output distribution type: {}".format(hparams.output_distribution))

    def construct(self, x, y, c, g, input_lengths, mask):
        """Run the network and return the loss of its output against `y` shifted by one step."""
        y_hat = self.network(x, c, g, False)
        if not self.is_mulaw_quant:
            # Drop the last prediction and the first target so they line up.
            return self.criterion(y_hat[:, :, :-1], y[:, 1:, :], mask[:, 1:, :])

        # Channels-last, then flatten to (N, classes) logits with matching flat labels.
        logits = self.transpose_op(y_hat[:, :, :-1], (0, 2, 1))
        logits = self.reshape_op(logits, (-1, logits.shape[-1]))
        labels = self.reshape_op(y[:, 1:, 0], (-1,))
        return self.criterion(logits, labels)

@ -0,0 +1,41 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""learning rate generator"""
import numpy as np
def get_lr(init_lr, total_epoch, step_per_epoch,
           anneal_rate=0.5,
           anneal_interval=200000):
    """
    Generate a step-wise annealed learning-rate schedule.

    The rate starts at `init_lr` and is multiplied by `anneal_rate` once every
    `anneal_interval` steps.

    Args:
        init_lr (float): Initial learning rate
        total_epoch (int): Total epoch
        step_per_epoch (int): Step per epoch
        anneal_rate (float): anneal rate
        anneal_interval (int): anneal interval

    Returns:
        ndarray: float32 learning rate for each of the total_epoch * step_per_epoch steps
    """
    n_steps = total_epoch * step_per_epoch
    # step // anneal_interval counts how many anneals have been applied so far.
    schedule = [init_lr * anneal_rate ** (step // anneal_interval) for step in range(n_steps)]
    return np.array(schedule).astype(np.float32)

@ -0,0 +1,135 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Train WaveNet."""
import os
from os.path import join
import json
import argparse
from warnings import warn
from hparams import hparams, hparams_debug_string
from mindspore import context, Tensor
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.nn.optim import Adam
from mindspore.nn import TrainOneStepCell
from mindspore.train import Model
from src.lr_generator import get_lr
from src.dataset import get_data_loaders
from src.loss import NetWithLossClass
from src.callback import Monitor
from wavenet_vocoder import WaveNet
from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='TTS training')
parser.add_argument('--data_path', type=str, required=True, default='',
                    help='Directory contains preprocessed features.')
parser.add_argument('--preset', type=str, required=True, default='', help='Path of preset parameters (json).')
parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
                    help='Directory where to save model checkpoints [default: checkpoints].')
parser.add_argument('--checkpoint', type=str, default='', help='Restore model from checkpoint path if given.')
parser.add_argument('--speaker_id', type=str, default='',
                    help=' Use specific speaker of data in case for multi-speaker datasets.')
parser.add_argument('--is_distributed', action="store_true", default=False, help='Distributed training')
args = parser.parse_args()
if __name__ == '__main__':
    # Device / parallel context.
    if args.is_distributed:
        init('nccl')
        rank_id = get_rank()
        group_size = get_group_size()
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
        rank_id = 0
        group_size = 1

    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None

    # Load the preset and keep a snapshot of the effective hyper-parameters.
    if args.preset is not None:
        with open(args.preset) as f:
            hparams.parse_json(f.read())

    assert hparams.name == "wavenet_vocoder"
    print(hparams_debug_string())
    fs = hparams.sample_rate
    os.makedirs(args.checkpoint_dir, exist_ok=True)
    output_json_path = join(args.checkpoint_dir, "hparams.json")
    with open(output_json_path, "w") as f:
        json.dump(hparams.values(), f, indent=2)

    # NOTE(review): the raw --speaker_id string is passed on; the parsed `speaker_id`
    # above is not used - confirm which form get_data_loaders expects.
    data_loaders = get_data_loaders(args.data_path, args.speaker_id, hparams=hparams, rank_id=rank_id,
                                    group_size=group_size)
    step_size_per_epoch = data_loaders.get_dataset_size()

    # Sanity checks on the hyper-parameter combination.
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )
    loss_net = NetWithLossClass(model, hparams)

    # One learning-rate value per training step.
    lr = get_lr(hparams.optimizer_params["lr"], hparams.nepochs, step_size_per_epoch)
    lr = Tensor(lr)

    # Optionally restore weights. The path comes from --checkpoint; the previously
    # referenced `args.pre_trained_model_path` is not a declared option.
    if args.checkpoint != '':
        param_dict = load_checkpoint(args.checkpoint)
        load_param_into_net(model, param_dict)
        print('Successfully loading the pre-trained model')

    weights = model.trainable_params()
    optimizer = Adam(weights, learning_rate=lr, loss_scale=1024.)
    train_net = TrainOneStepCell(loss_net, optimizer)
    model = Model(train_net)

    lr_cb = Monitor(lr)
    callback_list = [lr_cb]
    # Each rank writes checkpoints into its own sub-directory when running distributed.
    if args.is_distributed:
        ckpt_path = os.path.join(args.checkpoint_dir, 'ckpt_' + str(get_rank()) + '/')
    else:
        ckpt_path = args.checkpoint_dir
    # Save once per epoch and keep at most 10 checkpoints.
    config_ck = CheckpointConfig(save_checkpoint_steps=step_size_per_epoch, keep_checkpoint_max=10)
    ckpt_cb = ModelCheckpoint(prefix='wavenet', directory=ckpt_path, config=config_ck)
    callback_list.append(ckpt_cb)
    model.train(hparams.nepochs, data_loaders, callbacks=callback_list)

@ -0,0 +1,17 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""init"""
from __future__ import with_statement, print_function, absolute_import
from .wavenet import WaveNet

@ -0,0 +1,176 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Extended Conv1D."""
import math
from mindspore import nn, Tensor
from mindspore.ops import operations as P
import mindspore.common.dtype as mstype
import numpy as np
class Conv1d(nn.Conv1d):
    """
    Extended nn.Conv1d to adapt to incremental dilated convolutions.

    During training, the regular Conv1D is used; during evaluation, incremental_forward is called.
    To improve the inference speed, tensors can be converted to numpy so that the following calculation
    is based on numpy. These operations will be replaced with MindSpore ops in the future. Currently,
    some operations are not supported by MindSpore and a mixed use of numpy and MindSpore takes a long time.
    """

    def __init__(self, *args, **kwargs):
        super(Conv1d, self).__init__(*args, **kwargs)
        self.clear_buffer()
        self._linearized_weight = None
        self.transpose_op = P.Transpose()
        self.reshape_op = P.Reshape()
        self.squeeze_op = P.Squeeze(-2)
        self.zeros = P.Zeros()
        self.concat_op = P.Concat(axis=1)
        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()
        # Weight / bias caches in the representation of the incremental path used last
        # (numpy arrays for the numpy path, tensors for the pynative path).
        self.get_weight = None
        self.get_bias = None

    def incremental_forward(self, inputs, is_numpy=True):
        """Dispatch one incremental step to the numpy or the pynative implementation."""
        if is_numpy:
            return self.incremental_forward_numpy(inputs)
        return self.incremental_forward_pynative(inputs)

    def incremental_forward_pynative(self, inputs):
        """
        Incremental forward using MindSpore ops.

        Args:
            inputs: B x T x C

        Returns:
            Tensor of shape (B, 1, out_channels)

        Raises:
            RuntimeError: if the cell is in training mode.
        """
        # input: (B, T, C)
        if self.training:
            raise RuntimeError('incremental_forward only supports eval mode')
        # Rebuild the cache when it is empty or still holds the numpy path's arrays.
        if self.get_weight is None or isinstance(self.get_weight, np.ndarray):
            self.get_weight = self._get_linearized_weight()
        if self.bias is not None and (self.get_bias is None or isinstance(self.get_bias, np.ndarray)):
            self.get_bias = self.bias

        # Note mindspore uses Conv2D to construct Conv1D
        kw = self.kernel_size[1]
        dilation = self.dilation[1]
        bsz = inputs.shape[0]  # input: bsz x len x dim
        if kw > 1:
            if self.input_buffer is None:
                init_buffer = self.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), mstype.float32)
                self.input_buffer = self.concat_op((init_buffer[:, 1:, :], inputs[:, 0:1, :]))
            else:
                # shift buffer
                self.input_buffer = self.concat_op((self.input_buffer[:, 1:, :], inputs[:, 0:1, :]))
            inputs = self.input_buffer
            if dilation > 1:
                # Keep only the taps the dilated kernel actually looks at.
                inputs = inputs[:, 0::dilation, :]
        output = self.matmul(self.reshape_op(inputs, (bsz, -1)), self.get_weight)
        if self.bias is not None:
            output = self.bias_add(output, self.bias)
        return self.reshape_op(output, (bsz, 1, -1))

    def incremental_forward_numpy(self, inputs):
        """
        Incremental forward using numpy.

        Args:
            inputs: B x T x C (presumably an ndarray - confirm against callers)

        Returns:
            ndarray of shape (B, 1, out_channels)

        Raises:
            RuntimeError: if the cell is in training mode.
        """
        # input: (B, T, C)
        if self.training:
            raise RuntimeError('incremental_forward only supports eval mode')
        # Rebuild the cache when it is empty or still holds the pynative path's tensors.
        if not isinstance(self.get_weight, np.ndarray):
            self.get_weight = self._get_linearized_weight().asnumpy()
        if self.bias is not None and not isinstance(self.get_bias, np.ndarray):
            self.get_bias = self.bias.asnumpy()

        # Note mindspore uses Conv2D to construct Conv1D
        kw = self.kernel_size[1]
        dilation = self.dilation[1]
        bsz = inputs.shape[0]  # input: bsz x len x dim
        if kw > 1:
            if self.input_buffer is None:
                self.input_buffer = np.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]),
                                             dtype=np.float32)
            else:
                # shift buffer
                self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :]
            # append next
            self.input_buffer[:, -1, :] = inputs[:, -1, :]
            inputs = self.input_buffer
            if dilation > 1:
                # Keep only the taps the dilated kernel actually looks at.
                inputs = inputs[:, 0::dilation, :]
        output = inputs.reshape(bsz, -1).dot(self.get_weight.T)
        if self.bias is not None:
            output = output + np.expand_dims(self.get_bias, 0)
        return np.reshape(output, (bsz, 1, -1))

    def clear_buffer(self):
        """Forget the buffered input history (start of a new sequence)."""
        self.input_buffer = None

    def _get_linearized_weight(self):
        """
        Return the kernel flattened to (out_channels, -1), computing and caching it on first use.
        """
        if self._linearized_weight is None:
            # Squeeze only on a cache miss; a cache hit needs no tensor work at all.
            weight = self.squeeze_op(self.weight)
            # Note mindspore uses Conv2D to construct Conv1D
            kw = self.kernel_size[1]
            if weight.shape == (self.out_channels, self.in_channels, kw):
                weight = self.transpose_op(weight, (0, 2, 1))
            else:
                weight = self.transpose_op(weight, (2, 0, 1))
            self._linearized_weight = self.reshape_op(weight, (self.out_channels, -1))
        return self._linearized_weight

    def _clear_linearized_weight(self, *args):
        """Drop every cached copy derived from the parameters so the next use recomputes them."""
        self._linearized_weight = None
        # These are derived from the linearized weight / bias and would otherwise stay stale.
        self.get_weight = None
        self.get_bias = None

    def _initialize_weights(self):
        """
        weight initialization
        """
        self.init_parameters_data()
        std_mul = 4.0
        for _, m in self.cells_and_names():
            if isinstance(m, nn.Conv1d):
                std = math.sqrt((std_mul * 0.1) / (m.kernel_size[1] * self.in_channels))
                m.weight.set_data(Tensor(np.random.normal(0, std, m.weight.data.shape).astype("float32")))
                if m.bias is not None:
                    m.bias.set_data(
                        Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
            elif isinstance(m, nn.BatchNorm2d):
                m.gamma.set_data(
                    Tensor(np.ones(m.gamma.data.shape, dtype="float32")))
                m.beta.set_data(
                    Tensor(np.zeros(m.beta.data.shape, dtype="float32")))

File diff suppressed because it is too large Load Diff

@ -0,0 +1,213 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
modules for wavenet
"""
from __future__ import with_statement, print_function, absolute_import
import math
import numpy as np
from wavenet_vocoder import conv
from mindspore import nn
from mindspore.ops import operations as P
def Conv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Create a `conv.Conv1d` layer; `dropout` is accepted but not used here."""
    return conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs)
def Conv1d1x1(in_channels, out_channels, has_bias=True):
    """Create a kernel-size-1 convolution with no padding and no dilation."""
    pointwise_options = dict(kernel_size=1, pad_mode='pad', padding=0, dilation=1, has_bias=has_bias)
    return Conv1d(in_channels, out_channels, **pointwise_options)
def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01):
    """Create an `nn.Embedding` layer; `std` is accepted but not used here."""
    return nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
def _conv1x1_forward(conv_, x, is_incremental, is_numpy=True):
"""
Conv1x1 forward
"""
if is_incremental:
x = conv_.incremental_forward(x, is_numpy=is_numpy)
else:
x = conv_(x)
return x
class ResidualConv1dGLU(nn.Cell):
"""Residual dilated conv1d with gated activation units
Args:
residual_channels (int): Residual input / output channels
gate_channels (int): Gated activation channels.
kernel_size (int): Kernel size
skip_out_channels (int): Skip connection channels. If None, it will set to the same as residual_channels.
cin_channels (int): Local conditioning channels. If given negative value, local conditioning is disabled.
gin_channels (int): Global conditioning channels. If given negative value, global conditioning is disabled.
dropout (float): Dropout rate.
padding (int): Padding for convolution layers. If None, padding value will be computed according to dilation
and kernel_size.
dilation (int): Dilation factor.
"""
def __init__(self, residual_channels=None, gate_channels=None, kernel_size=None, skip_out_channels=None, bias=True,
             dropout=1 - 0.95, dilation=1, cin_channels=-1, gin_channels=-1, padding=None, causal=True):
    """Create the dilated convolution, the optional conditioning projections and the output projections."""
    super(ResidualConv1dGLU, self).__init__()
    self.dropout = dropout
    self.dropout_op = nn.Dropout(keep_prob=1. - self.dropout)

    # Element-wise / split primitives used by the forward passes.
    self.eval_split_op = P.Split(axis=-1, output_num=2)
    self.train_split_op = P.Split(axis=1, output_num=2)
    self.tanh = P.Tanh()
    self.sigmoid = P.Sigmoid()
    self.mul = P.Mul()
    self.add = P.TensorAdd()

    if skip_out_channels is None:
        skip_out_channels = residual_channels
    if padding is None:
        # Causal: pad by the full dilated kernel extent; otherwise by half of it.
        kernel_span = kernel_size - 1
        padding = kernel_span * dilation if causal else kernel_span // 2 * dilation
    self.causal = causal

    self.conv = Conv1d(residual_channels, gate_channels, kernel_size, pad_mode='pad',
                       padding=padding, dilation=dilation, has_bias=bias)

    # local conditioning (disabled unless cin_channels is positive)
    self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, has_bias=False) if cin_channels > 0 else None

    # global conditioning (disabled unless gin_channels is positive)
    self.conv1x1g = (Conv1d(gin_channels, gate_channels, has_bias=False, kernel_size=1, dilation=1)
                     if gin_channels > 0 else None)

    # The gated activation halves the channel count before the output projections.
    gate_out_channels = gate_channels // 2
    self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, has_bias=bias)
    self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, has_bias=bias)
    self.factor = math.sqrt(0.5)
def construct(self, x, c=None, g=None):
    """
    Forward pass of the gated residual block.

    Args:
        x(Tensor): One-hot audio signal, the shape is B x C x T
        c(Tensor): local conditional feature, the shape is B x cin_channels x T
        g(Tensor): global conditional feature; when given it is projected by
            conv1x1g and added to both halves of the gate input
    Returns:
        tuple: (x, s), the scaled residual output and the skip-connection output
    """
    residual = x
    x = self.dropout_op(x)
    x = self.conv(x)
    # remove future time steps
    x = x[:, :, :residual.shape[-1]] if self.causal else x
    # Split along axis 1 into the tanh half (a) and the sigmoid half (b).
    split_op = self.train_split_op
    a, b = split_op(x)
    # local conditioning
    if c is not None:
        c = _conv1x1_forward(self.conv1x1c, c, is_incremental=False)
        ca, cb = split_op(c)
        a, b = a + ca, b + cb
    # global conditioning
    if g is not None:
        g = _conv1x1_forward(self.conv1x1g, g, is_incremental=False)
        # Use the same split operator as for x and c; the block defines no
        # ``self.split`` attribute, so calling it would fail.
        ga, gb = split_op(g)
        a, b = a + ga, b + gb
    # Gated activation unit.
    x = self.mul(self.tanh(a), self.sigmoid(b))
    # For skip connection
    s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=False)
    # For residual connection
    x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=False)
    x = self.add(x, residual) * self.factor
    return x, s
def sigmoid_numpy(self, x):
    """Return the logistic sigmoid of ``x`` computed with numpy."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator
def incremental_forward(self, x, c=None, g=None, is_numpy=True):
    """
    Incremental forward. Used for inference stage

    Args:
        x (Tensor): One-hot audio signal, the shape is B x C x T
        c (Tensor): local conditional feature, the shape is B x cin_channels x T
        g (Tensor): global conditional feature; when given it is projected by
            conv1x1g and added to both halves of the gate input
        is_numpy (bool): if True the split / activation steps use numpy
            functions, otherwise the MindSpore operators of this block
    Returns:
        tuple: (x, s), the scaled residual output and the skip-connection output
    """
    residual = x
    x = self.conv.incremental_forward(x, is_numpy=is_numpy)
    # Split along the last axis into the tanh half (a) and the sigmoid half (b).
    if is_numpy:
        a, b = np.split(x, indices_or_sections=2, axis=-1)
    else:
        a, b = self.eval_split_op(x)
    # local conditioning
    if c is not None:
        c = _conv1x1_forward(self.conv1x1c, c, is_incremental=True, is_numpy=is_numpy)
        if is_numpy:
            ca, cb = np.split(c, indices_or_sections=2, axis=-1)
        else:
            ca, cb = self.eval_split_op(c)
        a, b = a + ca, b + cb
    # global conditioning
    if g is not None:
        g = _conv1x1_forward(self.conv1x1g, g, is_incremental=True, is_numpy=is_numpy)
        if is_numpy:
            ga, gb = np.split(g, indices_or_sections=2, axis=-1)
        else:
            # Split the projected global feature g (not the local feature c),
            # matching the numpy branch above.
            ga, gb = self.eval_split_op(g)
        a, b = a + ga, b + gb
    # Gated activation unit.
    if is_numpy:
        x = np.tanh(a) * self.sigmoid_numpy(b)
    else:
        x = self.mul(self.tanh(a), self.sigmoid(b))
    # For skip connection
    s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=True, is_numpy=is_numpy)
    # For residual connection
    x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=True, is_numpy=is_numpy)
    x = (x + residual) * self.factor
    return x, s
def clear_buffer(self):
    """Call ``clear_buffer()`` on every convolution layer of the block that is not None."""
    layers = (self.conv, self.conv1x1_out, self.conv1x1_skip,
              self.conv1x1c, self.conv1x1g)
    for layer in layers:
        # The conditioning layers are None when conditioning is disabled.
        if layer is None:
            continue
        layer.clear_buffer()

@ -0,0 +1,118 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Upsampling
"""
from __future__ import with_statement, print_function, absolute_import
import numpy as np
from mindspore import nn
from mindspore.ops import operations as P
class Resize(nn.Cell):
    """
    Scale the last two axes of a 4-D tensor with nearest-neighbour resizing.

    Args:
        x_scale (int): Factor applied to the last axis.
        y_scale (int): Factor applied to the second-to-last axis.
        mode (str): Stored on the instance; the resize operator used in
            ``construct`` is always ``ResizeNearestNeighbor``.
    """

    def __init__(self, x_scale, y_scale, mode="nearest"):
        super(Resize, self).__init__()
        self.x_scale = x_scale
        self.y_scale = y_scale
        self.mode = mode

    def construct(self, x):
        """Return ``x`` resized to (y_scale * H, x_scale * W)."""
        _, _, height, width = x.shape
        target_size = (self.y_scale * height, self.x_scale * width)
        # The operator is built here because the target size depends on the input shape.
        resize_op = P.ResizeNearestNeighbor(target_size)
        return resize_op(x)
def _get_activation(upsample_activation):
    """Look up the attribute of ``mindspore.nn`` named by ``upsample_activation`` and return it."""
    return getattr(nn, upsample_activation)
class UpsampleNetwork(nn.Cell):
    """Upsample a conditioning feature with alternating Resize and Conv2d layers.

    Args:
        upsample_scales (list): One scale per Resize/Conv2d pair.
        mode (str): Forwarded to each Resize layer.
        freq_axis_kernel_size (int): Kernel size of each Conv2d along its first kernel axis.
        cin_pad (int): Only used to compute ``self.indent``.
        cin_channels (int): Accepted but not used by this class.
    """

    def __init__(self, upsample_scales, mode="nearest",
                 freq_axis_kernel_size=1, cin_pad=0, cin_channels=80):
        super(UpsampleNetwork, self).__init__()
        self.expand_op = P.ExpandDims()
        self.squeeze_op = P.Squeeze(1)
        # NOTE: self.indent is computed but the trimming that would use it is disabled in construct.
        self.indent = cin_pad * np.prod(upsample_scales)

        # Same padding on both sides of the first kernel axis for every layer.
        freq_pad = (freq_axis_kernel_size - 1) // 2
        layers = []
        for scale in upsample_scales:
            resize = Resize(scale, 1, mode)
            smooth = nn.Conv2d(1, 1,
                               kernel_size=(freq_axis_kernel_size, scale * 2 + 1),
                               has_bias=False, pad_mode='pad',
                               padding=(freq_pad, freq_pad, scale, scale))
            layers.extend([resize, smooth])
            # NOTE: no activation layer is appended after the convolution.
        self.up_layers = nn.CellList(layers)

    def construct(self, c):
        """
        Args:
            c (Tensor): Local conditioning feature
        Returns:
            Tensor: Upsampling feature
        """
        # B x 1 x C x T
        out = self.expand_op(c, 1)
        for layer in self.up_layers:
            out = layer(out)
        # B x C x T
        out = self.squeeze_op(out)
        # NOTE: the result is returned without trimming self.indent samples from either end.
        return out
class ConvInUpsampleNetwork(nn.Cell):
    """Conv1d over the conditioning feature followed by an UpsampleNetwork.

    Args:
        upsample_scales (list): Upsample_scales list.
        mode (str): Resize mode, forwarded to UpsampleNetwork.
        freq_axis_kernel_size (int): Freq-axis kernel_size for the convolution layers after resize.
        cin_pad (int): Sets the kernel size of the input convolution to 2 * cin_pad + 1.
        cin_channels (int): Local conditioning channels.
    """

    def __init__(self, upsample_scales, mode="nearest",
                 freq_axis_kernel_size=1, cin_pad=0,
                 cin_channels=80):
        super(ConvInUpsampleNetwork, self).__init__()
        # Unpadded convolution whose kernel spans cin_pad frames on each side.
        kernel_width = 2 * cin_pad + 1
        self.conv_in = nn.Conv1d(cin_channels, cin_channels, kernel_size=kernel_width,
                                 has_bias=False, pad_mode='pad', padding=0)
        # The upsampling stage is always created with cin_pad=0.
        self.upsample = UpsampleNetwork(upsample_scales, mode, freq_axis_kernel_size,
                                        cin_pad=0, cin_channels=cin_channels)

    def construct(self, c):
        """Apply the input convolution, then upsample the result."""
        return self.upsample(self.conv_in(c))

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save