commit
b82df95b43
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,95 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""export mindir."""
|
||||||
|
import json
|
||||||
|
from os.path import join
|
||||||
|
import argparse
|
||||||
|
from warnings import warn
|
||||||
|
from hparams import hparams, hparams_debug_string
|
||||||
|
from mindspore import context, Tensor
|
||||||
|
from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
|
||||||
|
from wavenet_vocoder import WaveNet
|
||||||
|
from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
|
||||||
|
import numpy as np
|
||||||
|
from src.loss import PredictNet
|
||||||
|
|
||||||
|
# Command-line interface of the MINDIR export script.
parser = argparse.ArgumentParser(description='TTS training')
parser.add_argument('--preset', type=str, default='', help='Path of preset parameters (json).')
# The script writes hparams.json into this directory; the option was read
# below but never declared, which raised AttributeError at runtime.
parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
                    help='Directory where hparams.json is written.')
parser.add_argument('--speaker_id', type=str, default='',
                    help=' Use specific speaker of data in case for multi-speaker datasets.')
parser.add_argument('--pretrain_ckpt', type=str, default='', help='Pretrained checkpoint path')
args = parser.parse_args()

if __name__ == '__main__':
    # Local import: only the script entry point needs it.
    import os

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)

    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None

    # --preset defaults to '', so test truthiness instead of `is not None`;
    # otherwise open('') fails when no preset is given.
    if args.preset:
        with open(args.preset) as f:
            hparams.parse_json(f.read())

    assert hparams.name == "wavenet_vocoder"
    print(hparams_debug_string())

    fs = hparams.sample_rate

    # Persist the effective hyper-parameters next to the exported model.
    os.makedirs(args.checkpoint_dir, exist_ok=True)
    output_json_path = join(args.checkpoint_dir, "hparams.json")
    with open(output_json_path, "w") as f:
        json.dump(hparams.values(), f, indent=2)

    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )

    # Wrap the network in its inference-only cell and switch to eval mode.
    Net = PredictNet(model)
    Net.set_train(False)
    receptive_field = model.receptive_field
    print("Receptive field (samples / ms): {} / {}".format(receptive_field, receptive_field / fs * 1000))
    param_dict = load_checkpoint(args.pretrain_ckpt)
    load_param_into_net(model, param_dict)
    print('Successfully loading the pre-trained model')

    # Random placeholder inputs; they only fix the shapes traced by export().
    x = np.array(np.random.random((2, 256, 10240)), dtype=np.float32)
    c = np.array(np.random.random((2, 80, 44)), dtype=np.float32)
    g = np.array([0, 0], dtype=np.int64)

    export(Net, Tensor(x), Tensor(c), Tensor(g), file_name="WaveNet", file_format='MINDIR')
|
@ -0,0 +1,14 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
@ -0,0 +1,103 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Defined callbacks for WaveNet training.
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
from mindspore.train.callback import Callback
|
||||||
|
from mindspore import Tensor
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class TimeMonitor(Callback):
    """
    Callback that prints the wall-clock duration of every step and epoch.

    Durations are printed in milliseconds.

    Args:
        data_size (int): number of steps in one epoch; used to derive the
            average per-step time at the end of an epoch.
    """

    def __init__(self, data_size):
        super().__init__()
        self.data_size = data_size

    def epoch_begin(self, run_context):
        # Remember when the epoch started.
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        elapsed_ms = (time.time() - self.epoch_time) * 1000
        average_ms = elapsed_ms / self.data_size
        print(f"epoch time: {elapsed_ms}, per step time: {average_ms}", flush=True)

    def step_begin(self, run_context):
        # Remember when the step started.
        self.step_time = time.time()

    def step_end(self, run_context):
        elapsed_ms = (time.time() - self.step_time) * 1000
        print("step time {}".format(elapsed_ms), flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Monitor(Callback):
    """
    Monitor loss, elapsed time and learning rate during training.

    Args:
        lr_init (Tensor or numpy array, optional): per-step learning-rate
            schedule, indexed by global step. When omitted, the learning
            rate is reported as ``nan``.

    Returns:
        None

    Examples:
        >>> Monitor(lr_init=Tensor([0.05] * 100))
    """

    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        # len(None) raised TypeError with the documented default.
        self.lr_init_len = 0 if lr_init is None else len(lr_init)
        # Initialise state here so step/epoch hooks are safe in any order.
        self.losses = []
        self.epoch_time = time.time()
        self.step_time = time.time()

    def _current_lr(self, step_num):
        """Return the scheduled learning rate for 1-based ``step_num`` as a float."""
        if self.lr_init_len == 0:
            return float("nan")
        # Clamp so a run longer than the schedule reports the last value
        # instead of raising IndexError.
        index = min(max(step_num - 1, 0), self.lr_init_len - 1)
        value = self.lr_init[index]
        # Tensor elements expose asnumpy(); numpy elements are used as-is.
        if hasattr(value, "asnumpy"):
            value = value.asnumpy()
        return float(value)

    def epoch_begin(self, run_context):
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()

        # time.time() differences are seconds (no scaling is applied here).
        epoch_seconds = time.time() - self.epoch_time
        per_step_seconds = epoch_seconds / cb_params.batch_num
        avg_loss = np.mean(self.losses) if self.losses else float("nan")
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.6f}".format(epoch_seconds,
                                                                                      per_step_seconds,
                                                                                      avg_loss))

    def step_begin(self, run_context):
        self.step_time = time.time()

    def step_end(self, run_context):
        """Record the step loss and print a progress line."""
        cb_params = run_context.original_args()
        step_seconds = time.time() - self.step_time
        step_loss = cb_params.net_outputs

        # Networks may return (loss, ...); keep only the loss tensor.
        if isinstance(step_loss, (tuple, list)) and step_loss and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())

        self.losses.append(step_loss)
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num

        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.6f}/{:5.6f}], time:[{:5.3f}], lr:[{:.9f}]".format(
            cb_params.cur_epoch_num - 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num,
            step_loss, np.mean(self.losses), step_seconds, self._current_lr(cb_params.cur_step_num)))
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,238 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""loss function definition"""
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
from mindspore import nn, Tensor
|
||||||
|
from mindspore.ops import operations as P
|
||||||
|
from nnmnkwii import preprocessing as P1
|
||||||
|
|
||||||
|
from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw
|
||||||
|
from wavenet_vocoder.mixture import discretized_mix_logistic_loss
|
||||||
|
from wavenet_vocoder.mixture import mix_gaussian_loss
|
||||||
|
from train_pytorch import to_categorical
|
||||||
|
from tqdm import tqdm
|
||||||
|
import audio
|
||||||
|
import librosa
|
||||||
|
import librosa.display
|
||||||
|
matplotlib.use('Agg')
|
||||||
|
|
||||||
|
def sequence_mask(sequence_length, max_len=None):
    """
    Build a float mask marking the valid positions of each sequence.

    Args:
        sequence_length (Tensor): per-sample lengths; converted with
            ``asnumpy()``.
        max_len (int, optional): number of positions in the mask. Defaults to
            the largest value in ``sequence_length``.

    Returns:
        Tensor: float32 mask with a trailing singleton axis; entries are 1.0
        where the position index is smaller than the sample's length.
    """
    lengths = sequence_length.asnumpy()
    steps = np.max(lengths) if max_len is None else max_len
    n_rows = lengths.shape[0]

    # Position indices 0 .. steps-1, repeated for every sample.
    positions = np.arange(steps, dtype=np.int32)
    position_grid = np.tile(positions[np.newaxis, :], (n_rows, 1))
    # Each sample's length, repeated for every position.
    length_grid = np.tile(lengths[:, np.newaxis], (1, steps))

    mask = (position_grid < length_grid).astype(np.float32)
    return Tensor(mask[..., np.newaxis])
|
||||||
|
|
||||||
|
class MaskedCrossEntropyLoss(nn.Cell):
    """
    Mean softmax cross-entropy between logits and sparse integer labels.

    Despite the name, no mask is applied inside this cell.
    """

    def __init__(self):
        super().__init__()
        self.criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    def construct(self, inputs, target):
        return self.criterion(inputs, target)
|
||||||
|
|
||||||
|
|
||||||
|
class DiscretizedMixturelogisticLoss(nn.Cell):
    """
    Mask-weighted average of the discretized mixture-of-logistics loss.

    Args:
        hparams: hyper-parameters; ``quantize_channels`` and
            ``log_scale_min`` are read.
    """

    def __init__(self, hparams):
        super().__init__()
        self.quantize_channels = hparams.quantize_channels
        self.log_scale_min = hparams.log_scale_min
        # Unreduced loss; the reduction over the mask happens in construct().
        self.discretized_mix_logistic_loss = discretized_mix_logistic_loss(
            num_classes=hparams.quantize_channels,
            log_scale_min=hparams.log_scale_min,
            reduce=False,
        )
        self.reduce_sum_op = P.ReduceSum()
        self.reduce_mean_op = P.ReduceMean()

    def construct(self, inputs, target, mask=None):
        # NOTE(review): mask defaults to None but is used unconditionally,
        # so callers are expected to always pass it - confirm.
        per_element = self.discretized_mix_logistic_loss(inputs, target)
        masked_total = self.reduce_sum_op(per_element * mask)
        mask_total = self.reduce_sum_op(mask)
        return masked_total / mask_total
|
||||||
|
|
||||||
|
|
||||||
|
class MixtureGaussianLoss(nn.Cell):
    """
    Mask-weighted average of the mixture-of-Gaussians loss.

    Args:
        hparams: hyper-parameters; ``quantize_channels`` and
            ``log_scale_min`` are read.
    """

    def __init__(self, hparams):
        super().__init__()
        self.quantize_channels = hparams.quantize_channels
        self.log_scale_min = hparams.log_scale_min
        # Unreduced loss; the reduction over the mask happens in construct().
        self.mix_gaussian_loss = mix_gaussian_loss(log_scale_min=hparams.log_scale_min, reduce=False)
        self.reduce_sum_op = P.ReduceSum()
        self.reduce_mean_op = P.ReduceMean()

    def construct(self, inputs, target, mask=None):
        """
        Compute the masked loss.

        Args:
            inputs (Tensor): Predicted distribution
            target (Tensor): Target
            mask (Tensor): Mask

        Returns:
            Tensor: sum of the masked losses divided by the sum of the mask.
        """
        # NOTE(review): mask defaults to None but is used unconditionally,
        # so callers are expected to always pass it - confirm.
        per_element = self.mix_gaussian_loss(inputs, target)
        masked_total = self.reduce_sum_op(per_element * mask)
        mask_total = self.reduce_sum_op(mask)
        return masked_total / mask_total
|
||||||
|
|
||||||
|
|
||||||
|
def save_waveplot(path, y_hat, y_target, sample_rate):
    """
    Save a two-row PNG: target waveform on top, prediction underneath.

    Args:
        path: destination passed to ``plt.savefig``.
        y_hat: predicted waveform.
        y_target: reference waveform.
        sample_rate: sampling rate forwarded to librosa as ``sr``.
    """
    plt.figure(figsize=(16, 6))
    # Row 1 is the target, row 2 the prediction.
    for row, signal in enumerate((y_target, y_hat), start=1):
        plt.subplot(2, 1, row)
        librosa.display.waveplot(signal, sr=sample_rate)
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def eval_model(hparams, global_step, model, x, y, c, g, input_lengths, eval_dir):
    """
    Synthesize one randomly chosen sample of the batch and save the result.

    The model is switched to eval mode, run through ``incremental_forward``
    and the predicted and target waveforms are written to ``eval_dir`` as
    two wav files plus a comparison plot. Used for debugging in this project.

    Args:
        hparams: hyper-parameters.
        global_step (int): step number embedded in the output file names.
        model: network exposing ``set_train`` and ``incremental_forward``.
        x: unused.
        y (Tensor): batch of target signals.
        c (Tensor or None): local conditioning features.
        g (Tensor or None): global conditioning features.
        input_lengths (Tensor): valid length of each sample.
        eval_dir (str): output directory; created if missing.
    """
    model.set_train(False)

    # Pick one sample and trim its target to the valid length.
    idx = np.random.randint(0, len(y))
    length = input_lengths.asnumpy()[idx]
    y_target = np.reshape(y.asnumpy()[idx], (-1))[:length]

    if c is not None:
        expand_op = P.ExpandDims()
        if hparams.upsample_conditional_features:
            n_frames = int(length // audio.get_hop_size() + hparams.cin_pad * 2)
            c = expand_op(c[idx, :, :n_frames], 0)
        else:
            c = expand_op(c[idx, :, :length], 0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))

    if g is not None:
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    quantized = is_mulaw_quantize(hparams.input_type)

    # First input fed to the generator: a "dummy silence" value.
    if quantized:
        initial_value = P1.mulaw_quantize(0, hparams.quantize_channels - 1)
        one_hot = to_categorical(initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Tensor(np.reshape(one_hot, (1, 1, hparams.quantize_channels)))
    else:
        if is_mulaw(hparams.input_type):
            initial_value = P1.mulaw(0.0, hparams.quantize_channels)
        else:
            initial_value = 0.0
        initial_input = Tensor(np.ones((1, 1, 1)) * initial_value)

    # Run the model in fast eval mode
    y_hat = model.incremental_forward(initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm,
                                      log_scale_min=hparams.log_scale_min)

    # Undo the input encoding on both signals.
    if quantized:
        classes = np.reshape(np.argmax(y_hat, 1), (-1))
        y_hat = P1.inv_mulaw_quantize(classes, hparams.quantize_channels - 1)
        y_target = P1.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = P1.inv_mulaw(np.reshape(y_hat, (-1)), hparams.quantize_channels)
        y_target = P1.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = np.reshape(y_hat, (-1))

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    wav_outputs = (
        ("step{:09d}_predicted.wav".format(global_step), y_hat),
        ("step{:09d}_target.wav".format(global_step), y_target),
    )
    for file_name, signal in wav_outputs:
        librosa.output.write_wav(os.path.join(eval_dir, file_name), signal, sr=hparams.sample_rate)

    # Save figure
    figure_path = os.path.join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(figure_path, y_hat, y_target, hparams.sample_rate)
|
||||||
|
|
||||||
|
|
||||||
|
class PredictNet(nn.Cell):
    """
    Inference wrapper around a network.

    Args:
        network (Cell): wrapped network, called as ``network(x, c, g, False)``.

    Returns:
        The output of the wrapped network.
    """

    def __init__(self, network):
        super().__init__(auto_prefix=False)
        self.network = network

    def construct(self, x, c, g):
        return self.network(x, c, g, False)
|
||||||
|
|
||||||
|
|
||||||
|
class NetWithLossClass(nn.Cell):
    """
    Network bundled with the loss matching its output type.

    Mu-law quantized input uses ``MaskedCrossEntropyLoss``; otherwise
    ``hparams.output_distribution`` selects the mixture loss.

    Args:
        network (Cell): Pre-defined WaveNet.
        hparams (optional): Parameters.

    Returns:
        Tensor, loss tensor.

    Raises:
        RuntimeError: if the output distribution is neither "Logistic" nor
            "Normal" for non-quantized input.
    """

    def __init__(self, network, hparams):
        super().__init__(auto_prefix=False)
        self.network = network
        self.hparams = hparams
        self.ReduceMean_false = P.ReduceMean(keep_dims=False)
        self.expand_op = P.ExpandDims()
        self.transpose_op = P.Transpose()
        self.reshape_op = P.Reshape()
        self.is_mulaw_quant = is_mulaw_quantize(hparams.input_type)

        if self.is_mulaw_quant:
            self.criterion = MaskedCrossEntropyLoss()
        elif hparams.output_distribution == "Logistic":
            self.criterion = DiscretizedMixturelogisticLoss(hparams)
        elif hparams.output_distribution == "Normal":
            self.criterion = MixtureGaussianLoss(hparams)
        else:
            self.criterion = None
            raise RuntimeError(
                "Not supported output distribution type: {}".format(hparams.output_distribution))

    def construct(self, x, y, c, g, input_lengths, mask):
        y_hat = self.network(x, c, g, False)
        # The output at step t is scored against the target at step t + 1,
        # hence the last output and the first target are dropped.
        if self.is_mulaw_quant:
            logits = self.transpose_op(y_hat[:, :, :-1], (0, 2, 1))
            logits = self.reshape_op(logits, (-1, logits.shape[-1]))
            labels = self.reshape_op(y[:, 1:, 0], (-1,))
            loss = self.criterion(logits, labels)
        else:
            loss = self.criterion(y_hat[:, :, :-1], y[:, 1:, :], mask[:, 1:, :])
        return loss
|
@ -0,0 +1,41 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""learning rate generator"""
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def get_lr(init_lr, total_epoch, step_per_epoch,
           anneal_rate=0.5,
           anneal_interval=200000):
    """
    Generate a step-wise annealed learning-rate schedule.

    The rate starts at ``init_lr`` and is multiplied by ``anneal_rate`` once
    every ``anneal_interval`` steps.

    Args:
        init_lr (float): Initial learning rate
        total_epoch (int): Total epoch
        step_per_epoch (int): Step per epoch
        anneal_rate (float): anneal rate
        anneal_interval (int): anneal interval

    Returns:
        ndarray: float32 array with one learning rate per training step
        (``total_epoch * step_per_epoch`` entries).
    """
    n_steps = total_epoch * step_per_epoch
    schedule = [init_lr * anneal_rate ** (step // anneal_interval) for step in range(n_steps)]
    return np.array(schedule).astype(np.float32)
|
@ -0,0 +1,135 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""Train the WaveNet vocoder."""
|
||||||
|
import os
|
||||||
|
from os.path import join
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from warnings import warn
|
||||||
|
from hparams import hparams, hparams_debug_string
|
||||||
|
|
||||||
|
from mindspore import context, Tensor
|
||||||
|
from mindspore.context import ParallelMode
|
||||||
|
from mindspore.communication.management import init, get_rank, get_group_size
|
||||||
|
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
|
||||||
|
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||||
|
from mindspore.nn.optim import Adam
|
||||||
|
from mindspore.nn import TrainOneStepCell
|
||||||
|
from mindspore.train import Model
|
||||||
|
from src.lr_generator import get_lr
|
||||||
|
from src.dataset import get_data_loaders
|
||||||
|
from src.loss import NetWithLossClass
|
||||||
|
from src.callback import Monitor
|
||||||
|
from wavenet_vocoder import WaveNet
|
||||||
|
from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
|
||||||
|
|
||||||
|
# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='TTS training')
parser.add_argument('--data_path', type=str, required=True, default='',
                    help='Directory contains preprocessed features.')
parser.add_argument('--preset', type=str, required=True, default='', help='Path of preset parameters (json).')
parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
                    help='Directory where to save model checkpoints [default: checkpoints].')
parser.add_argument('--checkpoint', type=str, default='', help='Restore model from checkpoint path if given.')
parser.add_argument('--speaker_id', type=str, default='',
                    help=' Use specific speaker of data in case for multi-speaker datasets.')
parser.add_argument('--is_distributed', action="store_true", default=False, help='Distributed training')
args = parser.parse_args()

if __name__ == '__main__':
    if args.is_distributed:
        init('nccl')
        rank_id = get_rank()
        group_size = get_group_size()
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
        rank_id = 0
        group_size = 1

    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None
    # An empty path cannot be opened, so test truthiness rather than `is not None`.
    if args.preset:
        with open(args.preset) as f:
            hparams.parse_json(f.read())

    assert hparams.name == "wavenet_vocoder"
    print(hparams_debug_string())
    fs = hparams.sample_rate
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    # Persist the effective hyper-parameters next to the checkpoints.
    output_json_path = join(args.checkpoint_dir, "hparams.json")
    with open(output_json_path, "w") as f:
        json.dump(hparams.values(), f, indent=2)

    # NOTE(review): the raw string args.speaker_id is passed here, not the
    # parsed `speaker_id` above - confirm which one get_data_loaders expects.
    data_loaders = get_data_loaders(args.data_path, args.speaker_id, hparams=hparams, rank_id=rank_id,
                                    group_size=group_size)
    step_size_per_epoch = data_loaders.get_dataset_size()

    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )
    loss_net = NetWithLossClass(model, hparams)
    lr = get_lr(hparams.optimizer_params["lr"], hparams.nepochs, step_size_per_epoch)
    lr = Tensor(lr)

    if args.checkpoint != '':
        # The parser declares --checkpoint; the previously used
        # args.pre_trained_model_path does not exist and raised AttributeError.
        param_dict = load_checkpoint(args.checkpoint)
        load_param_into_net(model, param_dict)
        print('Successfully loading the pre-trained model')

    weights = model.trainable_params()
    optimizer = Adam(weights, learning_rate=lr, loss_scale=1024.)
    train_net = TrainOneStepCell(loss_net, optimizer)

    # From here on `model` is the training wrapper, not the WaveNet cell.
    model = Model(train_net)
    lr_cb = Monitor(lr)
    callback_list = [lr_cb]
    if args.is_distributed:
        # One checkpoint sub-directory per rank.
        ckpt_path = os.path.join(args.checkpoint_dir, 'ckpt_' + str(get_rank()) + '/')
    else:
        ckpt_path = args.checkpoint_dir
    # Save once per epoch and keep the ten most recent checkpoints.
    config_ck = CheckpointConfig(save_checkpoint_steps=step_size_per_epoch, keep_checkpoint_max=10)
    ckpt_cb = ModelCheckpoint(prefix='wavenet', directory=ckpt_path, config=config_ck)
    callback_list.append(ckpt_cb)
    model.train(hparams.nepochs, data_loaders, callbacks=callback_list)
|
@ -0,0 +1,17 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""init"""
|
||||||
|
from __future__ import with_statement, print_function, absolute_import
|
||||||
|
from .wavenet import WaveNet
|
@ -0,0 +1,176 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""Extended Conv1D."""
|
||||||
|
|
||||||
|
import math
|
||||||
|
from mindspore import nn, Tensor
|
||||||
|
from mindspore.ops import operations as P
|
||||||
|
import mindspore.common.dtype as mstype
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
class Conv1d(nn.Conv1d):
    """
    Extended nn.Conv1d to adapt to incremental dilated convolutions.

    During training the regular Conv1d forward is used; during evaluation
    ``incremental_forward`` is called.
    To improve the inference speed, the tensor can be converted to numpy and
    the remaining calculation is then done in numpy. These operations are
    meant to be replaced with MindSpore ops in the future; currently some
    operations are not supported by MindSpore and a mixed use of numpy and
    MindSpore takes a long time.

    Note:
        ``get_weight`` / ``get_bias`` cache the values used by the incremental
        path in whichever representation (Tensor or ndarray) was requested
        first. The code does not convert between the two, so stick to a single
        ``is_numpy`` mode per instance.
    """

    def __init__(self, *args, **kwargs):
        super(Conv1d, self).__init__(*args, **kwargs)
        self.clear_buffer()
        self._linearized_weight = None
        self.transpose_op = P.Transpose()
        self.reshape_op = P.Reshape()
        self.squeeze_op = P.Squeeze(-2)
        self.zeros = P.Zeros()
        self.concat_op = P.Concat(axis=1)
        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()
        # Lazily filled caches used by the incremental forward paths.
        self.get_weight = None
        self.get_bias = None

    def incremental_forward(self, inputs, is_numpy=True):
        """
        Dispatch one incremental step to the numpy or the MindSpore path.

        Args:
            inputs: B x T x C
            is_numpy (bool): Use the numpy implementation when True,
                otherwise the MindSpore-op implementation.

        Returns:
            ndarray when ``is_numpy`` is True, otherwise Tensor.
        """
        if is_numpy:
            return self.incremental_forward_numpy(inputs)
        return self.incremental_forward_pynative(inputs)

    def incremental_forward_pynative(self, inputs):
        """
        Incremental forward implemented with MindSpore ops.

        Args:
            inputs: B x T x C

        Returns:
            Tensor, reshaped to (B, 1, -1)

        Raises:
            RuntimeError: If the cell is in training mode.
        """
        # input: (B, T, C)
        if self.training:
            raise RuntimeError('incremental_forward only supports eval mode')

        if self.get_weight is None:
            self.get_weight = self._get_linearized_weight()

        if self.get_bias is None and self.bias is not None:
            self.get_bias = self.bias

        # Note mindspore uses Conv2D to construct Conv1D
        kw = self.kernel_size[1]
        dilation = self.dilation[1]

        bsz = inputs.shape[0]  # input: bsz x len x dim
        if kw > 1:
            if self.input_buffer is None:
                # The buffer spans the full dilated receptive field.
                init_buffer = self.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), mstype.float32)
                self.input_buffer = self.concat_op((init_buffer[:, 1:, :], inputs[:, 0:1, :]))
            else:
                # shift buffer: drop the oldest step, append the newest one
                self.input_buffer = self.concat_op((self.input_buffer[:, 1:, :], inputs[:, 0:1, :]))
            inputs = self.input_buffer
            if dilation > 1:
                # keep only the time steps the dilated kernel actually touches
                inputs = inputs[:, 0::dilation, :]

        output = self.matmul(self.reshape_op(inputs, (bsz, -1)), self.get_weight)
        if self.bias is not None:
            output = self.bias_add(output, self.bias)
        return self.reshape_op(output, (bsz, 1, -1))

    def incremental_forward_numpy(self, inputs):
        """
        Incremental forward implemented with numpy.

        Args:
            inputs: B x T x C

        Returns:
            ndarray, reshaped to (B, 1, -1)

        Raises:
            RuntimeError: If the cell is in training mode.
        """
        # input: (B, T, C)
        if self.training:
            raise RuntimeError('incremental_forward only supports eval mode')

        if self.get_weight is None:
            weight = self._get_linearized_weight()
            self.get_weight = weight.asnumpy()

        if self.get_bias is None and self.bias is not None:
            bias = self.bias
            self.get_bias = bias.asnumpy()

        # Note mindspore uses Conv2D to construct Conv1D
        kw = self.kernel_size[1]
        dilation = self.dilation[1]

        bsz = inputs.shape[0]  # input: bsz x len x dim
        if kw > 1:
            if self.input_buffer is None:
                # The buffer spans the full dilated receptive field.
                self.input_buffer = np.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), dtype=np.float32)
            else:
                # shift buffer
                self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :]
            # append next
            self.input_buffer[:, -1, :] = inputs[:, -1, :]
            inputs = self.input_buffer
            if dilation > 1:
                # keep only the time steps the dilated kernel actually touches
                inputs = inputs[:, 0::dilation, :]
        output = inputs.reshape(bsz, -1).dot(self.get_weight.T)
        if self.bias is not None:
            output = output + np.expand_dims(self.get_bias, 0)
        return np.reshape(output, (bsz, 1, -1))

    def clear_buffer(self):
        """Forget the buffered input history used by the incremental paths."""
        self.input_buffer = None

    def _get_linearized_weight(self):
        """
        Return the kernel flattened to (out_channels, -1), computing it once.

        The squeeze/transpose/reshape ops only run on a cache miss; later
        calls return the cached tensor directly.
        """
        if self._linearized_weight is None:
            weight = self.squeeze_op(self.weight)
            # Note mindspore uses Conv2D to construct Conv1D
            kw = self.kernel_size[1]
            if weight.shape == (self.out_channels, self.in_channels, kw):
                weight = self.transpose_op(weight, (0, 2, 1))
            else:
                weight = self.transpose_op(weight, (2, 0, 1))
            self._linearized_weight = self.reshape_op(weight, (self.out_channels, -1))
        return self._linearized_weight

    def _clear_linearized_weight(self, *args):
        """
        Drop the cached linearized weight.

        ``get_weight`` is derived from the linearized weight, so it is reset
        as well; otherwise the incremental paths would keep using the stale
        copy.
        """
        self._linearized_weight = None
        self.get_weight = None

    def _initialize_weights(self):
        """
        Weight initialization.

        Conv1d weights are drawn from a zero-mean normal distribution with
        std = sqrt(std_mul * 0.1 / (kernel_width * in_channels)); biases are
        zeroed. BatchNorm2d cells get gamma = 1 and beta = 0.
        """
        self.init_parameters_data()
        std_mul = 4.0
        for _, m in self.cells_and_names():
            if isinstance(m, nn.Conv1d):
                # fan-in of the cell being initialized
                std = math.sqrt((std_mul * 0.1) / (m.kernel_size[1] * m.in_channels))
                m.weight.set_data(Tensor(np.random.normal(0, std, m.weight.data.shape).astype("float32")))
                if m.bias is not None:
                    m.bias.set_data(
                        Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
            elif isinstance(m, nn.BatchNorm2d):
                m.gamma.set_data(
                    Tensor(np.ones(m.gamma.data.shape, dtype="float32")))
                m.beta.set_data(
                    Tensor(np.zeros(m.beta.data.shape, dtype="float32")))
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,213 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""
|
||||||
|
modules for wavenet
|
||||||
|
"""
|
||||||
|
from __future__ import with_statement, print_function, absolute_import
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
from wavenet_vocoder import conv
|
||||||
|
from mindspore import nn
|
||||||
|
from mindspore.ops import operations as P
|
||||||
|
|
||||||
|
|
||||||
|
def Conv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """
    Build a ``conv.Conv1d`` layer.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        kernel_size (int): Kernel size.
        dropout (float): Accepted for signature compatibility; not used here.
        **kwargs: Forwarded unchanged to ``conv.Conv1d``.

    Returns:
        The constructed ``conv.Conv1d`` cell.
    """
    return conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def Conv1d1x1(in_channels, out_channels, has_bias=True):
    """Build a Conv1d with kernel size 1, no padding and no dilation."""
    pointwise_options = dict(kernel_size=1, pad_mode='pad', padding=0, dilation=1, has_bias=has_bias)
    return Conv1d(in_channels, out_channels, **pointwise_options)
|
||||||
|
|
||||||
|
|
||||||
|
def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01):
    """
    Build an ``nn.Embedding`` layer.

    Args:
        num_embeddings (int): Passed to ``nn.Embedding``.
        embedding_dim (int): Passed to ``nn.Embedding``.
        padding_idx: Passed to ``nn.Embedding`` as ``padding_idx``.
        std (float): Accepted for signature compatibility; not used here.

    Returns:
        The constructed ``nn.Embedding`` cell.
    """
    return nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
|
||||||
|
|
||||||
|
|
||||||
|
def _conv1x1_forward(conv_, x, is_incremental, is_numpy=True):
|
||||||
|
"""
|
||||||
|
Conv1x1 forward
|
||||||
|
"""
|
||||||
|
if is_incremental:
|
||||||
|
x = conv_.incremental_forward(x, is_numpy=is_numpy)
|
||||||
|
else:
|
||||||
|
x = conv_(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class ResidualConv1dGLU(nn.Cell):
    """Residual dilated conv1d with gated activation units

    Args:
        residual_channels (int): Residual input / output channels
        gate_channels (int): Gated activation channels.
        kernel_size (int): Kernel size
        skip_out_channels (int): Skip connection channels. If None, it will set to the same as residual_channels.
        bias (bool): Whether the dilated conv and the output / skip 1x1 convs use a bias.
        cin_channels (int): Local conditioning channels. If given negative value, local conditioning is disabled.
        gin_channels (int): Global conditioning channels. If given negative value, global conditioning is disabled.
        dropout (float): Dropout rate.
        padding (int): Padding for convolution layers. If None, padding value will be computed according to dilation
            and kernel_size.
        dilation (int): Dilation factor.
        causal (bool): If True, use causal padding and trim the future time steps after the convolution.

    """

    def __init__(self, residual_channels=None, gate_channels=None, kernel_size=None, skip_out_channels=None, bias=True,
                 dropout=1 - 0.95, dilation=1, cin_channels=-1, gin_channels=-1, padding=None, causal=True):
        super(ResidualConv1dGLU, self).__init__()
        self.dropout = dropout
        self.dropout_op = nn.Dropout(keep_prob=1. - self.dropout)
        # Incremental (eval) tensors are split on the last axis, training tensors on axis 1.
        self.eval_split_op = P.Split(axis=-1, output_num=2)
        self.train_split_op = P.Split(axis=1, output_num=2)
        self.tanh = P.Tanh()
        self.sigmoid = P.Sigmoid()
        self.mul = P.Mul()
        self.add = P.TensorAdd()

        if skip_out_channels is None:
            skip_out_channels = residual_channels
        if padding is None:
            if causal:
                padding = (kernel_size - 1) * dilation
            else:
                padding = (kernel_size - 1) // 2 * dilation
        self.causal = causal

        self.conv = Conv1d(residual_channels, gate_channels, kernel_size, pad_mode='pad',
                           padding=padding, dilation=dilation, has_bias=bias)

        # local conditioning
        if cin_channels > 0:
            self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, has_bias=False)
        else:
            self.conv1x1c = None

        # global conditioning
        if gin_channels > 0:
            self.conv1x1g = Conv1d(gin_channels, gate_channels, has_bias=False, kernel_size=1, dilation=1)
        else:
            self.conv1x1g = None

        # The gate output is split in two halves, so the 1x1 convs see half the gate channels.
        gate_out_channels = gate_channels // 2
        self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, has_bias=bias)
        self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, has_bias=bias)
        self.factor = math.sqrt(0.5)

    def construct(self, x, c=None, g=None):
        """

        Args:
            x(Tensor): One-hot audio signal, the shape is B x C x T
            c(Tensor): local conditional feature, the shape is B x cin_channels x T
            g(Tensor): global conditional feature, not used currently

        Returns:
            Tuple of two Tensors: the residual output and the skip-connection output

        """

        residual = x
        x = self.dropout_op(x)
        x = self.conv(x)
        # remove future time steps
        x = x[:, :, :residual.shape[-1]] if self.causal else x
        split_op = self.train_split_op

        a, b = split_op(x)

        # local conditioning
        if c is not None:
            c = _conv1x1_forward(self.conv1x1c, c, is_incremental=False)
            ca, cb = split_op(c)
            a, b = a + ca, b + cb

        # global conditioning
        if g is not None:
            g = _conv1x1_forward(self.conv1x1g, g, is_incremental=False)
            # use the same split op as the other branches (there is no self.split attribute)
            ga, gb = split_op(g)
            a, b = a + ga, b + gb

        x = self.mul(self.tanh(a), self.sigmoid(b))

        # For skip connection
        s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=False)

        # For residual connection
        x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=False)

        x = self.add(x, residual) * self.factor
        return x, s

    def sigmoid_numpy(self, x):
        """Element-wise logistic sigmoid computed with numpy."""
        return 1. / (1 + np.exp(-x))

    def incremental_forward(self, x, c=None, g=None, is_numpy=True):
        """
        Incremental forward. Used for inference stage

        Args:
            x (Tensor): One-hot audio signal. The gate is split along the last axis here, so the
                layout appears to be channels-last (B x T x C) -- TODO confirm against the caller
            c (Tensor): local conditional feature, same layout as x
            g (Tensor): global conditional feature, not used currently
            is_numpy (bool): Run the step with numpy when True, with MindSpore ops otherwise

        Returns:
            Tuple (x, s) of the residual output and the skip output; ndarray when is_numpy is True,
            Tensor otherwise
        """
        residual = x
        x = self.conv.incremental_forward(x, is_numpy=is_numpy)
        if is_numpy:
            a, b = np.split(x, indices_or_sections=2, axis=-1)
        else:
            a, b = self.eval_split_op(x)

        # local conditioning
        if c is not None:
            c = _conv1x1_forward(self.conv1x1c, c, is_incremental=True, is_numpy=is_numpy)
            if is_numpy:
                ca, cb = np.split(c, indices_or_sections=2, axis=-1)
            else:
                ca, cb = self.eval_split_op(c)
            a, b = a + ca, b + cb

        # global conditioning
        if g is not None:
            g = _conv1x1_forward(self.conv1x1g, g, is_incremental=True, is_numpy=is_numpy)
            if is_numpy:
                ga, gb = np.split(g, indices_or_sections=2, axis=-1)
            else:
                # split the global feature itself, not the local feature c
                ga, gb = self.eval_split_op(g)
            a, b = a + ga, b + gb

        if is_numpy:
            x = np.tanh(a) * self.sigmoid_numpy(b)
        else:
            x = self.mul(self.tanh(a), self.sigmoid(b))

        # For skip connection
        s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=True, is_numpy=is_numpy)

        # For residual connection
        x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=True, is_numpy=is_numpy)

        x = (x + residual) * self.factor
        return x, s

    def clear_buffer(self):
        """Clear the incremental input buffers of every convolution owned by this block."""
        for c in [self.conv, self.conv1x1_out, self.conv1x1_skip,
                  self.conv1x1c, self.conv1x1g]:
            if c is not None:
                c.clear_buffer()
|
@ -0,0 +1,118 @@
|
|||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
"""
|
||||||
|
Upsampling
|
||||||
|
|
||||||
|
"""
|
||||||
|
from __future__ import with_statement, print_function, absolute_import
|
||||||
|
import numpy as np
|
||||||
|
from mindspore import nn
|
||||||
|
from mindspore.ops import operations as P
|
||||||
|
|
||||||
|
|
||||||
|
class Resize(nn.Cell):
    """
    Resize a 4-D input Tensor by integer scale factors using nearest-neighbor interpolation.

    Args:
        x_scale (int): Multiplier applied to the last dimension.
        y_scale (int): Multiplier applied to the second-to-last dimension.
        mode (str): Stored on the cell; the resize op used is always nearest neighbor.
    """

    def __init__(self, x_scale, y_scale, mode="nearest"):
        super(Resize, self).__init__()
        self.x_scale = x_scale
        self.y_scale = y_scale
        self.mode = mode

    def construct(self, x):
        """Return ``x`` resized to (y_scale * H, x_scale * W) over its last two dimensions."""
        _, _, height, width = x.shape
        target_size = (self.y_scale * height, self.x_scale * width)
        return P.ResizeNearestNeighbor(target_size)(x)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_activation(upsample_activation):
    """Look up the attribute of ``mindspore.nn`` named by ``upsample_activation``."""
    return getattr(nn, upsample_activation)
|
||||||
|
|
||||||
|
|
||||||
|
class UpsampleNetwork(nn.Cell):
    """
    Upsample a conditioning feature with a stack of (Resize, Conv2d) pairs.

    Args:
        upsample_scales (list): One scale per stage, applied to the last dimension.
        mode (str): Forwarded to each ``Resize`` stage.
        freq_axis_kernel_size (int): Kernel size of each convolution along the second-to-last axis.
        cin_pad (int): Only used to compute ``self.indent``.
        cin_channels (int): Accepted for signature compatibility; not used here.
    """

    def __init__(self, upsample_scales, mode="nearest",
                 freq_axis_kernel_size=1, cin_pad=0, cin_channels=80):
        super(UpsampleNetwork, self).__init__()
        self.expand_op = P.ExpandDims()
        self.squeeze_op = P.Squeeze(1)
        self.indent = cin_pad * np.prod(upsample_scales)

        # Identical for every stage, so it is computed once.
        freq_pad = (freq_axis_kernel_size - 1) // 2
        stages = []
        for scale in upsample_scales:
            resize = Resize(scale, 1, mode)
            smooth = nn.Conv2d(1, 1, kernel_size=(freq_axis_kernel_size, scale * 2 + 1), has_bias=False,
                               pad_mode='pad', padding=(freq_pad, freq_pad, scale, scale))
            stages.extend([resize, smooth])
        self.up_layers = nn.CellList(stages)

    def construct(self, c):
        """

        Args:
            c (Tensor): Local conditioning feature

        Returns:
            Tensor: Upsampling feature

        """
        # B x 1 x C x T
        feature = self.expand_op(c, 1)
        for layer in self.up_layers:
            feature = layer(feature)
        # B x C x T
        # NOTE: the output is not cropped by self.indent.
        return self.squeeze_op(feature)
|
||||||
|
|
||||||
|
|
||||||
|
class ConvInUpsampleNetwork(nn.Cell):
    """Upsample Network

    An unpadded, bias-free Conv1d over the conditioning feature followed by an UpsampleNetwork.

    Args:
        upsample_scales (list): Upsample_scales list.
        mode (str): Resize mode, default is NearestNeighbor.
        freq_axis_kernel_size (int): Freq-axis kernel_size for the convolution layers after resize.
        cin_pad (int): The input convolution uses kernel size 2 * cin_pad + 1.
        cin_channels (int): Local conditioning channels.

    """

    def __init__(self, upsample_scales, mode="nearest",
                 freq_axis_kernel_size=1, cin_pad=0,
                 cin_channels=80):
        super(ConvInUpsampleNetwork, self).__init__()
        kernel_width = 2 * cin_pad + 1
        self.conv_in = nn.Conv1d(cin_channels, cin_channels, kernel_size=kernel_width, has_bias=False,
                                 pad_mode='pad', padding=0)
        # The inner network is always built with cin_pad=0.
        self.upsample = UpsampleNetwork(upsample_scales, mode, freq_axis_kernel_size, cin_pad=0,
                                        cin_channels=cin_channels)

    def construct(self, c):
        """Apply the input convolution, then upsample the result."""
        return self.upsample(self.conv_in(c))
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue