parent
cfe8d6f32a
commit
acd40e37e2
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,95 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""export mindir."""
|
||||
import json
|
||||
from os.path import join
|
||||
import argparse
|
||||
from warnings import warn
|
||||
from hparams import hparams, hparams_debug_string
|
||||
from mindspore import context, Tensor
|
||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
|
||||
from wavenet_vocoder import WaveNet
|
||||
from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
|
||||
import numpy as np
|
||||
from src.loss import PredictNet
|
||||
|
||||
# Command-line interface of the MINDIR export script.
parser = argparse.ArgumentParser(description='TTS training')
parser.add_argument('--preset', type=str, default='', help='Path of preset parameters (json).')
parser.add_argument('--speaker_id', type=str, default='',
                    help=' Use specific speaker of data in case for multi-speaker datasets.')
parser.add_argument('--pretrain_ckpt', type=str, default='', help='Pretrained checkpoint path')
# The script read args.checkpoint_dir without ever declaring it, which raised
# AttributeError. It is now an optional argument; the hparams dump is skipped when empty.
parser.add_argument('--checkpoint_dir', type=str, default='',
                    help='Existing directory in which to save hparams.json (skipped when empty).')
args = parser.parse_args()

if __name__ == '__main__':

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)

    # Not used below; kept so that a non-integer --speaker_id is still rejected early.
    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None

    # --preset defaults to '', so an "is not None" test was always true and
    # open('') failed when no preset was supplied.
    if args.preset:
        with open(args.preset) as f:
            hparams.parse_json(f.read())

    assert hparams.name == "wavenet_vocoder"
    print(hparams_debug_string())

    fs = hparams.sample_rate

    # Record the effective hyper-parameters only when a target directory was given.
    if args.checkpoint_dir:
        output_json_path = join(args.checkpoint_dir, "hparams.json")
        with open(output_json_path, "w") as f:
            json.dump(hparams.values(), f, indent=2)

    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    # The upsampling sub-network receives the conditioning settings through its own dict.
    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )

    Net = PredictNet(model)
    Net.set_train(False)
    receptive_field = model.receptive_field
    print("Receptive field (samples / ms): {} / {}".format(receptive_field, receptive_field / fs * 1000))
    param_dict = load_checkpoint(args.pretrain_ckpt)
    load_param_into_net(model, param_dict)
    print('Successfully loading the pre-trained model')

    # Random placeholder inputs; export only needs them to fix the graph's input signature.
    # NOTE(review): the shapes (2, 256, 10240) / (2, 80, 44) are hard-coded and presumably
    # must agree with the preset (quantize channels, mel bins, hop size) - confirm.
    x = np.array(np.random.random((2, 256, 10240)), dtype=np.float32)
    c = np.array(np.random.random((2, 80, 44)), dtype=np.float32)
    g = np.array([0, 0], dtype=np.int64)

    export(Net, Tensor(x), Tensor(c), Tensor(g), file_name="WaveNet", file_format='MINDIR')
|
@ -0,0 +1,14 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
@ -0,0 +1,103 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Defined callbacks for WaveNet training.
|
||||
"""
|
||||
import time
|
||||
from mindspore.train.callback import Callback
|
||||
from mindspore import Tensor
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TimeMonitor(Callback):
    """
    Callback that prints the wall-clock duration of every epoch and every step.

    Args:
        data_size (int): number of steps in one epoch; used as the divisor
            when reporting the average time per step.
    """

    def __init__(self, data_size):
        super().__init__()
        self.data_size = data_size

    def epoch_begin(self, run_context):
        # Remember when the epoch started.
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        # Durations are reported in milliseconds.
        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        report = "epoch time: {0}, per step time: {1}".format(
            epoch_mseconds, epoch_mseconds / self.data_size)
        print(report, flush=True)

    def step_begin(self, run_context):
        # Remember when the step started.
        self.step_time = time.time()

    def step_end(self, run_context):
        elapsed = time.time() - self.step_time
        step_mseconds = elapsed * 1000
        print(f"step time {step_mseconds}", flush=True)
|
||||
|
||||
|
||||
class Monitor(Callback):
    """
    Callback that prints loss, elapsed time and learning rate while training.

    Args:
        lr_init (Tensor or numpy.ndarray, optional): per-step learning-rate
            schedule, indexed by ``cur_step_num - 1``. When it is omitted, or
            the step falls outside the schedule, the learning rate is
            reported as NaN instead of raising.

    Examples:
        >>> Monitor(lr_init=Tensor([0.05] * 100))
    """

    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        # len(None) raised TypeError for the documented default value.
        self.lr_init_len = 0 if lr_init is None else len(lr_init)
        # Defined here so the attributes exist before the first epoch_begin/step_begin.
        self.losses = []
        self.epoch_time = time.time()
        self.step_time = time.time()

    def _current_lr(self, step_index):
        """Return the scheduled learning rate at ``step_index`` as a float, or NaN if unknown."""
        if self.lr_init is None or not 0 <= step_index < self.lr_init_len:
            return float("nan")
        value = self.lr_init[step_index]
        # Indexing a MindSpore Tensor yields a Tensor; numpy schedules yield a numpy scalar.
        if isinstance(value, Tensor):
            value = value.asnumpy()
        return float(value)

    def epoch_begin(self, run_context):
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()

        # time.time() differences are seconds; no millisecond conversion is applied here.
        epoch_seconds = time.time() - self.epoch_time
        per_step_seconds = epoch_seconds / cb_params.batch_num
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.6f}".format(
            epoch_seconds, per_step_seconds, np.mean(self.losses)))

    def step_begin(self, run_context):
        self.step_time = time.time()

    def step_end(self, run_context):
        """Record the step loss and print a one-line progress report."""
        cb_params = run_context.original_args()
        step_seconds = time.time() - self.step_time
        step_loss = cb_params.net_outputs

        # The network may return (loss, ...); the first element is taken as the loss.
        if isinstance(step_loss, (tuple, list)) and step_loss and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())

        self.losses.append(step_loss)
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num

        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.6f}/{:5.6f}], time:[{:5.3f}], lr:[{:.9f}]".format(
            cb_params.cur_epoch_num - 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num,
            step_loss, np.mean(self.losses), step_seconds,
            self._current_lr(cb_params.cur_step_num - 1)))
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,238 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""loss function definition"""
|
||||
import os
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
from mindspore import nn, Tensor
|
||||
from mindspore.ops import operations as P
|
||||
from nnmnkwii import preprocessing as P1
|
||||
|
||||
from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw
|
||||
from wavenet_vocoder.mixture import discretized_mix_logistic_loss
|
||||
from wavenet_vocoder.mixture import mix_gaussian_loss
|
||||
from train_pytorch import to_categorical
|
||||
from tqdm import tqdm
|
||||
import audio
|
||||
import librosa
|
||||
import librosa.display
|
||||
matplotlib.use('Agg')
|
||||
|
||||
def sequence_mask(sequence_length, max_len=None):
    """
    Build a 0/1 mask that marks the positions lying inside each sequence.

    Args:
        sequence_length (Tensor): sequence lengths, read through ``asnumpy()``;
            assumed to be 1-D with one entry per batch item.
        max_len (int, optional): width of the mask. Defaults to the largest
            value in ``sequence_length``.

    Returns:
        Tensor: float32 mask with a trailing singleton axis; entry ``[b, t, 0]``
        is 1.0 when ``t < sequence_length[b]`` and 0.0 otherwise.
    """
    lengths = sequence_length.asnumpy()
    if max_len is None:
        max_len = np.max(lengths)
    positions = np.arange(max_len, dtype=np.int32)
    # Broadcasting (1, max_len) against (batch, 1) replaces the explicit tiling.
    inside = positions[np.newaxis, :] < lengths[:, np.newaxis]
    return Tensor(np.expand_dims(inside.astype(np.float32), -1))
|
||||
|
||||
class MaskedCrossEntropyLoss(nn.Cell):
    """
    Cross-entropy criterion over class logits and sparse integer labels.

    Thin wrapper around ``nn.SoftmaxCrossEntropyWithLogits`` with sparse labels
    and mean reduction. Despite the name, no mask is applied in this cell.
    """

    def __init__(self):
        super().__init__()
        self.criterion = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)

    def construct(self, inputs, target):
        # Mean cross-entropy between the logits and the sparse labels.
        return self.criterion(inputs, target)
|
||||
|
||||
|
||||
class DiscretizedMixturelogisticLoss(nn.Cell):
    """
    Masked mean of the discretized mixture-of-logistics loss.

    Args:
        hparams: hyper-parameters providing ``quantize_channels`` and
            ``log_scale_min``.
    """

    def __init__(self, hparams):
        super().__init__()
        self.quantize_channels = hparams.quantize_channels
        self.log_scale_min = hparams.log_scale_min
        # Unreduced loss; the masked reduction is done in construct().
        self.discretized_mix_logistic_loss = discretized_mix_logistic_loss(
            num_classes=hparams.quantize_channels,
            log_scale_min=hparams.log_scale_min,
            reduce=False)
        self.reduce_sum_op = P.ReduceSum()
        self.reduce_mean_op = P.ReduceMean()

    def construct(self, inputs, target, mask=None):
        # NOTE(review): mask defaults to None but is multiplied unconditionally,
        # so callers presumably always pass one - confirm.
        elementwise = self.discretized_mix_logistic_loss(inputs, target)
        masked_total = self.reduce_sum_op(elementwise * mask)
        return masked_total / self.reduce_sum_op(mask)
|
||||
|
||||
|
||||
class MixtureGaussianLoss(nn.Cell):
    """
    Masked mean of the mixture-of-Gaussians loss.

    Args:
        hparams: hyper-parameters providing ``quantize_channels`` and
            ``log_scale_min``.
    """

    def __init__(self, hparams):
        super().__init__()
        self.quantize_channels = hparams.quantize_channels
        self.log_scale_min = hparams.log_scale_min
        # Unreduced loss; the masked reduction is done in construct().
        self.mix_gaussian_loss = mix_gaussian_loss(reduce=False, log_scale_min=hparams.log_scale_min)
        self.reduce_sum_op = P.ReduceSum()
        self.reduce_mean_op = P.ReduceMean()

    def construct(self, inputs, target, mask=None):
        """
        Compute the loss summed over masked-in elements, divided by the mask sum.

        Args:
            inputs (Tensor): Predicted distribution
            target (Tensor): Target
            mask (Tensor): Mask

        Returns:
            Tensor: Loss tensor
        """
        elementwise = self.mix_gaussian_loss(inputs, target)
        masked_total = self.reduce_sum_op(elementwise * mask)
        return masked_total / self.reduce_sum_op(mask)
|
||||
|
||||
|
||||
def save_waveplot(path, y_hat, y_target, sample_rate):
    """
    Save a two-row PNG figure: target waveform on top, predicted waveform below.

    Args:
        path: destination passed to ``plt.savefig``.
        y_hat: predicted waveform.
        y_target: reference waveform.
        sample_rate: sampling rate forwarded to librosa as ``sr``.
    """
    plt.figure(figsize=(16, 6))
    for row, signal in enumerate((y_target, y_hat), start=1):
        plt.subplot(2, 1, row)
        librosa.display.waveplot(signal, sr=sample_rate)
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
|
||||
|
||||
|
||||
def eval_model(hparams, global_step, model, x, y, c, g, input_lengths, eval_dir):
    """
    Function for model evaluation. This function is used for debugging in this project.

    Picks one random item of the batch, generates it with
    ``model.incremental_forward`` and writes the predicted audio, the target
    audio and a waveform plot into ``eval_dir``.

    Args:
        hparams: hyper-parameters (input_type, quantize_channels, cin_pad, ...).
        global_step (int): step number used in the output file names.
        model: network exposing ``set_train`` and ``incremental_forward``.
        x: unused; kept for interface compatibility.
        y (Tensor): batch of target signals.
        c (Tensor or None): local conditioning features.
        g (Tensor or None): global conditioning (speaker) ids.
        input_lengths (Tensor): valid length of each batch item.
        eval_dir (str): output directory, created if missing.
    """

    model.set_train(False)
    idx = np.random.randint(0, len(y))
    length = input_lengths.asnumpy()[idx]
    y_target = np.reshape(y.asnumpy()[idx], (-1))
    y_target = y_target[:length]

    if c is not None:
        expand_op = P.ExpandDims()
        if hparams.upsample_conditional_features:
            # Features are at frame rate here: keep length / hop_size frames plus padding on both sides.
            c = expand_op(c[idx, :, :int(length // audio.get_hop_size() + hparams.cin_pad * 2)], 0)
        else:
            c = expand_op(c[idx, :, :length], 0)
        # `.dim()` / `.size()` are PyTorch-style calls; `shape` reports what the message promises.
        assert len(c.shape) == 3
        print("Shape of local conditioning features: {}".format(c.shape))

    if g is not None:
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.shape))

    # Dummy silence used as the very first input sample.
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P1.mulaw_quantize(0, hparams.quantize_channels - 1)
        # One-hot encode the silent class and shape it as (1, 1, quantize_channels).
        initial_input = to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Tensor(np.reshape(initial_input, (1, 1, hparams.quantize_channels)))
    else:
        if is_mulaw(hparams.input_type):
            initial_value = P1.mulaw(0.0, hparams.quantize_channels)
        else:
            initial_value = 0.0
        initial_input = Tensor(np.ones((1, 1, 1)) * initial_value)

    # Run the model in fast eval mode
    y_hat = model.incremental_forward(initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm,
                                      log_scale_min=hparams.log_scale_min)

    # Convert prediction and target back to the waveform domain.
    if is_mulaw_quantize(hparams.input_type):
        y_hat = np.reshape(np.argmax(y_hat, 1), (-1))
        y_hat = P1.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
        y_target = P1.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = P1.inv_mulaw(np.reshape(y_hat, (-1)), hparams.quantize_channels)
        y_target = P1.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = np.reshape(y_hat, (-1))

    # Save audio
    # NOTE(review): `librosa.output.write_wav` appears to be absent from recent librosa
    # releases - confirm the pinned librosa version before relying on this path.
    os.makedirs(eval_dir, exist_ok=True)
    path = os.path.join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)

    path = os.path.join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # Save figure
    path = os.path.join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target, hparams.sample_rate)
|
||||
|
||||
|
||||
class PredictNet(nn.Cell):
    """
    Inference wrapper around a network, used when exporting the model.

    Args:
        network (Cell): network called as ``network(x, c, g, False)``.
    """

    def __init__(self, network):
        super().__init__(auto_prefix=False)
        self.network = network

    def construct(self, x, c, g):
        # The trailing False is passed positionally; its meaning is defined by the wrapped network.
        return self.network(x, c, g, False)
|
||||
|
||||
|
||||
class NetWithLossClass(nn.Cell):
    """
    Couples the network with the criterion selected from the hyper-parameters.

    Args:
        network (Cell): Pre-defined WaveNet.
        hparams (optional): Parameters.

    Returns:
        Tensor, loss tensor.
    """

    def __init__(self, network, hparams):
        super().__init__(auto_prefix=False)
        self.network = network
        self.hparams = hparams
        self.ReduceMean_false = P.ReduceMean(keep_dims=False)
        self.expand_op = P.ExpandDims()
        self.transpose_op = P.Transpose()
        self.reshape_op = P.Reshape()
        self.is_mulaw_quant = is_mulaw_quantize(hparams.input_type)

        # Categorical output -> cross entropy; otherwise pick the mixture loss by name.
        if self.is_mulaw_quant:
            self.criterion = MaskedCrossEntropyLoss()
        elif hparams.output_distribution == "Logistic":
            self.criterion = DiscretizedMixturelogisticLoss(hparams)
        elif hparams.output_distribution == "Normal":
            self.criterion = MixtureGaussianLoss(hparams)
        else:
            self.criterion = None
            raise RuntimeError(
                "Not supported output distribution type: {}".format(hparams.output_distribution))

    def construct(self, x, y, c, g, input_lengths, mask):
        y_hat = self.network(x, c, g, False)
        # Prediction at step t is scored against the target at step t + 1,
        # hence the [:-1] / [1:] slices below.
        if self.is_mulaw_quant:
            logits = self.transpose_op(y_hat[:, :, :-1], (0, 2, 1))
            logits = self.reshape_op(logits, (-1, logits.shape[-1]))
            labels = self.reshape_op(y[:, 1:, 0], (-1,))
            loss = self.criterion(logits, labels)
        else:
            loss = self.criterion(y_hat[:, :, :-1], y[:, 1:, :], mask[:, 1:, :])
        return loss
|
@ -0,0 +1,41 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""learning rate generator"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
def get_lr(init_lr, total_epoch, step_per_epoch,
           anneal_rate=0.5,
           anneal_interval=200000):
    """
    Generate a step-wise annealed learning-rate schedule.

    The rate starts at ``init_lr`` and is multiplied by ``anneal_rate`` once
    every ``anneal_interval`` steps.

    Args:
        init_lr (float): Initial learning rate
        total_epoch (int): Total epoch
        step_per_epoch (int): Step per epoch
        anneal_rate (float): anneal rate
        anneal_interval (int): anneal interval

    Returns:
        ndarray: float32 array with one learning rate per training step
        (``total_epoch * step_per_epoch`` entries).
    """
    step_count = total_epoch * step_per_epoch
    schedule = [init_lr * anneal_rate ** (step // anneal_interval) for step in range(step_count)]
    return np.array(schedule).astype(np.float32)
|
@ -0,0 +1,135 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""train wavenet."""
|
||||
import os
|
||||
from os.path import join
|
||||
import json
|
||||
import argparse
|
||||
from warnings import warn
|
||||
from hparams import hparams, hparams_debug_string
|
||||
|
||||
from mindspore import context, Tensor
|
||||
from mindspore.context import ParallelMode
|
||||
from mindspore.communication.management import init, get_rank, get_group_size
|
||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
|
||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||
from mindspore.nn.optim import Adam
|
||||
from mindspore.nn import TrainOneStepCell
|
||||
from mindspore.train import Model
|
||||
from src.lr_generator import get_lr
|
||||
from src.dataset import get_data_loaders
|
||||
from src.loss import NetWithLossClass
|
||||
from src.callback import Monitor
|
||||
from wavenet_vocoder import WaveNet
|
||||
from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
|
||||
|
||||
# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='TTS training')
parser.add_argument('--data_path', type=str, required=True, default='',
                    help='Directory contains preprocessed features.')
parser.add_argument('--preset', type=str, required=True, default='', help='Path of preset parameters (json).')
parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints_test',
                    help='Directory where to save model checkpoints [default: checkpoints].')
parser.add_argument('--checkpoint', type=str, default='', help='Restore model from checkpoint path if given.')
parser.add_argument('--speaker_id', type=str, default='',
                    help=' Use specific speaker of data in case for multi-speaker datasets.')
parser.add_argument('--is_distributed', action="store_true", default=False, help='Distributed training')
args = parser.parse_args()

if __name__ == '__main__':
    if args.is_distributed:
        init('nccl')
        rank_id = get_rank()
        group_size = get_group_size()
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
        rank_id = 0
        group_size = 1

    # NOTE(review): this parsed id is never used; the raw string args.speaker_id is what
    # reaches get_data_loaders below - confirm which form the loader expects.
    speaker_id = int(args.speaker_id) if args.speaker_id != '' else None
    if args.preset is not None:
        with open(args.preset) as f:
            hparams.parse_json(f.read())

    assert hparams.name == "wavenet_vocoder"
    print(hparams_debug_string())
    fs = hparams.sample_rate
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    # Keep a copy of the effective hyper-parameters next to the checkpoints.
    output_json_path = join(args.checkpoint_dir, "hparams.json")
    with open(output_json_path, "w") as f:
        json.dump(hparams.values(), f, indent=2)

    data_loaders = get_data_loaders(args.data_path, args.speaker_id, hparams=hparams, rank_id=rank_id,
                                    group_size=group_size)
    step_size_per_epoch = data_loaders.get_dataset_size()

    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    # The upsampling sub-network receives the conditioning settings through its own dict.
    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )
    loss_net = NetWithLossClass(model, hparams)
    lr = get_lr(hparams.optimizer_params["lr"], hparams.nepochs, step_size_per_epoch)
    lr = Tensor(lr)

    if args.checkpoint != '':
        # The restore path comes from --checkpoint; the previously referenced
        # args.pre_trained_model_path was never declared and raised AttributeError.
        param_dict = load_checkpoint(args.checkpoint)
        load_param_into_net(model, param_dict)
        print('Successfully loading the pre-trained model')

    weights = model.trainable_params()
    # NOTE(review): loss_scale=1024 is given to the optimizer while TrainOneStepCell keeps
    # its default sensitivity - confirm the two are meant to be paired this way.
    optimizer = Adam(weights, learning_rate=lr, loss_scale=1024.)
    train_net = TrainOneStepCell(loss_net, optimizer)

    # From here on `model` is the high-level training wrapper, not the WaveNet cell.
    model = Model(train_net)
    lr_cb = Monitor(lr)
    callback_list = [lr_cb]
    if args.is_distributed:
        # One checkpoint sub-directory per rank.
        ckpt_path = os.path.join(args.checkpoint_dir, 'ckpt_' + str(get_rank()) + '/')
    else:
        ckpt_path = args.checkpoint_dir
    # Save once per epoch and keep at most the ten latest checkpoints.
    config_ck = CheckpointConfig(save_checkpoint_steps=step_size_per_epoch, keep_checkpoint_max=10)
    ckpt_cb = ModelCheckpoint(prefix='wavenet', directory=ckpt_path, config=config_ck)
    callback_list.append(ckpt_cb)
    model.train(hparams.nepochs, data_loaders, callbacks=callback_list)
|
@ -0,0 +1,17 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""init"""
|
||||
from __future__ import with_statement, print_function, absolute_import
|
||||
from .wavenet import WaveNet
|
@ -0,0 +1,176 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Extended Conv1D."""
|
||||
|
||||
import math
|
||||
from mindspore import nn, Tensor
|
||||
from mindspore.ops import operations as P
|
||||
import mindspore.common.dtype as mstype
|
||||
import numpy as np
|
||||
|
||||
class Conv1d(nn.Conv1d):
|
||||
"""
|
||||
Extended nn.Conv1d to adapt to incremental dilated convolutions.
|
||||
During training, initial Conv1D is used and during evaluation, incremental_forward is called.
|
||||
To improve the inference speed, tensor will be converted as numpy and the following calculation is based on numpy.
|
||||
These operation will be replaced with MindSpore ops in the future. Currently, some operation is not supported by
|
||||
MindSpore and a mixed use of numpy and MindSpore will take a long time.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(Conv1d, self).__init__(*args, **kwargs)
|
||||
self.clear_buffer()
|
||||
self._linearized_weight = None
|
||||
self.transpose_op = P.Transpose()
|
||||
self.reshape_op = P.Reshape()
|
||||
self.squeeze_op = P.Squeeze(-2)
|
||||
self.zeros = P.Zeros()
|
||||
self.concat_op = P.Concat(axis=1)
|
||||
self.matmul = P.MatMul(transpose_b=True)
|
||||
self.bias_add = P.BiasAdd()
|
||||
self.get_weight = None
|
||||
self.get_bias = None
|
||||
|
||||
def incremental_forward(self, inputs, is_numpy=True):
|
||||
if is_numpy:
|
||||
return self.incremental_forward_numpy(inputs)
|
||||
return self.incremental_forward_pynative(inputs)
|
||||
|
||||
def incremental_forward_pynative(self, inputs):
|
||||
"""
|
||||
Incremental forward.
|
||||
|
||||
Args:
|
||||
inputs: B x T x C
|
||||
|
||||
Returns:
|
||||
ndarray
|
||||
|
||||
"""
|
||||
# input: (B, T, C)
|
||||
if self.training:
|
||||
raise RuntimeError('incremental_forward only supports eval mode')
|
||||
|
||||
if self.get_weight is None:
|
||||
self.get_weight = self._get_linearized_weight()
|
||||
|
||||
if self.get_bias is None and self.bias is not None:
|
||||
self.get_bias = self.bias
|
||||
|
||||
# Note mindspore uses Conv2D to construct Conv1D
|
||||
kw = self.kernel_size[1]
|
||||
dilation = self.dilation[1]
|
||||
|
||||
bsz = inputs.shape[0] # input: bsz x len x dim
|
||||
if kw > 1:
|
||||
if self.input_buffer is None:
|
||||
init_buffer = self.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), mstype.float32)
|
||||
self.input_buffer = self.concat_op((init_buffer[:, 1:, :], inputs[:, 0:1, :]))
|
||||
else:
|
||||
# shift buffer
|
||||
self.input_buffer = self.concat_op((self.input_buffer[:, 1:, :], inputs[:, 0:1, :]))
|
||||
inputs = self.input_buffer
|
||||
if dilation > 1:
|
||||
inputs = inputs[:, 0::dilation, :]
|
||||
|
||||
output = self.matmul(self.reshape_op(inputs, (bsz, -1)), self.get_weight)
|
||||
if self.bias is not None:
|
||||
output = self.bias_add(output, self.bias)
|
||||
return self.reshape_op(output, (bsz, 1, -1))
|
||||
|
||||
def incremental_forward_numpy(self, inputs):
|
||||
"""
|
||||
Incremental forward.
|
||||
|
||||
Args:
|
||||
inputs: B x T x C
|
||||
|
||||
Returns:
|
||||
ndarray
|
||||
|
||||
"""
|
||||
# input: (B, T, C)
|
||||
if self.training:
|
||||
raise RuntimeError('incremental_forward only supports eval mode')
|
||||
|
||||
if self.get_weight is None:
|
||||
weight = self._get_linearized_weight()
|
||||
self.get_weight = weight.asnumpy()
|
||||
|
||||
if self.get_bias is None and self.bias is not None:
|
||||
bias = self.bias
|
||||
self.get_bias = bias.asnumpy()
|
||||
|
||||
# Note mindspore uses Conv2D to construct Conv1D
|
||||
kw = self.kernel_size[1]
|
||||
dilation = self.dilation[1]
|
||||
|
||||
bsz = inputs.shape[0] # input: bsz x len x dim
|
||||
if kw > 1:
|
||||
if self.input_buffer is None:
|
||||
self.input_buffer = np.zeros((bsz, kw + (kw - 1) * (dilation - 1), inputs.shape[2]), dtype=np.float32)
|
||||
else:
|
||||
# shift buffer
|
||||
self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :]
|
||||
# append next
|
||||
self.input_buffer[:, -1, :] = inputs[:, -1, :]
|
||||
inputs = self.input_buffer
|
||||
if dilation > 1:
|
||||
inputs = inputs[:, 0::dilation, :]
|
||||
output = inputs.reshape(bsz, -1).dot(self.get_weight.T)
|
||||
if self.bias is not None:
|
||||
output = output + np.expand_dims(self.get_bias, 0)
|
||||
return np.reshape(output, (bsz, 1, -1))
|
||||
|
||||
def clear_buffer(self):
    """Drop the incremental-inference input buffer by resetting it to None."""
    self.input_buffer = None
|
||||
|
||||
def _get_linearized_weight(self):
    """
    get linearized weight

    Builds the (out_channels, -1) weight matrix on first use and caches it in
    ``self._linearized_weight``; later calls return the cached value without
    touching ``self.weight`` again.

    Returns:
        The cached linearized weight.
    """
    if self._linearized_weight is None:
        # Squeeze only on a cache miss; the result is not needed otherwise.
        weight = self.squeeze_op(self.weight)
        # Note mindspore uses Conv2D to construct Conv1D
        kw = self.kernel_size[1]
        if weight.shape == (self.out_channels, self.in_channels, kw):
            weight = self.transpose_op(weight, (0, 2, 1))
        else:
            weight = self.transpose_op(weight, (2, 0, 1))
        self._linearized_weight = self.reshape_op(weight, (self.out_channels, -1))
    return self._linearized_weight
|
||||
|
||||
def _clear_linearized_weight(self, *args):
    """Invalidate the cached linearized weight; any positional arguments are ignored."""
    self._linearized_weight = None
|
||||
|
||||
def _initialize_weights(self):
    """
    weight initialization

    After ``init_parameters_data()``, every ``nn.Conv1d`` sub-cell gets
    normally distributed weights with
    ``std = sqrt(4.0 * 0.1 / (kernel_size[1] * self.in_channels))`` and a
    zero bias (when it has one). Every ``nn.BatchNorm2d`` sub-cell gets
    gamma set to ones and beta set to zeros.
    """
    self.init_parameters_data()
    std_mul = 4.0
    for _, cell in self.cells_and_names():
        if isinstance(cell, nn.Conv1d):
            denominator = cell.kernel_size[1] * self.in_channels
            std = math.sqrt((std_mul * 0.1) / denominator)
            sampled = np.random.normal(0, std, cell.weight.data.shape).astype("float32")
            cell.weight.set_data(Tensor(sampled))
            if cell.bias is not None:
                zero_bias = np.zeros(cell.bias.data.shape, dtype="float32")
                cell.bias.set_data(Tensor(zero_bias))
            continue
        if isinstance(cell, nn.BatchNorm2d):
            unit_gamma = np.ones(cell.gamma.data.shape, dtype="float32")
            cell.gamma.set_data(Tensor(unit_gamma))
            zero_beta = np.zeros(cell.beta.data.shape, dtype="float32")
            cell.beta.set_data(Tensor(zero_beta))
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,213 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
modules for wavenet
|
||||
"""
|
||||
from __future__ import with_statement, print_function, absolute_import
|
||||
import math
|
||||
import numpy as np
|
||||
from wavenet_vocoder import conv
|
||||
from mindspore import nn
|
||||
from mindspore.ops import operations as P
|
||||
|
||||
|
||||
def Conv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """
    Build a ``conv.Conv1d`` layer.

    ``dropout`` is accepted but not used here; the remaining keyword
    arguments are forwarded unchanged to ``conv.Conv1d``.
    """
    return conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs)
|
||||
|
||||
|
||||
def Conv1d1x1(in_channels, out_channels, has_bias=True):
    """Build a kernel-size-1 ``Conv1d`` with no padding and dilation 1."""
    pointwise_options = dict(
        kernel_size=1,
        pad_mode='pad',
        padding=0,
        dilation=1,
        has_bias=has_bias,
    )
    return Conv1d(in_channels, out_channels, **pointwise_options)
|
||||
|
||||
|
||||
def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01):
    """
    Build an ``nn.Embedding`` layer.

    ``std`` is accepted but not used here.
    """
    return nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
|
||||
|
||||
|
||||
def _conv1x1_forward(conv_, x, is_incremental, is_numpy=True):
    """
    Conv1x1 forward

    Dispatches to ``conv_.incremental_forward`` (passing ``is_numpy`` through)
    when ``is_incremental`` is truthy, otherwise calls ``conv_`` directly.
    """
    if not is_incremental:
        return conv_(x)
    return conv_.incremental_forward(x, is_numpy=is_numpy)
|
||||
|
||||
|
||||
class ResidualConv1dGLU(nn.Cell):
    """Residual dilated conv1d with gated activation units

    Args:
        residual_channels (int): Residual input / output channels
        gate_channels (int): Gated activation channels.
        kernel_size (int): Kernel size
        skip_out_channels (int): Skip connection channels. If None, it will set to the same as residual_channels.
        bias (bool): Passed as ``has_bias`` to the dilated, output and skip convolutions.
        cin_channels (int): Local conditioning channels. Local conditioning is enabled only for a positive value.
        gin_channels (int): Global conditioning channels. Global conditioning is enabled only for a positive value.
        dropout (float): Dropout rate.
        padding (int): Padding for convolution layers. If None, padding value will be computed according to dilation
            and kernel_size.
        dilation (int): Dilation factor.
        causal (bool): Selects the causal padding formula when ``padding`` is None, and makes ``construct``
            trim the convolution output back to the input length.

    """

    def __init__(self, residual_channels=None, gate_channels=None, kernel_size=None, skip_out_channels=None, bias=True,
                 dropout=1 - 0.95, dilation=1, cin_channels=-1, gin_channels=-1, padding=None, causal=True):
        super(ResidualConv1dGLU, self).__init__()
        self.dropout = dropout
        self.dropout_op = nn.Dropout(keep_prob=1. - self.dropout)
        # Incremental inference splits along the last axis; training splits along axis 1.
        self.eval_split_op = P.Split(axis=-1, output_num=2)
        self.train_split_op = P.Split(axis=1, output_num=2)
        self.tanh = P.Tanh()
        self.sigmoid = P.Sigmoid()
        self.mul = P.Mul()
        self.add = P.TensorAdd()

        if skip_out_channels is None:
            skip_out_channels = residual_channels
        if padding is None:
            if causal:
                padding = (kernel_size - 1) * dilation
            else:
                padding = (kernel_size - 1) // 2 * dilation
        self.causal = causal

        self.conv = Conv1d(residual_channels, gate_channels, kernel_size, pad_mode='pad',
                           padding=padding, dilation=dilation, has_bias=bias)

        # local conditioning
        if cin_channels > 0:
            self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, has_bias=False)
        else:
            self.conv1x1c = None

        # global conditioning
        if gin_channels > 0:
            self.conv1x1g = Conv1d(gin_channels, gate_channels, has_bias=False, kernel_size=1, dilation=1)
        else:
            self.conv1x1g = None

        # The gate is split into two halves, so the 1x1 convs see half the gate channels.
        gate_out_channels = gate_channels // 2
        self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, has_bias=bias)
        self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, has_bias=bias)
        self.factor = math.sqrt(0.5)

    def construct(self, x, c=None, g=None):
        """
        Forward pass used for training / full-sequence evaluation.

        Args:
            x(Tensor): One-hot audio signal, the shape is B x C x T
            c(Tensor): local conditional feature, the shape is B x cin_channels x T
            g(Tensor): global conditional feature

        Returns:
            Tuple (x, s): the residual output and the skip-connection output.

        """

        residual = x
        x = self.dropout_op(x)
        x = self.conv(x)
        # remove future time steps
        x = x[:, :, :residual.shape[-1]] if self.causal else x
        split_op = self.train_split_op

        a, b = split_op(x)

        # local conditioning
        if c is not None:
            c = _conv1x1_forward(self.conv1x1c, c, is_incremental=False)
            ca, cb = split_op(c)
            a, b = a + ca, b + cb

        # global conditioning
        if g is not None:
            g = _conv1x1_forward(self.conv1x1g, g, is_incremental=False)
            # Use the same split op as the other branches; `self.split` was never defined.
            ga, gb = split_op(g)
            a, b = a + ga, b + gb

        x = self.mul(self.tanh(a), self.sigmoid(b))

        # For skip connection
        s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=False)

        # For residual connection
        x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=False)

        x = self.add(x, residual) * self.factor
        return x, s

    def sigmoid_numpy(self, x):
        """Logistic sigmoid computed with NumPy."""
        return 1. / (1 + np.exp(-x))

    def incremental_forward(self, x, c=None, g=None, is_numpy=True):
        """
        Incremental forward. Used for inference stage

        The gate halves are split along the last axis here (unlike ``construct``).

        Args:
            x (Tensor): audio signal input for the current step
            c (Tensor): local conditional feature
            g (Tensor): global conditional feature
            is_numpy (bool): If True, compute with NumPy; otherwise use the MindSpore ops.

        Returns:
            Tuple (x, s): the residual output and the skip-connection output.
        """
        residual = x
        x = self.conv.incremental_forward(x, is_numpy=is_numpy)
        if is_numpy:
            a, b = np.split(x, indices_or_sections=2, axis=-1)
        else:
            a, b = self.eval_split_op(x)

        # local conditioning
        if c is not None:
            c = _conv1x1_forward(self.conv1x1c, c, is_incremental=True, is_numpy=is_numpy)
            if is_numpy:
                ca, cb = np.split(c, indices_or_sections=2, axis=-1)
            else:
                ca, cb = self.eval_split_op(c)
            a, b = a + ca, b + cb

        # global conditioning
        if g is not None:
            g = _conv1x1_forward(self.conv1x1g, g, is_incremental=True, is_numpy=is_numpy)
            if is_numpy:
                ga, gb = np.split(g, indices_or_sections=2, axis=-1)
            else:
                # Split the global feature `g` (the original split `c` here by mistake).
                ga, gb = self.eval_split_op(g)
            a, b = a + ga, b + gb

        if is_numpy:
            x = np.tanh(a) * self.sigmoid_numpy(b)
        else:
            x = self.mul(self.tanh(a), self.sigmoid(b))

        # For skip connection
        s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental=True, is_numpy=is_numpy)

        # For residual connection
        x = _conv1x1_forward(self.conv1x1_out, x, is_incremental=True, is_numpy=is_numpy)

        x = (x + residual) * self.factor
        return x, s

    def clear_buffer(self):
        """clear buffer"""
        for c in [self.conv, self.conv1x1_out, self.conv1x1_skip,
                  self.conv1x1c, self.conv1x1g]:
            if c is not None:
                c.clear_buffer()
|
@ -0,0 +1,118 @@
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
Upsampling
|
||||
|
||||
"""
|
||||
from __future__ import with_statement, print_function, absolute_import
|
||||
import numpy as np
|
||||
from mindspore import nn
|
||||
from mindspore.ops import operations as P
|
||||
|
||||
|
||||
class Resize(nn.Cell):
    """
    Resize input Tensor

    Scales the last two axes of a 4-D input by ``y_scale`` and ``x_scale``
    using ``P.ResizeNearestNeighbor``. ``mode`` is stored but not consulted
    by ``construct``.
    """

    def __init__(self, x_scale, y_scale, mode="nearest"):
        super(Resize, self).__init__()
        self.mode = mode
        self.y_scale = y_scale
        self.x_scale = x_scale

    def construct(self, x):
        _, _, height, width = x.shape
        target_size = (self.y_scale * height, self.x_scale * width)
        return P.ResizeNearestNeighbor(target_size)(x)
|
||||
|
||||
|
||||
def _get_activation(upsample_activation):
    """get activation: look up the attribute named ``upsample_activation`` on ``nn``."""
    return getattr(nn, upsample_activation)
|
||||
|
||||
|
||||
class UpsampleNetwork(nn.Cell):
    """UpsampleNetwork

    Builds one (Resize, Conv2d) pair per entry of ``upsample_scales`` and
    applies them in order.

    Args:
        upsample_scales (list): Scale used for each Resize / Conv2d pair.
        mode (str): Forwarded to ``Resize``.
        freq_axis_kernel_size (int): First dimension of each Conv2d kernel.
        cin_pad (int): Only used to compute ``self.indent``.
        cin_channels (int): Accepted but not used by this class.
    """

    def __init__(self, upsample_scales, mode="nearest",
                 freq_axis_kernel_size=1, cin_pad=0, cin_channels=80):
        super(UpsampleNetwork, self).__init__()
        self.expand_op = P.ExpandDims()
        self.squeeze_op = P.Squeeze(1)
        self.indent = cin_pad * np.prod(upsample_scales)

        freq_pad = (freq_axis_kernel_size - 1) // 2
        layers = []
        for factor in upsample_scales:
            resize = Resize(factor, 1, mode)
            smooth = nn.Conv2d(
                1, 1,
                kernel_size=(freq_axis_kernel_size, factor * 2 + 1),
                has_bias=False,
                pad_mode='pad',
                padding=(freq_pad, freq_pad, factor, factor),
            )
            layers.extend((resize, smooth))
            # NOTE: no activation layer is appended after the convolution.
        self.up_layers = nn.CellList(layers)

    def construct(self, c):
        """
        Apply every resize / convolution layer in order.

        Args:
            c (Tensor): Local conditioning feature

        Returns:
            Tensor: Upsampling feature

        """
        # Insert a singleton axis at position 1 for the 2-D layers.
        upsampled = self.expand_op(c, 1)
        for layer in self.up_layers:
            upsampled = layer(upsampled)
        # Remove axis 1 again. NOTE: ``self.indent`` cropping is not applied.
        return self.squeeze_op(upsampled)
|
||||
|
||||
|
||||
class ConvInUpsampleNetwork(nn.Cell):
    """Upsample Network

    A bias-free ``nn.Conv1d`` with kernel size ``2 * cin_pad + 1`` followed by
    an ``UpsampleNetwork``.

    Args:
        upsample_scales (list): Upsample_scales list.
        mode (str): Resize mode, forwarded to ``UpsampleNetwork``.
        freq_axis_kernel_size (int): Freq-axis kernel_size for the convolution layers after resize.
        cin_pad (int): Determines the input convolution kernel size.
        cin_channels (int): Local conditioning channels.

    """

    def __init__(self, upsample_scales, mode="nearest",
                 freq_axis_kernel_size=1, cin_pad=0,
                 cin_channels=80):
        super(ConvInUpsampleNetwork, self).__init__()
        self.conv_in = nn.Conv1d(
            cin_channels,
            cin_channels,
            kernel_size=2 * cin_pad + 1,
            has_bias=False,
            pad_mode='pad',
            padding=0,
        )
        # The inner network is always built with cin_pad=0.
        self.upsample = UpsampleNetwork(
            upsample_scales, mode, freq_axis_kernel_size,
            cin_pad=0, cin_channels=cin_channels,
        )

    def construct(self, c):
        return self.upsample(self.conv_in(c))
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue