# Copyright 2021 Tomoki Hayashi
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Parallel WaveGAN Modules.
This code is modified from https://github.com/kan-bayashi/ParallelWaveGAN.
"""
import logging
import math
from typing import Any, Dict, Optional
import numpy as np
import torch
from espnet2.gan_tts.parallel_wavegan import upsample
from espnet2.gan_tts.wavenet.residual_block import Conv1d, Conv1d1x1, ResidualBlock
class ParallelWaveGANGenerator(torch.nn.Module):
"""Parallel WaveGAN Generator module."""
def __init__(
self,
in_channels: int = 1,
out_channels: int = 1,
kernel_size: int = 3,
layers: int = 30,
stacks: int = 3,
residual_channels: int = 64,
gate_channels: int = 128,
skip_channels: int = 64,
aux_channels: int = 80,
aux_context_window: int = 2,
dropout_rate: float = 0.0,
bias: bool = True,
use_weight_norm: bool = True,
upsample_conditional_features: bool = True,
upsample_net: str = "ConvInUpsampleNetwork",
upsample_params: Dict[str, Any] = {"upsample_scales": [4, 4, 4, 4]},
):
"""Initialize ParallelWaveGANGenerator module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
kernel_size (int): Kernel size of dilated convolution.
layers (int): Number of residual block layers.
            stacks (int): Number of stacks, i.e., dilation cycles.
residual_channels (int): Number of channels in residual conv.
gate_channels (int): Number of channels in gated conv.
skip_channels (int): Number of channels in skip conv.
aux_channels (int): Number of channels for auxiliary feature conv.
aux_context_window (int): Context window size for auxiliary feature.
dropout_rate (float): Dropout rate. 0.0 means no dropout applied.
bias (bool): Whether to use bias parameter in conv layer.
use_weight_norm (bool): Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
upsample_conditional_features (bool): Whether to use upsampling network.
upsample_net (str): Upsampling network architecture.
upsample_params (Dict[str, Any]): Upsampling network parameters.
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.aux_channels = aux_channels
self.aux_context_window = aux_context_window
self.layers = layers
self.stacks = stacks
self.kernel_size = kernel_size
# check the number of layers and stacks
assert layers % stacks == 0
layers_per_stack = layers // stacks
# define first convolution
self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)
# define conv + upsampling network
if upsample_conditional_features:
if upsample_net == "ConvInUpsampleNetwork":
upsample_params.update(
{
"aux_channels": aux_channels,
"aux_context_window": aux_context_window,
}
)
self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
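            # NOTE: the total upsampling factor (256 with the default scales) is
            # expected to match the hop size used to extract the auxiliary features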
self.upsample_factor = int(np.prod(upsample_params["upsample_scales"]))
else:
self.upsample_net = None
            # no upsampling: the conditioning features are assumed to already be
            # at the waveform resolution
            self.upsample_factor = 1
# define residual blocks
self.conv_layers = torch.nn.ModuleList()
for layer in range(layers):
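            # with the defaults (layers=30, stacks=3), layers_per_stack is 10 and
            # the dilation cycles through 1, 2, 4, ..., 512 within each stack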
dilation = 2 ** (layer % layers_per_stack)
conv = ResidualBlock(
kernel_size=kernel_size,
residual_channels=residual_channels,
gate_channels=gate_channels,
skip_channels=skip_channels,
aux_channels=aux_channels,
dilation=dilation,
dropout_rate=dropout_rate,
bias=bias,
scale_residual=True,
)
self.conv_layers += [conv]
# define output layers
self.last_conv_layers = torch.nn.ModuleList(
[
torch.nn.ReLU(),
Conv1d1x1(skip_channels, skip_channels, bias=True),
torch.nn.ReLU(),
Conv1d1x1(skip_channels, out_channels, bias=True),
]
)
# apply weight norm
if use_weight_norm:
self.apply_weight_norm()
# NOTE(kan-bayashi): register pre hook function for the compatibility with
# parallel_wavegan repo
self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook)
    def forward(
self, c: torch.Tensor, z: Optional[torch.Tensor] = None
) -> torch.Tensor:
"""Calculate forward propagation.
Args:
            c (Tensor): Local conditioning auxiliary features (B, C, T_feats).
            z (Optional[Tensor]): Input noise signal (B, 1, T_wav). If not
                provided, noise is randomly sampled.
        Returns:
            Tensor: Output tensor (B, out_channels, T_wav).
"""
if z is None:
b, _, t = c.size()
z = torch.randn(b, 1, t * self.upsample_factor).to(
device=c.device, dtype=c.dtype
)
# perform upsampling
if self.upsample_net is not None:
c = self.upsample_net(c)
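            # after upsampling, the conditioning features must align with the
            # noise signal sample-by-sample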
assert c.size(-1) == z.size(-1)
# encode to hidden representation
x = self.first_conv(z)
skips = 0
for f in self.conv_layers:
x, h = f(x=x, x_mask=None, c=c)
skips += h
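        # scale the sum of skip connections by 1/sqrt(num_layers) so that its
        # variance stays roughly independent of the number of layers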
skips *= math.sqrt(1.0 / len(self.conv_layers))
# apply final layers
x = skips
for f in self.last_conv_layers:
x = f(x)
return x
    def remove_weight_norm(self):
"""Remove weight normalization module from all of the layers."""
def _remove_weight_norm(m: torch.nn.Module):
try:
logging.debug(f"Weight norm is removed from {m}.")
torch.nn.utils.remove_weight_norm(m)
except ValueError: # this module didn't have weight norm
return
self.apply(_remove_weight_norm)
    def apply_weight_norm(self):
"""Apply weight normalization module from all of the layers."""
def _apply_weight_norm(m: torch.nn.Module):
if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
torch.nn.utils.weight_norm(m)
logging.debug(f"Weight norm is applied to {m}.")
self.apply(_apply_weight_norm)
@staticmethod
def _get_receptive_field_size(
layers, stacks, kernel_size, dilation=lambda x: 2**x
):
assert layers % stacks == 0
layers_per_cycle = layers // stacks
dilations = [dilation(i % layers_per_cycle) for i in range(layers)]
return (kernel_size - 1) * sum(dilations) + 1
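    # NOTE: with the defaults (layers=30, stacks=3, kernel_size=3), each 10-layer
    # cycle has dilations 1, 2, ..., 512 summing to 1023, so sum(dilations) is
    # 3 * 1023 = 3069 and the receptive field is (3 - 1) * 3069 + 1 = 6139 samples.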
@property
def receptive_field_size(self):
"""Return receptive field size."""
return self._get_receptive_field_size(
self.layers, self.stacks, self.kernel_size
)
    def inference(
self, c: torch.Tensor, z: Optional[torch.Tensor] = None
) -> torch.Tensor:
"""Perform inference.
Args:
            c (Tensor): Local conditioning auxiliary features (T_feats, C).
            z (Optional[Tensor]): Input noise signal (T_wav, 1).
        Returns:
            Tensor: Output tensor (T_wav, out_channels).
"""
if z is not None:
z = z.transpose(1, 0).unsqueeze(0)
c = c.transpose(1, 0).unsqueeze(0)
return self.forward(c, z).squeeze(0).transpose(1, 0)
def _load_state_dict_pre_hook(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
"""Apply pre hook function before loading state dict."""
keys = list(state_dict.keys())
for k in keys:
if "conv1x1_skip" in k.replace(prefix, ""):
v_skip = state_dict.pop(k)
v_out = state_dict[k.replace("skip", "out")]
state_dict[k.replace("skip", "out")] = torch.cat([v_out, v_skip], dim=0)
class ParallelWaveGANDiscriminator(torch.nn.Module):
"""Parallel WaveGAN Discriminator module."""
def __init__(
self,
in_channels: int = 1,
out_channels: int = 1,
kernel_size: int = 3,
layers: int = 10,
conv_channels: int = 64,
dilation_factor: int = 1,
nonlinear_activation: str = "LeakyReLU",
nonlinear_activation_params: Dict[str, Any] = {"negative_slope": 0.2},
bias: bool = True,
use_weight_norm: bool = True,
):
"""Initialize ParallelWaveGANDiscriminator module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
            kernel_size (int): Kernel size of conv layers.
            layers (int): Number of conv layers.
            conv_channels (int): Number of channels in conv layers.
            dilation_factor (int): Dilation factor. For example, if dilation_factor = 2,
                the dilation will be 1, 2, 4, 8, ..., and so on.
nonlinear_activation (str): Nonlinear function after each conv.
            nonlinear_activation_params (Dict[str, Any]): Nonlinear function parameters.
            bias (bool): Whether to use bias parameter in conv.
            use_weight_norm (bool): Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
"""
super().__init__()
        assert (kernel_size - 1) % 2 == 0, "Even kernel size is not supported."
assert dilation_factor > 0, "Dilation factor must be > 0."
self.conv_layers = torch.nn.ModuleList()
conv_in_channels = in_channels
for i in range(layers - 1):
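            # the first layer uses dilation 1; afterwards the dilation grows either
            # linearly (dilation_factor == 1) or exponentially (dilation_factor**i)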
if i == 0:
dilation = 1
else:
dilation = i if dilation_factor == 1 else dilation_factor**i
conv_in_channels = conv_channels
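            # "same" padding so that the time resolution is preserved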
padding = (kernel_size - 1) // 2 * dilation
conv_layer = [
Conv1d(
conv_in_channels,
conv_channels,
kernel_size=kernel_size,
padding=padding,
dilation=dilation,
bias=bias,
),
getattr(torch.nn, nonlinear_activation)(
inplace=True, **nonlinear_activation_params
),
]
self.conv_layers += conv_layer
padding = (kernel_size - 1) // 2
last_conv_layer = Conv1d(
conv_in_channels,
out_channels,
kernel_size=kernel_size,
padding=padding,
bias=bias,
)
self.conv_layers += [last_conv_layer]
# apply weight norm
if use_weight_norm:
self.apply_weight_norm()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Calculate forward propagation.
Args:
            x (Tensor): Input waveform tensor (B, 1, T).
Returns:
Tensor: Output tensor (B, 1, T).
"""
for f in self.conv_layers:
x = f(x)
return x
    def apply_weight_norm(self):
"""Apply weight normalization module from all of the layers."""
def _apply_weight_norm(m: torch.nn.Module):
if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
torch.nn.utils.weight_norm(m)
logging.debug(f"Weight norm is applied to {m}.")
self.apply(_apply_weight_norm)
    def remove_weight_norm(self):
"""Remove weight normalization module from all of the layers."""
def _remove_weight_norm(m: torch.nn.Module):
try:
logging.debug(f"Weight norm is removed from {m}.")
torch.nn.utils.remove_weight_norm(m)
except ValueError: # this module didn't have weight norm
return
self.apply(_remove_weight_norm)
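

if __name__ == "__main__":
    # Minimal, illustrative usage sketch: run a dummy mel-spectrogram through the
    # generator and score the resulting waveform with the discriminator. Shapes
    # follow the docstrings above; aux_context_window is set to 0 here so the toy
    # features need no extra context padding (with the default window of 2, the
    # features are assumed to be padded with context frames).
    generator = ParallelWaveGANGenerator(aux_context_window=0)
    discriminator = ParallelWaveGANDiscriminator()
    c = torch.randn(2, 80, 32)  # (B, aux_channels, T_feats)
    with torch.no_grad():
        wav = generator(c)  # (B, 1, T_feats * 256); 256 = prod(upsample_scales)
        score = discriminator(wav)  # (B, 1, T_feats * 256)
    print(wav.shape, score.shape)
    # single-utterance inference takes (T_feats, C) and returns (T_wav, out_channels)
    with torch.no_grad():
        single_wav = generator.inference(torch.randn(32, 80))
    print(single_wav.shape)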