
Commit 348064d

[TTS] add opencpop HIFIGAN example (#3038)

* add opencpop voc, test=tts
* soft link
* add opencpop hifigan, test=tts
* update

1 parent 4e9bca1 commit 348064d

15 files changed: +720 −0 lines changed

examples/opencpop/README.md

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
```markdown
# Opencpop

* svs1 - DiffSinger
* voc1 - Parallel WaveGAN
* voc5 - HiFiGAN
```
Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
```yaml
# This is the configuration file for the Opencpop dataset.
# This configuration is based on HiFiGAN V1, which is an official configuration.
# But I found that the optimizer setting does not work well with my implementation.
# So I changed the optimizer settings as follows:
#   - AdamW -> Adam
#   - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
#   - Scheduler: ExponentialLR -> MultiStepLR
# To match the shift size difference, the upsample scales are also modified
# from the original 256-shift setting.
```
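For reference, here is a minimal sketch of how the optimizer and scheduler choices described above map onto PaddlePaddle APIs (the stand-in model is hypothetical; the real example trains a HiFiGAN generator):

```python
import paddle

# Hypothetical stand-in module; the real example trains a HiFiGAN generator.
model = paddle.nn.Linear(80, 1)

# MultiStepLR as configured below: the lr is halved (gamma: 0.5)
# at steps 200k, 400k, 600k, and 800k.
scheduler = paddle.optimizer.lr.MultiStepDecay(
    learning_rate=2.0e-4,
    milestones=[200000, 400000, 600000, 800000],
    gamma=0.5)

# Adam with betas [0.5, 0.9] in place of AdamW's [0.8, 0.99].
optimizer = paddle.optimizer.Adam(
    learning_rate=scheduler,
    beta1=0.5,
    beta2=0.9,
    weight_decay=0.0,
    parameters=model.parameters())

# Each training iteration would then call optimizer.step(),
# scheduler.step(), and optimizer.clear_grad().
```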
```yaml
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000         # Sampling rate.
n_fft: 512        # FFT size (samples).
n_shift: 128      # Hop size (samples). ~5.3 ms at 24 kHz.
win_length: 512   # Window length (samples). ~21.3 ms at 24 kHz.
                  # If set to null, it will be the same as fft_size.
window: "hann"    # Window function.
n_mels: 80        # Number of mel basis.
fmin: 80          # Minimum frequency in mel basis calculation. (Hz)
fmax: 12000       # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 80                       # Number of input channels.
    out_channels: 1                       # Number of output channels.
    channels: 512                         # Number of initial channels.
    kernel_size: 7                        # Kernel size of initial and final conv layers.
    upsample_scales: [8, 4, 2, 2]         # Upsampling scales.
    upsample_kernel_sizes: [16, 8, 4, 4]  # Kernel sizes for upsampling layers.
    resblock_kernel_sizes: [3, 7, 11]     # Kernel sizes for residual blocks.
    resblock_dilations:                   # Dilations for residual blocks.
        - [1, 3, 5]
        - [1, 3, 5]
        - [1, 3, 5]
    use_additional_convs: True            # Whether to use additional conv layers in residual blocks.
    bias: True                            # Whether to use bias parameters in conv layers.
    nonlinear_activation: "leakyrelu"     # Nonlinear activation type.
    nonlinear_activation_params:          # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: True                 # Whether to apply weight normalization.
```
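The header's remark about matching the shift size can be verified directly: the generator expands one mel frame into one hop of waveform samples, so the product of upsample_scales must equal n_shift. A quick sanity check:

```python
import math

n_shift = 128                   # hop size from the feature extraction section
upsample_scales = [8, 4, 2, 2]  # from generator_params

# One mel frame must expand into exactly one hop of waveform samples.
assert math.prod(upsample_scales) == n_shift  # 8 * 4 * 2 * 2 == 128
```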
```yaml
###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    scales: 3                              # Number of multi-scale discriminators.
    scale_downsample_pooling: "AvgPool1D"  # Pooling operation for scale discriminator.
    scale_downsample_pooling_params:
        kernel_size: 4                     # Pooling kernel size.
        stride: 2                          # Pooling stride.
        padding: 2                         # Padding size.
    scale_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
        channels: 128                      # Initial number of channels.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        max_groups: 16                     # Maximum number of groups in downsampling conv layers.
        bias: True                         # Whether to use bias parameters in conv layers.
        downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
        nonlinear_activation: "leakyrelu"  # Nonlinear activation.
        nonlinear_activation_params:
            negative_slope: 0.1
    follow_official_norm: True             # Whether to follow the official norm setting.
    periods: [2, 3, 5, 7, 11]              # List of periods for the multi-period discriminator.
    period_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [5, 3]               # List of kernel sizes.
        channels: 32                       # Initial number of channels.
        downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        bias: True                         # Whether to use bias parameters in conv layers.
        nonlinear_activation: "leakyrelu"  # Nonlinear activation.
        nonlinear_activation_params:       # Nonlinear activation parameters.
            negative_slope: 0.1
        use_weight_norm: True              # Whether to apply weight normalization.
        use_spectral_norm: False           # Whether to apply spectral normalization.
```
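With scales: 3, the multi-scale discriminator sees the raw waveform plus two progressively average-pooled copies; together with the five period discriminators, that makes 3 + 5 = 8 sub-discriminators in total. A sketch of the pooling cascade, assuming the paddle.nn.AvgPool1D API:

```python
import paddle

# Pooling as configured in scale_downsample_pooling_params.
pool = paddle.nn.AvgPool1D(kernel_size=4, stride=2, padding=2)

wav = paddle.randn([1, 1, 8400])   # [batch, channels, samples]
inputs = [wav]
for _ in range(2):                 # scales: 3 -> two additional pooled copies
    inputs.append(pool(inputs[-1]))

print([x.shape[-1] for x in inputs])  # roughly halved each time: [8400, 4201, 2101]
```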
```yaml
###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
use_stft_loss: False   # Whether to use multi-resolution STFT loss.
use_mel_loss: True     # Whether to use mel-spectrogram loss.
mel_loss_params:
    fs: 24000
    fft_size: 512
    hop_size: 128
    win_length: 512
    window: "hann"
    num_mels: 80
    fmin: 30
    fmax: 12000
    log_base: null
generator_adv_loss_params:
    average_by_discriminators: False  # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: False  # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
    average_by_discriminators: False  # Whether to average loss by #discriminators.
    average_by_layers: False          # Whether to average loss by #layers in each discriminator.
    include_final_outputs: False      # Whether to include final outputs in feat match loss calculation.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_aux: 45.0         # Loss balancing coefficient for auxiliary (mel) loss.
lambda_adv: 1.0          # Loss balancing coefficient for adversarial loss.
lambda_feat_match: 2.0   # Loss balancing coefficient for feat match loss.
```
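The file does not spell out how the three lambdas combine; in the usual GAN-vocoder formulation the generator objective is a weighted sum of the mel, adversarial, and feature-matching terms. A sketch with hypothetical stand-in values for the loss terms:

```python
import paddle

lambda_aux = 45.0        # weight of the auxiliary (mel-spectrogram) loss
lambda_adv = 1.0         # weight of the adversarial loss
lambda_feat_match = 2.0  # weight of the feature-matching loss

# Hypothetical scalars standing in for the real loss tensors.
mel_loss = paddle.to_tensor(0.31)
adv_loss = paddle.to_tensor(2.40)
feat_match_loss = paddle.to_tensor(0.12)

gen_loss = (lambda_aux * mel_loss
            + lambda_adv * adv_loss
            + lambda_feat_match * feat_match_loss)
print(float(gen_loss))  # 45*0.31 + 1*2.40 + 2*0.12 = 16.59
```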
```yaml
###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 16          # Batch size.
batch_max_steps: 8400   # Length of each audio clip in a batch. Make sure it is divisible by hop_size.
num_workers: 1          # Number of workers in DataLoader.
```
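The divisibility requirement in the comment is worth checking: with n_shift: 128, 8400 = 65 * 128 + 80, so as written the value does not satisfy it (the nearest multiples of 128 are 8320 and 8448). A check one might run before training:

```python
n_shift = 128           # hop size from the feature extraction section
batch_max_steps = 8400

remainder = batch_max_steps % n_shift
if remainder != 0:
    lower = batch_max_steps - remainder
    print(f"batch_max_steps {batch_max_steps} is not divisible by the hop size "
          f"{n_shift} (remainder {remainder}); nearest multiples are "
          f"{lower} and {lower + n_shift}")
```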
```yaml
###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0       # Generator's weight decay coefficient.
generator_scheduler_params:
    learning_rate: 2.0e-4   # Generator's learning rate.
    gamma: 0.5              # Generator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
generator_grad_norm: -1     # Generator's gradient norm clip value (-1 means no clipping).
discriminator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0       # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    learning_rate: 2.0e-4   # Discriminator's learning rate.
    gamma: 0.5              # Discriminator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1 # Discriminator's gradient norm clip value (-1 means no clipping).

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
generator_train_start_steps: 1      # Step at which generator training starts.
discriminator_train_start_steps: 0  # Step at which discriminator training starts.
train_max_steps: 2500000            # Number of training steps.
save_interval_steps: 5000           # Interval steps to save checkpoint.
eval_interval_steps: 1000           # Interval steps to evaluate the network.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_snapshots: 4   # Max number of snapshots to keep while training.
seed: 42           # Random seed for paddle, random, and np.random.
```
Lines changed: 168 additions & 0 deletions
@@ -0,0 +1,168 @@
```yaml
# This is the configuration file for the Opencpop dataset.
# This configuration is based on HiFiGAN V1, which is an official configuration.
# But I found that the optimizer setting does not work well with my implementation.
# So I changed the optimizer settings as follows:
#   - AdamW -> Adam
#   - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
#   - Scheduler: ExponentialLR -> MultiStepLR
# To match the shift size difference, the upsample scales are also modified
# from the original 256-shift setting.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000         # Sampling rate.
n_fft: 512        # FFT size (samples).
n_shift: 128      # Hop size (samples). ~5.3 ms at 24 kHz.
win_length: 512   # Window length (samples). ~21.3 ms at 24 kHz.
                  # If set to null, it will be the same as fft_size.
window: "hann"    # Window function.
n_mels: 80        # Number of mel basis.
fmin: 80          # Minimum frequency in mel basis calculation. (Hz)
fmax: 12000       # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 80                       # Number of input channels.
    out_channels: 1                       # Number of output channels.
    channels: 512                         # Number of initial channels.
    kernel_size: 7                        # Kernel size of initial and final conv layers.
    upsample_scales: [8, 4, 2, 2]         # Upsampling scales.
    upsample_kernel_sizes: [16, 8, 4, 4]  # Kernel sizes for upsampling layers.
    resblock_kernel_sizes: [3, 7, 11]     # Kernel sizes for residual blocks.
    resblock_dilations:                   # Dilations for residual blocks.
        - [1, 3, 5]
        - [1, 3, 5]
        - [1, 3, 5]
    use_additional_convs: True            # Whether to use additional conv layers in residual blocks.
    bias: True                            # Whether to use bias parameters in conv layers.
    nonlinear_activation: "leakyrelu"     # Nonlinear activation type.
    nonlinear_activation_params:          # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: True                 # Whether to apply weight normalization.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    scales: 3                              # Number of multi-scale discriminators.
    scale_downsample_pooling: "AvgPool1D"  # Pooling operation for scale discriminator.
    scale_downsample_pooling_params:
        kernel_size: 4                     # Pooling kernel size.
        stride: 2                          # Pooling stride.
        padding: 2                         # Padding size.
    scale_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
        channels: 128                      # Initial number of channels.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        max_groups: 16                     # Maximum number of groups in downsampling conv layers.
        bias: True                         # Whether to use bias parameters in conv layers.
        downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
        nonlinear_activation: "leakyrelu"  # Nonlinear activation.
        nonlinear_activation_params:
            negative_slope: 0.1
    follow_official_norm: True             # Whether to follow the official norm setting.
    periods: [2, 3, 5, 7, 11]              # List of periods for the multi-period discriminator.
    period_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [5, 3]               # List of kernel sizes.
        channels: 32                       # Initial number of channels.
        downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        bias: True                         # Whether to use bias parameters in conv layers.
        nonlinear_activation: "leakyrelu"  # Nonlinear activation.
        nonlinear_activation_params:       # Nonlinear activation parameters.
            negative_slope: 0.1
        use_weight_norm: True              # Whether to apply weight normalization.
        use_spectral_norm: False           # Whether to apply spectral normalization.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
use_stft_loss: False   # Whether to use multi-resolution STFT loss.
use_mel_loss: True     # Whether to use mel-spectrogram loss.
mel_loss_params:
    fs: 24000
    fft_size: 512
    hop_size: 128
    win_length: 512
    window: "hann"
    num_mels: 80
    fmin: 30
    fmax: 12000
    log_base: null
generator_adv_loss_params:
    average_by_discriminators: False  # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: False  # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
    average_by_discriminators: False  # Whether to average loss by #discriminators.
    average_by_layers: False          # Whether to average loss by #layers in each discriminator.
    include_final_outputs: False      # Whether to include final outputs in feat match loss calculation.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_aux: 45.0         # Loss balancing coefficient for auxiliary (mel) loss.
lambda_adv: 1.0          # Loss balancing coefficient for adversarial loss.
lambda_feat_match: 2.0   # Loss balancing coefficient for feat match loss.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
#batch_size: 16         # Batch size.
batch_size: 1           # Batch size.
batch_max_steps: 8400   # Length of each audio clip in a batch. Make sure it is divisible by hop_size.
num_workers: 1          # Number of workers in DataLoader.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0       # Generator's weight decay coefficient.
generator_scheduler_params:
    learning_rate: 2.0e-4   # Generator's learning rate.
    gamma: 0.5              # Generator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
generator_grad_norm: -1     # Generator's gradient norm clip value (-1 means no clipping).
discriminator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0       # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    learning_rate: 2.0e-4   # Discriminator's learning rate.
    gamma: 0.5              # Discriminator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1 # Discriminator's gradient norm clip value (-1 means no clipping).

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
generator_train_start_steps: 1      # Step at which generator training starts.
discriminator_train_start_steps: 0  # Step at which discriminator training starts.
train_max_steps: 2600000            # Number of training steps.
save_interval_steps: 5000           # Interval steps to save checkpoint.
eval_interval_steps: 1000           # Interval steps to evaluate the network.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_snapshots: 4   # Max number of snapshots to keep while training.
seed: 42           # Random seed for paddle, random, and np.random.
```
