|
| 1 | +# This is the configuration file for CSMSC dataset. |
| 2 | +# This configuration is based on HiFiGAN V1, which is an official configuration. |
| 3 | +# But I found that the optimizer setting does not work well with my implementation. |
| 4 | +# So I changed optimizer settings as follows: |
| 5 | +# - AdamW -> Adam |
| 6 | +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] |
| 7 | +# - Scheduler: ExponentialLR -> MultiStepLR |
| 8 | +# To match the shift size difference, the upsample scales are also modified from the original 256 shift setting.
| 9 | + |
| 10 | +########################################################### |
| 11 | +# FEATURE EXTRACTION SETTING # |
| 12 | +########################################################### |
| 13 | +fs: 24000 # Sampling rate. |
| 14 | +n_fft: 512 # FFT size (samples). |
| 15 | +n_shift: 128 # Hop size (samples). ~5.3ms at fs=24000
| 16 | +win_length: 512 # Window length (samples). ~21.3ms at fs=24000
| 17 | + # If set to null, it will be the same as fft_size. |
| 18 | +window: "hann" # Window function. |
| 19 | +n_mels: 80 # Number of mel basis. |
| 20 | +fmin: 80 # Minimum freq in mel basis calculation. (Hz) |
| 21 | +fmax: 12000 # Maximum frequency in mel basis calculation. (Hz) |
| 22 | + |
| 23 | +########################################################### |
| 24 | +# GENERATOR NETWORK ARCHITECTURE SETTING # |
| 25 | +########################################################### |
| 26 | +generator_params: |
| 27 | + in_channels: 80 # Number of input channels. |
| 28 | + out_channels: 1 # Number of output channels. |
| 29 | + channels: 512 # Number of initial channels. |
| 30 | + kernel_size: 7 # Kernel size of initial and final conv layers. |
| 31 | + upsample_scales: [8, 4, 2, 2] # Upsampling scales. |
| 32 | + upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers. |
| 33 | + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. |
| 34 | + resblock_dilations: # Dilations for residual blocks. |
| 35 | + - [1, 3, 5] |
| 36 | + - [1, 3, 5] |
| 37 | + - [1, 3, 5] |
| 38 | + use_additional_convs: True # Whether to use additional conv layer in residual blocks. |
| 39 | + bias: True # Whether to use bias parameter in conv. |
| 40 | + nonlinear_activation: "leakyrelu" # Nonlinear activation type. |
| 41 | + nonlinear_activation_params: # Nonlinear activation parameters.
| 42 | + negative_slope: 0.1 |
| 43 | + use_weight_norm: True # Whether to apply weight normalization. |
| 44 | + |
| 45 | + |
| 46 | +########################################################### |
| 47 | +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # |
| 48 | +########################################################### |
| 49 | +discriminator_params: |
| 50 | + scales: 3 # Number of multi-scale discriminator. |
| 51 | + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. |
| 52 | + scale_downsample_pooling_params: |
| 53 | + kernel_size: 4 # Pooling kernel size. |
| 54 | + stride: 2 # Pooling stride. |
| 55 | + padding: 2 # Padding size. |
| 56 | + scale_discriminator_params: |
| 57 | + in_channels: 1 # Number of input channels. |
| 58 | + out_channels: 1 # Number of output channels. |
| 59 | + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. |
| 60 | + channels: 128 # Initial number of channels. |
| 61 | + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. |
| 62 | + max_groups: 16 # Maximum number of groups in downsampling conv layers. |
| 63 | + bias: True |
| 64 | + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. |
| 65 | + nonlinear_activation: "leakyrelu" # Nonlinear activation. |
| 66 | + nonlinear_activation_params: |
| 67 | + negative_slope: 0.1 |
| 68 | + follow_official_norm: True # Whether to follow the official norm setting. |
| 69 | + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. |
| 70 | + period_discriminator_params: |
| 71 | + in_channels: 1 # Number of input channels. |
| 72 | + out_channels: 1 # Number of output channels. |
| 73 | + kernel_sizes: [5, 3] # List of kernel sizes. |
| 74 | + channels: 32 # Initial number of channels. |
| 75 | + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. |
| 76 | + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. |
| 77 | + bias: True # Whether to use bias parameter in conv layer.
| 78 | + nonlinear_activation: "leakyrelu" # Nonlinear activation. |
| 79 | + nonlinear_activation_params: # Nonlinear activation parameters.
| 80 | + negative_slope: 0.1 |
| 81 | + use_weight_norm: True # Whether to apply weight normalization. |
| 82 | + use_spectral_norm: False # Whether to apply spectral normalization. |
| 83 | + |
| 84 | + |
| 85 | +########################################################### |
| 86 | +# STFT LOSS SETTING # |
| 87 | +########################################################### |
| 88 | +use_stft_loss: False # Whether to use multi-resolution STFT loss. |
| 89 | +use_mel_loss: True # Whether to use Mel-spectrogram loss. |
| 90 | +mel_loss_params: |
| 91 | + fs: 24000 |
| 92 | + fft_size: 512 |
| 93 | + hop_size: 128 |
| 94 | + win_length: 512 |
| 95 | + window: "hann" |
| 96 | + num_mels: 80 |
| 97 | + fmin: 30 |
| 98 | + fmax: 12000 |
| 99 | + log_base: null |
| 100 | +generator_adv_loss_params: |
| 101 | + average_by_discriminators: False # Whether to average loss by #discriminators. |
| 102 | +discriminator_adv_loss_params: |
| 103 | + average_by_discriminators: False # Whether to average loss by #discriminators. |
| 104 | +use_feat_match_loss: True |
| 105 | +feat_match_loss_params: |
| 106 | + average_by_discriminators: False # Whether to average loss by #discriminators. |
| 107 | + average_by_layers: False # Whether to average loss by #layers in each discriminator. |
| 108 | + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. |
| 109 | + |
| 110 | +########################################################### |
| 111 | +# ADVERSARIAL LOSS SETTING # |
| 112 | +########################################################### |
| 113 | +lambda_aux: 45.0 # Loss balancing coefficient for aux loss (mel loss here, since use_stft_loss is False).
| 114 | +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. |
| 115 | +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.
| 116 | + |
| 117 | +########################################################### |
| 118 | +# DATA LOADER SETTING # |
| 119 | +########################################################### |
| 120 | +#batch_size: 16 # Batch size. |
| 121 | +batch_size: 1 # Batch size. |
| 122 | +batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. |
| 123 | +num_workers: 1 # Number of workers in DataLoader. |
| 124 | + |
| 125 | +########################################################### |
| 126 | +# OPTIMIZER & SCHEDULER SETTING # |
| 127 | +########################################################### |
| 128 | +generator_optimizer_params: |
| 129 | + beta1: 0.5 |
| 130 | + beta2: 0.9 |
| 131 | + weight_decay: 0.0 # Generator's weight decay coefficient. |
| 132 | +generator_scheduler_params: |
| 133 | + learning_rate: 2.0e-4 # Generator's learning rate. |
| 134 | + gamma: 0.5 # Generator's scheduler gamma. |
| 135 | + milestones: # At each milestone, lr will be multiplied by gamma. |
| 136 | + - 200000 |
| 137 | + - 400000 |
| 138 | + - 600000 |
| 139 | + - 800000 |
| 140 | +generator_grad_norm: -1 # Generator's gradient norm. |
| 141 | +discriminator_optimizer_params: |
| 142 | + beta1: 0.5 |
| 143 | + beta2: 0.9 |
| 144 | + weight_decay: 0.0 # Discriminator's weight decay coefficient. |
| 145 | +discriminator_scheduler_params: |
| 146 | + learning_rate: 2.0e-4 # Discriminator's learning rate. |
| 147 | + gamma: 0.5 # Discriminator's scheduler gamma. |
| 148 | + milestones: # At each milestone, lr will be multiplied by gamma. |
| 149 | + - 200000 |
| 150 | + - 400000 |
| 151 | + - 600000 |
| 152 | + - 800000 |
| 153 | +discriminator_grad_norm: -1 # Discriminator's gradient norm. |
| 154 | + |
| 155 | +########################################################### |
| 156 | +# INTERVAL SETTING # |
| 157 | +########################################################### |
| 158 | +generator_train_start_steps: 1 # Number of steps to start to train generator.
| 159 | +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. |
| 160 | +train_max_steps: 2600000 # Number of training steps. |
| 161 | +save_interval_steps: 5000 # Interval steps to save checkpoint. |
| 162 | +eval_interval_steps: 1000 # Interval steps to evaluate the network. |
| 163 | + |
| 164 | +########################################################### |
| 165 | +# OTHER SETTING # |
| 166 | +########################################################### |
| 167 | +num_snapshots: 4 # max number of snapshots to keep while training |
| 168 | +seed: 42 # random seed for paddle, random, and np.random |
0 commit comments