Config.py
import numpy as np
from sacred import Ingredient

config_ingredient = Ingredient("cfg")


@config_ingredient.config
def cfg():
    # Base configuration
    model_config = {"model_base_dir": "/content/drive/MyDrive/BE_PROJECT/checkpoints",  # Base folder for model checkpoints
                    "log_dir": "/content/drive/MyDrive/BE_PROJECT/logs",  # Base folder for log files
                    "batch_size": 16,  # Batch size
                    "init_sup_sep_lr": 1e-4,  # Initial learning rate for the supervised separator
                    "epoch_it": 2000,  # Number of supervised separator steps per epoch
                    "cache_size": 16,  # Number of audio excerpts cached to build batches from
                    "num_workers": 6,  # Number of processes reading audio and filling up the cache
                    "duration": 2,  # Duration in seconds of the audio excerpts in the cache. Has to be at least the output length of the network!
                    "min_replacement_rate": 16,  # Roughly how many cache entries to replace per batch on average, at minimum. Can be fractional
                    "num_layers": 12,  # Number of U-Net layers
                    "filter_size": 15,  # Filter size of the convolutions in the downsampling blocks
                    "merge_filter_size": 5,  # Filter size of the convolutions in the upsampling blocks
                    "num_initial_filters": 24,  # Number of filters for the convolution in the first layer of the network
                    "num_frames": 16384,  # DESIRED number of time frames in the output waveform per sample (can change when using valid padding)
                    "expected_sr": 22050,  # If the audio input has a higher sampling rate than 22.05 kHz, downsample to this rate
                    "mono_downmix": True,  # Whether to convert the audio input to mono, if required
                    "output_type": "direct",  # Type of output layer, either "direct" or "difference". Direct: each source is the result of an independent tanh activation. Difference: the last source output equals the mixture input minus the sum of all other sources
                    "context": False,  # Type of padding for convolutions in the separator. If False, feature maps double or halve in size after each convolution and convolutions are zero-padded ("same" padding). If True, convolution is only performed on the available mixture input, so the output is smaller than the input
                    "network": "unet",  # Type of network architecture - Stoller et al.'s (2018) Wave-U-Net
                    "upsampling": "linear",  # Technique used for upsampling the feature maps in the U-Net architecture, either "linear" interpolation or "learned" filling in of extra samples
                    "task": "speech",  # Type of separation task. "speech": separate the mix into speech and background noise
                    "augmentation": True,  # Random attenuation of source signals to improve generalisation performance (data augmentation)
                    "raw_audio_loss": True,  # Only relevant when the spectrogram-based U-Net is used, as per Jansson et al. (2017), available at https://github.com/f90/Wave-U-Net/blob/master/Models/UnetSpectrogramSeparator.py. True: L2 loss on audio. False: L1 loss on spectrogram magnitudes for training and validation loss
                    "worse_epochs": 20,  # Patience for early stopping on the validation set
                    "ccmixter_path": "/content/drive/MyDrive/BE_PROJECT/wave_dataset/CCMixter_latest.xml"  # Path to the CCMixter dataset XML
                    }
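
    # Illustrative note (an assumption, not in the original file): with
    # output_type set to "difference", only num_sources - 1 sources are
    # predicted directly and the last one is reconstructed as the residual,
    #   sources[-1] = mix - sum(sources[:-1])
    # which guarantees that the estimates sum to the input mixture.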
    seed = 1337
    experiment_id = np.random.randint(0, 1000000)  # Random identifier for this training run
    model_config["num_sources"] = 2  # Two output sources (speech and background noise for the "speech" task)
    model_config["num_channels"] = 1 if model_config["mono_downmix"] else 2  # Mono or stereo input/output


@config_ingredient.named_config
def baseline():
    print("Training baseline model")