added the missing files

Browse files

Files changed (4) hide show

classifier_esc50.ckpt +3 -0
embedding_model.ckpt +3 -0
embedding_model_esc50ft.ckpt +3 -0
hyperparams.yaml +226 -0

classifier_esc50.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e85ef49491db26ce50ee49753ed83cb7b7eb760d47f4c1a01fb2bdef0dcea704
+size 1647311

embedding_model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca6f7dcf4eb97e68fb0989e3fbc9c667c60eaa0c598753e86e7b07bac0729755
+size 301999678

embedding_model_esc50ft.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:375b53b0107759f58b173759d9c439211a648970f3d0ea02a2ace179cf8550f7
+size 301999678

hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,226 @@

+# Generated 2022-11-21 from:
+# /home/cem/Dropbox/speechbrain-1/recipes/ESC50/classification/hparams/cnn14.yaml
+# yamllint disable
+# #################################
+# Basic training parameters for sound classification using the ESC50 dataset.
+# This recipe uses a CNN14 embedding model followed by the ECAPA-TDNN classifier head.
+#
+# Author:
+#  * Cem Subakan
+#  (based on the SpeechBrain UrbanSound8k recipe)
+# #################################
+# Seed needs to be set at top of yaml, before objects with parameters are made
+seed: 11
+__set_seed: !!python/object/apply:torch.manual_seed [11]  # NOTE(review): python-specific tag; applies torch.manual_seed to [11] when the file is loaded and is rejected by safe YAML loaders
+# Set up folders for reading from and writing to
+# Dataset must already exist at `audio_data_folder`
+data_folder: /data2/ESC-50-master
+                          # e.g., /localscratch/ESC-50-master
+open_rir_folder: /data2/ESC-50-master/RIRS # Change if needed; resolved form of <data_folder>/RIRS, matching the openrir_folder values of the EnvCorrupt blocks
+audio_data_folder: /data2/ESC-50-master/audio
+# TODO the following folder will contain the resampled audio
+# files (mono channel and configured sample rate) to train on
+#resampled_audio_data_folder: !ref <data_folder>/audio_mono16kHz
+#
+experiment_name: cnn14
+output_folder: ./results/cnn14/11
+save_folder: ./results/cnn14/11/save
+train_log: ./results/cnn14/11/train_log.txt
+test_only: false
+# Tensorboard logs
+use_tensorboard: false
+tensorboard_logs_folder: ./results/cnn14/11/tb_logs/
+# Path where data manifest files will be stored
+train_annotation: /data2/ESC-50-master/manifest/train.json
+valid_annotation: /data2/ESC-50-master/manifest/valid.json
+test_annotation: /data2/ESC-50-master/manifest/test.json
+# To standardize results, ESC-50 ships with its clips pre-assigned to
+# 5 folds for cross-validation (here: folds 1-3 train, 4 valid, 5 test)
+train_fold_nums: [1, 2, 3]
+valid_fold_nums: [4]
+test_fold_nums: [5]
+skip_manifest_creation: false
+ckpt_interval_minutes: 15 # save checkpoint every N min
+# Training parameters
+number_of_epochs: 200
+batch_size: 32
+lr: 0.0002
+base_lr: 0.00000001
+max_lr: 0.0002
+step_size: 65000
+sample_rate: 44100
+device: cpu
+# Feature parameters
+n_mels: 80
+left_frames: 0
+right_frames: 0
+deltas: false
+amp_to_db: true
+normalize: true
+# Number of classes
+out_n_neurons: 50
+# Note that it's actually important to shuffle the data here
+# (or at the very least, not sort the data by duration)
+# Also note that this does not violate the UrbanSound8k "no-shuffle" policy
+# because this does not mix samples from folds in train to valid/test, only
+# within train or valid, or test
+shuffle: true
+dataloader_options:
+  batch_size: 32
+  shuffle: true
+  num_workers: 0
+# Functions
+compute_features: &id003 !new:speechbrain.lobes.features.Fbank  # anchored as id003 and exposed through `modules.compute_features`
+  n_mels: 80
+  left_frames: 0
+  right_frames: 0
+  deltas: false
+  sample_rate: 44100
+  n_fft: 1024
+  win_length: 20  # NOTE(review): differs from compute_stft win_length (23.2199) -- confirm this is intended
+  hop_length: 10  # NOTE(review): differs from compute_stft hop_length (11.6099) -- confirm this is intended
+use_pretrain: false
+embedding_model: &id009 !new:recipes.ESC50.classification.custom_models.Cnn14
+  mel_bins: 80
+  emb_dim: 2048
+classifier: &id010 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
+  input_size: 2048
+  out_neurons: 50
+  lin_blocks: 1
+epoch_counter: &id012 !new:speechbrain.utils.epoch_loop.EpochCounter
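+# NOTE(review): the original remark here ("delete the pretrained_separator field to skip the pretrained separator") looks carried over from another recipe; this file defines no pretrained_separator field.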
+  limit: 200
+augment_wavedrop: &id004 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+  sample_rate: 44100
+  speeds: [100]
+augment_speed: &id005 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+  sample_rate: 44100
+  speeds: [95, 100, 105]
+add_rev: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
+  openrir_folder: /data2/ESC-50-master/RIRS
+  openrir_max_noise_len: 3.0    # seconds
+  reverb_prob: 1.0
+  noise_prob: 0.0
+  noise_snr_low: 0
+  noise_snr_high: 15
+  rir_scale_factor: 1.0
+add_noise: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
+  openrir_folder: /data2/ESC-50-master/RIRS
+  openrir_max_noise_len: 3.0    # seconds
+  reverb_prob: 0.0
+  noise_prob: 1.0
+  noise_snr_low: 0
+  noise_snr_high: 15
+  rir_scale_factor: 1.0
+add_rev_noise: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
+  openrir_folder: /data2/ESC-50-master/RIRS
+  openrir_max_noise_len: 3.0    # seconds
+  reverb_prob: 1.0
+  noise_prob: 1.0
+  noise_snr_low: 0
+  noise_snr_high: 15
+  rir_scale_factor: 1.0
+# Definition of the augmentation pipeline.
+# If concat_augment = False, the augmentation techniques are applied
+# in sequence. If concat_augment = True, all the augmented signals
+# are concatenated in a single big batch.
+augment_pipeline: []
+concat_augment: true
+mean_var_norm: &id011 !new:speechbrain.processing.features.InputNormalization
+  norm_type: sentence
+  std_norm: false
+# pre-processing
+n_fft: 1024
+spec_mag_power: 0.5
+hop_length: 11.6099
+win_length: 23.2199
+compute_stft: &id001 !new:speechbrain.processing.features.STFT  # anchored as id001 and exposed through `modules.compute_stft`
+  n_fft: 1024
+  hop_length: 11.6099  # presumably milliseconds (~512 samples at 44100 Hz) -- TODO confirm
+  win_length: 23.2199  # presumably milliseconds (~1024 samples at 44100 Hz, i.e. n_fft) -- TODO confirm
+  sample_rate: 44100
+compute_fbank: &id002 !new:speechbrain.processing.features.Filterbank
+  n_mels: 80
+  n_fft: 1024
+  sample_rate: 44100
+modules:
+  compute_stft: *id001
+  compute_fbank: *id002
+  compute_features: *id003
+  augment_wavedrop: *id004
+  augment_speed: *id005
+  add_rev: *id006
+  add_noise: *id007
+  add_rev_noise: *id008
+  embedding_model: *id009
+  classifier: *id010
+  mean_var_norm: *id011
+compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
+  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
+    margin: 0.2
+    scale: 30
+# compute_error: !name:speechbrain.nnet.losses.classification_error
+opt_class: !name:torch.optim.Adam
+  lr: 0.0002
+  weight_decay: 0.000002
+lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
+  base_lr: 0.00000001
+  max_lr: 0.0002
+  step_size: 65000
+# Logging + checkpoints
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+  save_file: ./results/cnn14/11/train_log.txt
+error_stats: !name:speechbrain.utils.metric_stats.MetricStats
+  metric: !name:speechbrain.nnet.losses.classification_error
+    reduction: batch
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+  checkpoints_dir: ./results/cnn14/11/save
+  recoverables:
+    embedding_model: *id009
+    classifier: *id010
+    normalizer: *id011
+    counter: *id012
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer  # NOTE(review): no `paths` entry is given here; checkpoint locations are presumably supplied by the caller -- confirm
+  loadables:
+    embedding_model: !ref <embedding_model>  # refers to the top-level `embedding_model` key (anchor id009)
+    classifier: !ref <classifier>  # refers to the top-level `classifier` key (anchor id010)