speechbrain · fpaissan · Feb 27, 2025 · Apr 9, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,32 @@
+{
+    // Usare IntelliSense per informazioni sui possibili attributi.
+    // Al passaggio del mouse vengono visualizzate le descrizioni degli attributi esistenti.
+    // Per altre informazioni, visitare: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Debugger Python: train_lmactd.py",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "args": ["/speechbrain_fork/recipes/ESC50/interpret/hparams/lmactd_cnn14.yaml", "--data_folder=/data/ESC50", "--batch_size=1"]
+        }, 
+        {
+            "name": "Debugger Python: eval.py",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "args": [
+                "/speechbrain_fork/recipes/ESC50/interpret/hparams/lmactd_cnn14.yaml",
+                "--data_folder", "/data/ESC50",
+                "--overlap_type", "mixtures",
+                "--add_wham_noise", "False",
+                "--pretrained_interpreter", "/speechbrain_fork/recipes/ESC50/interpret/results/LMACTD_cnn14/44/save/CKPT+2025-03-26+16-19-31+00"
+            ]
+        }
+    ]
+}
diff --git a/recipes/ESC50/esc50_prepare.py b/recipes/ESC50/esc50_prepare.py
@@ -2,7 +2,7 @@
 Creates data manifest files for ESC50
 If the data does not exist in the specified --data_folder, we download the data automatically.
 
-https://github.com/karolpiczak/ESC-50/
+https://github.com/karoldvl/ESC-50/
 
 Authors:
  * Cem Subakan 2022, 2023
@@ -12,6 +12,7 @@
 """
 
 import json
+import logging
 import os
 import shutil
 
@@ -20,12 +21,11 @@
 
 import speechbrain as sb
 from speechbrain.dataio.dataio import load_data_csv, read_audio
-from speechbrain.utils.fetching import LocalStrategy, fetch
-from speechbrain.utils.logger import get_logger
+from speechbrain.utils.fetching import fetch
 
-logger = get_logger(__name__)
+logger = logging.getLogger(__name__)
 
-ESC50_DOWNLOAD_URL = "https://github.com/karolpiczak/ESC-50/archive/master.zip"
+ESC50_DOWNLOAD_URL = "https://github.com/karoldvl/ESC-50/archive/master.zip"
 MODIFIED_METADATA_FILE_NAME = "esc50_speechbrain.csv"
 
 ACCEPTABLE_FOLD_NUMS = [1, 2, 3, 4, 5]
@@ -47,16 +47,14 @@ def download_esc50(data_path):
         temp_path = os.path.join(data_path, "temp_download")
 
         # download the data
-        archive_path = fetch(
+        fetch(
             "master.zip",
-            "https://github.com/karolpiczak/ESC-50/archive/",  # noqa ignore-url-check
+            "https://github.com/karoldvl/ESC-50/archive/",
             savedir=temp_path,
-            # URL, so will be fetched directly in the savedir anyway
-            local_strategy=LocalStrategy.COPY_SKIP_CACHE,
         )
 
         # unpack the .zip file
-        shutil.unpack_archive(archive_path, data_path)
+        shutil.unpack_archive(os.path.join(temp_path, "master.zip"), data_path)
 
         # move the files up to the datapath
         files = os.listdir(os.path.join(data_path, "ESC-50-master"))
@@ -266,7 +264,7 @@ def create_json(metadata, audio_data_folder, folds_list, json_file):
     if not os.path.exists(parent_dir):
         os.mkdir(parent_dir)
 
-    with open(json_file, mode="w", encoding="utf-8") as json_f:
+    with open(json_file, mode="w") as json_f:
         json.dump(json_dict, json_f, indent=2)
 
     logger.info(f"{json_file} successfully created!")

diff --git a/recipes/ESC50/interpret/README.md b/recipes/ESC50/interpret/README.md
@@ -39,9 +39,12 @@ Some results that are obtained with this recipe on the OOD evaluation are as fol
 
 |Method | AI    | AD  	| AG  	|FF   	|Fid-In   | SPS | COMP |
 |---	|---	|---	|---	|---	| ----    | --   | ---  |
-|L-MAC 	| 61.62 | 3.83 | 33.48 | 0.40 | 0.82 | 0.93 | 9.77 |
-|L-MAC FT | 58.87 | 4.89 | 30.84 | 0.40 | 0.82 | 0.82 | 10.65 |
-|L2I   	| 6.75  |25.93 	|1.25  	|0.26  | 0.01  | 0.58   | 11.38  |
+|L-MAC 	| 60.63 | 4.82 | 35.85 | 0.39 | 0.81 | 0.94 | 9.61 |
+|L-MAC FT | 50.75 | 6.73 | 26.00 | 0.39 | 0.81 | 0.84 | 10.51 |
+|L-MAC TD, &alpha; = 1.00 | 56.75 | 3.62 | 16.84 | 0.42 | 0.88 | 0.89 | 10.36 |
+|L-MAC TD, &alpha; = 0.75 | 59.50 | 3.42 | 21.22 | 0.41 | 0.87 | 0.88 | 10.35 |
+|L-MAC TD, &alpha; = 0.00 | 39.88 | 7.60 | 9.30 | 0.42 | 0.82 | 0.83 | 10.69 |
+|L2I   	| 5.00  | 25.65 | 1.00  	|0.20  | 0.35  | 0.52   | 10.99  |
 
 Please, refer to the [L-MAC paper](https://arxiv.org/abs/2403.13086) for more information about the evaluation metrics.
 
@@ -76,6 +79,40 @@ where $g_w$ is the guidance weight for the interpreter.
 
 The pretrained classifier to be interpreted is specified with the variables `embedding_model_path`, and `classifier_model_path`. The default model is a model we trained on ESC50, however, if you would like to specify your own model just use paths that point to your own model.
 
+---------------------------------------------------------------------------------------------------------
+### LMAC-TD: Producing Time Domain Explanations for Audio Classifiers 
+
+Following the approach adopted in LMAC, LMAC-TD trains an interpreter on the classifier's representations to reconstruct interpretations based on an amortized inference loss.
+
+For more details, refer to our [LMAC-TD paper](https://arxiv.org/abs/2409.08655). You can also find samples on the [companion website](https://francescopaissan.it/lmac-td/).
+
+To train LMAC-TD on a convolutional classifier using the ESC50 dataset, use the `train_lmactd.py` script. Run the following command:
+
+```shell
+python train_lmactd.py hparams/lmactd_cnn14.yaml --data_folder=/yourpath/ESC50
+```
+
+Optionally, you can use WHAM! augmentation to boost the performance of the interpretations, using:
+```shell
+python  train_lmactd.py hparams/lmactd_cnn14.yaml \
+         --data_folder=/yourpath/ESC50 --add_wham_noise True \
+         --wham_folder=/yourpath/wham_noise
+```
+**Note**: The WHAM! noise dataset can be downloaded from [here](http://wham.whisper.ai/).
+
+<!-- To run the finetuning stage of the interpreter, use
+```shell
+python  train_sepformerlmac_classifierreps.py \
+    hparams/sepformerlmac_cnn14_classifierreps.yaml --data_folder=/yourpath/ESC50 \
+    --add_wham_noise True --wham_folder=/yourpath/wham_noise \
+    --finetuning True --pretrained_interpreter=/yourLMACcheckpointpath/psi_model.ckpt --g_w 4
+```
+where $g_w$ is the guidance weight for the interpreter. -->
+
+#### Specifying the pretrained classifier
+
+The pretrained classifier to be interpreted is specified with the variables `embedding_model_path`, and `classifier_model_path`. The default model is a model we trained on ESC50, however, if you would like to specify your own model just use paths that point to your own model.
+
 ---------------------------------------------------------------------------------------------------------
 
 ### Posthoc Interpretation via Quantization (PIQ)
@@ -160,6 +197,9 @@ python eval.py hparams/<config>.yaml --data_folder /yourpath/esc50 --overlap_typ
 
 Note that overlap type should be either `mixture` (for contaminating signal to be set as other signals from ESC50), `LJSpeech` (for contaminating signal to be set as speech), or `white_noise` (for contaminating signal to be set as white noise). Please refer to the L-MAC paper for the performance obtained in each setting. Note that `yourpath/psi_model.ckpt` should point to the path of the model checkpoint you would like to use. The typical path for `yourpath/psi_model.ckpt` would be similar to `results/LMAC_cnn14/1234/save/CKPT+2024-06-20+16-05-44+00/psi_model.ckpt`.
 
+**N.B.** For the **LMAC-TD** case, `yourpath/psi_model` should point to the folder of the model checkpoint you would like to use, not the `.ckpt` file itself. For example, the typical path would look like:
+`results/LMACTD_cnn14/1234/save/CKPT+2024-06-20+16-05-44+00/`.
+
 Note also that `add_wham_noise` should be set to `False`.
 
 Another thing to note is that if you use `--overlap_type LJSpeech`, you would need to specify the path via the variable `ljspeech_path`. If the LJSpeech dataset is not already downloaded on the path you specify, the code will automatically download it, and use the downloaded data.
@@ -211,6 +251,19 @@ python eval.py hparams/<config>.yaml --data_folder /yourpath/esc50 --add_wham_no
 
 ## Citing
 
+Please cite our [LMAC-TD paper](https://arxiv.org/abs/2409.08655) if you use it in your research:
+
+```bibtex
+@inproceedings{lmactd,
+  author={Eleonora Mancini and Francesco Paissan and Mirco Ravanelli and Cem Subakan},
+  booktitle={ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
+  title={LMAC-TD: Producing Time Domain Explanations for Audio Classifiers}, 
+  year={2025},
+  pages={1-5}
+  }
+
+```
+
 Please cite our [L-MAC paper](https://arxiv.org/abs/2403.13086) if you use it in your research:
 
 ```bibtex

diff --git a/recipes/ESC50/interpret/eval.py b/recipes/ESC50/interpret/eval.py
@@ -6,8 +6,9 @@
     Please refer to README.md for more details.
 
 Authors
-    * Francesco Paissan 2024
-    * Cem Subakan 2024
+    * Eleonora Mancini 2025
+    * Francesco Paissan 2025
+    * Cem Subakan 2025
 """
 import os
 import random
@@ -22,6 +23,9 @@
 from hyperpyyaml import load_hyperpyyaml
 from train_l2i import L2I
 from train_lmac import LMAC
+from train_lmactd import LMACTD
+
+# from train_sepformerlmac import SepformerLMAC
 from wham_prepare import prepare_wham
 
 import speechbrain as sb
@@ -58,23 +62,12 @@ def __init__(self, root, url, folder_in_archive, download, train=True):
 
 
 class ESCContaminated(torch.utils.data.Dataset):
-    """ESC50 Contaminated dataset
-
-    Arguments
-    ---------
-    esc50_ds : dataset
-        the ESC50 dataset as per training.
-    cont_d : dataset
-        the contamination dataset.
-    overlap_multiplier : int
-        number of overlaps
-    overlap_type : str
-        one of "mixtures" or "LJSpeech" or "white_noise"
-    """
-
     def __init__(
         self, esc50_ds, cont_d, overlap_multiplier=2, overlap_type="mixtures"
     ):
+        """esc50_ds is the ESC50 dataset as per training.
+        cont_d is the contamination dataset.
+        overlap_multiplier works as before"""
         super().__init__()
 
         self.esc50_ds = esc50_ds
@@ -143,7 +136,7 @@ def __getitem__(
 if __name__ == "__main__":
     hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
 
-    with open(hparams_file, encoding="utf-8") as fin:
+    with open(hparams_file) as fin:
         hparams = load_hyperpyyaml(fin, overrides)
 
     if hparams["add_wham_noise"]:
@@ -226,12 +219,44 @@ def __getitem__(
     if hparams["add_wham_noise"]:
         ood_dataset = datasets["valid"]
 
-    assert (
-        hparams["pretrained_interpreter"] is not None
-    ), "You need to specify a path for the pretrained_interpreter!"
-    hparams["psi_model"].load_state_dict(
-        torch.load(hparams["pretrained_interpreter"], map_location="cpu")
-    )
+    if hparams["int_method"] != "lmactd":
+        assert (
+            hparams["pretrained_interpreter"] is not None
+        ), "You need to specify a path for the pretrained_interpreter!"
+        hparams["psi_model"].load_state_dict(
+            torch.load(hparams["pretrained_interpreter"], map_location="cpu")
+        )
+
+    else:
+        assert (
+            hparams["pretrained_interpreter"] is not None
+        ), "You need to specify a path for the pretrained_interpreter!"
+
+        # Load each component separately
+        hparams["MaskNet"].load_state_dict(
+            torch.load(
+                f"{hparams['pretrained_interpreter']}/masknet.ckpt",
+                torch.device("cpu"),
+            )
+        )
+        hparams["Encoder"].load_state_dict(
+            torch.load(
+                f"{hparams['pretrained_interpreter']}/encoder.ckpt",
+                torch.device("cpu"),
+            )
+        )
+        hparams["Decoder"].load_state_dict(
+            torch.load(
+                f"{hparams['pretrained_interpreter']}/decoder.ckpt",
+                torch.device("cpu"),
+            )
+        )
+        hparams["convt_decoder"].load_state_dict(
+            torch.load(
+                f"{hparams['pretrained_interpreter']}/convt_decoder.ckpt",
+                torch.device("cpu"),
+            )
+        )
 
     if hparams["int_method"] == "lmac":
         Interpreter = LMAC(
@@ -253,6 +278,16 @@ def __getitem__(
             hparams=hparams,
             run_opts=run_opts,
         )
+
+    elif hparams["int_method"] == "lmactd":
+        class_labels = list(label_encoder.ind2lab.values())
+        hparams["class_labels"] = class_labels
+        Interpreter = LMACTD(
+            modules=hparams["modules"],
+            opt_class=hparams["opt_class"],
+            hparams=hparams,
+            run_opts=run_opts,
+        )
 
     if hparams["single_sample"] is None:
         Interpreter.evaluate(
@@ -265,8 +300,7 @@ def __getitem__(
                 else {"batch_size": 2}
             ),
         )
-
-    else:
+    elif hparams["int_method"] != "lmactd" and hparams["single_sample"] is not None:   
         wav, sr = torchaudio.load(hparams["single_sample"])
         wav = T.Resample(sr, hparams["sample_rate"])(wav).to(run_opts["device"])
 
@@ -308,3 +342,16 @@ def plot_spec(X, suffix=""):
             xhat_tm,
             hparams["sample_rate"],
         )
+
+    elif hparams["int_method"] == "lmactd" and hparams["single_sample"] is not None:
+        wav, sr = torchaudio.load(hparams["single_sample"])
+        wav = T.Resample(sr, hparams["sample_rate"])(wav).to(run_opts["device"])
+
+        with torch.no_grad():
+            X_int, x_td = Interpreter.interpret_computation_steps(wav)
+
+        torchaudio.save(
+            ".".join(hparams["single_sample"].split(".")[:-1]) + "_lmacTD_int.wav",
+            x_td.cpu(),
+            hparams["sample_rate"],
+        )
diff --git a/recipes/ESC50/interpret/hparams/amt_focalnet.yaml b/recipes/ESC50/interpret/hparams/amt_focalnet.yaml
@@ -11,7 +11,7 @@
 
 # Seed needs to be set at top of yaml, before objects with parameters are made
 seed: 1234
-__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
+__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
 
 # Set up folders for reading from and writing to
 data_folder: !PLACEHOLDER  # e.g., /localscratch/ESC-50-master

diff --git a/recipes/ESC50/interpret/hparams/amt_vit.yaml b/recipes/ESC50/interpret/hparams/amt_vit.yaml
@@ -11,7 +11,7 @@
 
 # Seed needs to be set at top of yaml, before objects with parameters are made
 seed: 1234
-__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
+__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
 
 # Set up folders for reading from and writing to
 data_folder: !PLACEHOLDER  # e.g., /localscratch/ESC-50-master

diff --git a/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml b/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml
@@ -9,7 +9,7 @@
 
 # Seed needs to be set at top of yaml, before objects with parameters are made
 seed: 1234
-__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
+__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
 
 overlap_type: "mixtures"
 

diff --git a/recipes/ESC50/interpret/hparams/l2i_conv2d.yaml b/recipes/ESC50/interpret/hparams/l2i_conv2d.yaml
@@ -9,7 +9,7 @@
 
 # Seed needs to be set at top of yaml, before objects with parameters are made
 seed: 1234
-__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
+__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
 
 # Set up folders for reading from and writing to
 data_folder: !PLACEHOLDER  # e.g., /localscratch/ESC-50-master

diff --git a/recipes/ESC50/interpret/hparams/lmac_cnn14.yaml b/recipes/ESC50/interpret/hparams/lmac_cnn14.yaml
@@ -8,7 +8,7 @@
 
 # Seed needs to be set at top of yaml, before objects with parameters are made
 seed: 1234
-__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
+__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
 
 # Set up folders for reading from and writing to
 # Dataset must already exist at `audio_data_folder`