Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Debugger Python: train_lmactd.py",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": false,
"args": ["/speechbrain_fork/recipes/ESC50/interpret/hparams/lmactd_cnn14.yaml", "--data_folder=/data/ESC50 ", "--batch_size=1"]
},
{
"name": "Debugger Python: eval.py",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": false,
"args": [
"/speechbrain_fork/recipes/ESC50/interpret/hparams/lmactd_cnn14.yaml",
"--data_folder", "/data/ESC50",
"--overlap_type", "mixtures",
"--add_wham_noise", "False",
"--pretrained_interpreter", "/speechbrain_fork/recipes/ESC50/interpret/results/LMACTD_cnn14/44/save/CKPT+2025-03-26+16-19-31+00"
]
}
]
}
20 changes: 9 additions & 11 deletions recipes/ESC50/esc50_prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Creates data manifest files for ESC50
If the data does not exist in the specified --data_folder, we download the data automatically.

https://github.com/karolpiczak/ESC-50/
https://github.com/karoldvl/ESC-50/

Authors:
* Cem Subakan 2022, 2023
Expand All @@ -12,6 +12,7 @@
"""

import json
import logging
import os
import shutil

Expand All @@ -20,12 +21,11 @@

import speechbrain as sb
from speechbrain.dataio.dataio import load_data_csv, read_audio
from speechbrain.utils.fetching import LocalStrategy, fetch
from speechbrain.utils.logger import get_logger
from speechbrain.utils.fetching import fetch

logger = get_logger(__name__)
logger = logging.getLogger(__name__)

ESC50_DOWNLOAD_URL = "https://github.com/karolpiczak/ESC-50/archive/master.zip"
ESC50_DOWNLOAD_URL = "https://github.com/karoldvl/ESC-50/archive/master.zip"
MODIFIED_METADATA_FILE_NAME = "esc50_speechbrain.csv"

ACCEPTABLE_FOLD_NUMS = [1, 2, 3, 4, 5]
Expand All @@ -47,16 +47,14 @@ def download_esc50(data_path):
temp_path = os.path.join(data_path, "temp_download")

# download the data
archive_path = fetch(
fetch(
"master.zip",
"https://github.com/karolpiczak/ESC-50/archive/", # noqa ignore-url-check
"https://github.com/karoldvl/ESC-50/archive/",
savedir=temp_path,
# URL, so will be fetched directly in the savedir anyway
local_strategy=LocalStrategy.COPY_SKIP_CACHE,
)

# unpack the .zip file
shutil.unpack_archive(archive_path, data_path)
shutil.unpack_archive(os.path.join(temp_path, "master.zip"), data_path)

# move the files up to the datapath
files = os.listdir(os.path.join(data_path, "ESC-50-master"))
Expand Down Expand Up @@ -266,7 +264,7 @@ def create_json(metadata, audio_data_folder, folds_list, json_file):
if not os.path.exists(parent_dir):
os.mkdir(parent_dir)

with open(json_file, mode="w", encoding="utf-8") as json_f:
with open(json_file, mode="w") as json_f:
json.dump(json_dict, json_f, indent=2)

logger.info(f"{json_file} successfully created!")
Expand Down
59 changes: 56 additions & 3 deletions recipes/ESC50/interpret/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,12 @@ Some results that are obtained with this recipe on the OOD evaluation are as fol

|Method | AI | AD | AG |FF |Fid-In | SPS | COMP |
|--- |--- |--- |--- |--- | ---- | -- | --- |
|L-MAC | 61.62 | 3.83 | 33.48 | 0.40 | 0.82 | 0.93 | 9.77 |
|L-MAC FT | 58.87 | 4.89 | 30.84 | 0.40 | 0.82 | 0.82 | 10.65 |
|L2I | 6.75 |25.93 |1.25 |0.26 | 0.01 | 0.58 | 11.38 |
|L-MAC | 60.63 | 4.82 | 35.85 | 0.39 | 0.81 | 0.94 | 9.61 |
|L-MAC FT | 50.75 | 6.73 | 26.00 | 0.39 | 0.81 | 0.84 | 10.51 |
|L-MAC TD, α = 1.00 | 56.75 | 3.62 | 16.84 | 0.42 | 0.88 | 0.89 | 10.36 |
|L-MAC TD, α = 0.75 | 59.50 | 3.42 | 21.22 | 0.41 | 0.87 | 0.88 | 10.35 |
|L-MAC TD, α = 0.00 | 39.88 | 7.60 | 9.30 | 0.42 | 0.82 | 0.83 | 10.69 |
|L2I | 5.00 | 25.65 | 1.00 |0.20 | 0.35 | 0.52 | 10.99 |

Please, refer to the [L-MAC paper](https://arxiv.org/abs/2403.13086) for more information about the evaluation metrics.

Expand Down Expand Up @@ -76,6 +79,40 @@ where $g_w$ is the guidance weight for the interpreter.

The pretrained classifier to be interpreted is specified with the variables `embedding_model_path` and `classifier_model_path`. The default is a model we trained on ESC50; to interpret your own model, set these two variables to paths that point to it.

---------------------------------------------------------------------------------------------------------
### LMAC-TD: Producing Time Domain Explanations for Audio Classifiers

Following the approach adopted in LMAC, LMAC-TD trains an interpreter on the classifier's representations to reconstruct interpretations based on an amortized inference loss.

For more details, refer to our [LMAC-TD paper](https://arxiv.org/abs/2409.08655). You can also find samples on the [companion website](https://francescopaissan.it/lmac-td/).

To train LMAC-TD on a convolutional classifier using the ESC50 dataset, use the `train_lmactd.py` script. Run the following command:

```shell
python train_lmactd.py hparams/lmactd_cnn14.yaml --data_folder=/yourpath/ESC50
```

Optionally, you can use WHAM! noise augmentation to improve the quality of the interpretations:
```shell
python train_lmactd.py hparams/lmactd_cnn14.yaml \
--data_folder=/yourpath/ESC50 --add_wham_noise True \
--wham_folder=/yourpath/wham_noise
```
**Note**: The WHAM! noise dataset can be downloaded from [here](http://wham.whisper.ai/).

<!-- To run the finetuning stage of the interpreter, use
```shell
python train_sepformerlmac_classifierreps.py \
hparams/sepformerlmac_cnn14_classifierreps.yaml --data_folder=/yourpath/ESC50 \
--add_wham_noise True --wham_folder=/yourpath/wham_noise \
--finetuning True --pretrained_interpreter=/yourLMACcheckpointpath/psi_model.ckpt --g_w 4
```
where $g_w$ is the guidance weight for the interpreter. -->

#### Specifying the pretrained classifier

The pretrained classifier to be interpreted is specified with the variables `embedding_model_path` and `classifier_model_path`. The default is a model we trained on ESC50; to interpret your own model, set these two variables to paths that point to it.

---------------------------------------------------------------------------------------------------------

### Posthoc Interpretation via Quantization (PIQ)
Expand Down Expand Up @@ -160,6 +197,9 @@ python eval.py hparams/<config>.yaml --data_folder /yourpath/esc50 --overlap_typ

Note that `overlap_type` should be one of `mixtures` (for contaminating signal to be set as other signals from ESC50), `LJSpeech` (for contaminating signal to be set as speech), or `white_noise` (for contaminating signal to be set as white noise). Please refer to the L-MAC paper for the performance obtained in each setting. Note that `yourpath/psi_model.ckpt` should point to the path of the model checkpoint you would like to use. The typical path for `yourpath/psi_model.ckpt` would be similar to `results/LMAC_cnn14/1234/save/CKPT+2024-06-20+16-05-44+00/psi_model.ckpt`.

**N.B.** For the **LMAC-TD** case, `--pretrained_interpreter` should point to the checkpoint folder you would like to use, not to a `.ckpt` file itself. The evaluation script loads `masknet.ckpt`, `encoder.ckpt`, `decoder.ckpt`, and `convt_decoder.ckpt` from that folder. For example, the typical path would look like:
`results/LMACTD_cnn14/1234/save/CKPT+2024-06-20+16-05-44+00/`.

Note also that `add_wham_noise` should be set to `False`.

Another thing to note is that if you use `--overlap_type LJSpeech`, you would need to specify the path via the variable `ljspeech_path`. If the LJSpeech dataset is not already downloaded on the path you specify, the code will automatically download it, and use the downloaded data.
Expand Down Expand Up @@ -211,6 +251,19 @@ python eval.py hparams/<config>.yaml --data_folder /yourpath/esc50 --add_wham_no

## Citing

Please cite our [LMAC-TD paper](https://arxiv.org/abs/2409.08655) if you use it in your research:

```bibtex
@inproceedings{lmactd,
author={Eleonora Mancini and Francesco Paissan and Mirco Ravanelli and Cem Subakan},
booktitle={ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={LMAC-TD: Producing Time Domain Explanations for Audio Classifiers},
year={2025},
pages={1-5}
}

```

Please cite our [L-MAC paper](https://arxiv.org/abs/2403.13086) if you use it in your research:

```bibtex
Expand Down
97 changes: 72 additions & 25 deletions recipes/ESC50/interpret/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
Please refer to README.md for more details.

Authors
* Francesco Paissan 2024
* Cem Subakan 2024
* Eleonora Mancini 2025
* Francesco Paissan 2025
* Cem Subakan 2025
"""
import os
import random
Expand All @@ -22,6 +23,9 @@
from hyperpyyaml import load_hyperpyyaml
from train_l2i import L2I
from train_lmac import LMAC
from train_lmactd import LMACTD

# from train_sepformerlmac import SepformerLMAC
from wham_prepare import prepare_wham

import speechbrain as sb
Expand Down Expand Up @@ -58,23 +62,12 @@ def __init__(self, root, url, folder_in_archive, download, train=True):


class ESCContaminated(torch.utils.data.Dataset):
"""ESC50 Contaminated dataset

Arguments
---------
esc50_ds : dataset
the ESC50 dataset as per training.
cont_d : dataset
the contamination dataset.
overlap_multiplier : int
number of overlaps
overlap_type : str
one of "mixtures" or "LJSpeech" or "white_noise"
"""

def __init__(
self, esc50_ds, cont_d, overlap_multiplier=2, overlap_type="mixtures"
):
"""esc50_ds is the ESC50 dataset as per training.
cont_d is the contamination dataset.
overlap_multiplier works as before"""
super().__init__()

self.esc50_ds = esc50_ds
Expand Down Expand Up @@ -143,7 +136,7 @@ def __getitem__(
if __name__ == "__main__":
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

with open(hparams_file, encoding="utf-8") as fin:
with open(hparams_file) as fin:
hparams = load_hyperpyyaml(fin, overrides)

if hparams["add_wham_noise"]:
Expand Down Expand Up @@ -226,12 +219,44 @@ def __getitem__(
if hparams["add_wham_noise"]:
ood_dataset = datasets["valid"]

assert (
hparams["pretrained_interpreter"] is not None
), "You need to specify a path for the pretrained_interpreter!"
hparams["psi_model"].load_state_dict(
torch.load(hparams["pretrained_interpreter"], map_location="cpu")
)
if hparams["int_method"] != "lmactd":
assert (
hparams["pretrained_interpreter"] is not None
), "You need to specify a path for the pretrained_interpreter!"
hparams["psi_model"].load_state_dict(
torch.load(hparams["pretrained_interpreter"], map_location="cpu")
)

else:
assert (
hparams["pretrained_interpreter"] is not None
), "You need to specify a path for the pretrained_interpreter!"

# Load each component separately
hparams["MaskNet"].load_state_dict(
torch.load(
f"{hparams['pretrained_interpreter']}/masknet.ckpt",
torch.device("cpu"),
)
)
hparams["Encoder"].load_state_dict(
torch.load(
f"{hparams['pretrained_interpreter']}/encoder.ckpt",
torch.device("cpu"),
)
)
hparams["Decoder"].load_state_dict(
torch.load(
f"{hparams['pretrained_interpreter']}/decoder.ckpt",
torch.device("cpu"),
)
)
hparams["convt_decoder"].load_state_dict(
torch.load(
f"{hparams['pretrained_interpreter']}/convt_decoder.ckpt",
torch.device("cpu"),
)
)

if hparams["int_method"] == "lmac":
Interpreter = LMAC(
Expand All @@ -253,6 +278,16 @@ def __getitem__(
hparams=hparams,
run_opts=run_opts,
)

elif hparams["int_method"] == "lmactd":
class_labels = list(label_encoder.ind2lab.values())
hparams["class_labels"] = class_labels
Interpreter = LMACTD(
modules=hparams["modules"],
opt_class=hparams["opt_class"],
hparams=hparams,
run_opts=run_opts,
)

if hparams["single_sample"] is None:
Interpreter.evaluate(
Expand All @@ -265,8 +300,7 @@ def __getitem__(
else {"batch_size": 2}
),
)

else:
elif hparams["int_method"] != "lmactd" and hparams["single_sample"] is not None:
wav, sr = torchaudio.load(hparams["single_sample"])
wav = T.Resample(sr, hparams["sample_rate"])(wav).to(run_opts["device"])

Expand Down Expand Up @@ -308,3 +342,16 @@ def plot_spec(X, suffix=""):
xhat_tm,
hparams["sample_rate"],
)

elif hparams["int_method"] == "lmactd" and hparams["single_sample"] is not None:
wav, sr = torchaudio.load(hparams["single_sample"])
wav = T.Resample(sr, hparams["sample_rate"])(wav).to(run_opts["device"])

with torch.no_grad():
X_int, x_td = Interpreter.interpret_computation_steps(wav)

torchaudio.save(
".".join(hparams["single_sample"].split(".")[:-1]) + "_lmacTD_int.wav",
x_td.cpu(),
hparams["sample_rate"],
)
2 changes: 1 addition & 1 deletion recipes/ESC50/interpret/hparams/amt_focalnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]

# Set up folders for reading from and writing to
data_folder: !PLACEHOLDER # e.g., /localscratch/ESC-50-master
Expand Down
2 changes: 1 addition & 1 deletion recipes/ESC50/interpret/hparams/amt_vit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]

# Set up folders for reading from and writing to
data_folder: !PLACEHOLDER # e.g., /localscratch/ESC-50-master
Expand Down
2 changes: 1 addition & 1 deletion recipes/ESC50/interpret/hparams/l2i_cnn14.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]

overlap_type: "mixtures"

Expand Down
2 changes: 1 addition & 1 deletion recipes/ESC50/interpret/hparams/l2i_conv2d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]

# Set up folders for reading from and writing to
data_folder: !PLACEHOLDER # e.g., /localscratch/ESC-50-master
Expand Down
2 changes: 1 addition & 1 deletion recipes/ESC50/interpret/hparams/lmac_cnn14.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]

# Set up folders for reading from and writing to
# Dataset must already exist at `audio_data_folder`
Expand Down
Loading