add voxceleb1 / voxceleb2 datasets

Wadaboa · Wadaboa · commit 6cbff1748b94 · 2022-10-30T23:04:17.000Z
diff --git a/init/colab_requirements.txt b/init/colab_requirements.txt
@@ -1,7 +1,8 @@
-torchaudio==0.9.0
+torchaudio==0.13.0
 pyyaml==5.4.1
 scikit-learn==1.0
 wandb==0.12.4
 rich==10.12.0
 umap-learn==0.5.1
 librosa==0.8.1
+joblib==1.1.0
diff --git a/init/requirements.txt b/init/requirements.txt
@@ -1,7 +1,7 @@
 black==21.9b0
 tqdm==4.62.3
-torch==1.9.1
-torchaudio==0.9.1
+torch==1.13.0
+torchaudio==0.13.0
 matplotlib==3.4.3
 pandas==1.3.3
 pyyaml==5.4.1
@@ -12,3 +12,4 @@ requests==2.26.0
 rich==10.12.0
 umap-learn==0.5.1
 librosa==0.8.1
+joblib==1.1.0
diff --git a/src/datasets.py b/src/datasets.py
@@ -1,13 +1,23 @@
 import os
 import itertools
+import csv
+import shutil
+import logging
+from pathlib import Path
 from collections import defaultdict
 from functools import partial
 
 import torch
 import torchaudio
 import librosa
 import numpy as np
+import soundfile as sf
+from joblib import Parallel, delayed
+from librosa.core.audio import __audioread_load as audioread_load
 from tqdm import tqdm
+from torch.hub import download_url_to_file
+from torch.utils.data import Dataset
+from torchaudio.datasets.utils import extract_archive
 
 import utils
 
@@ -352,3 +362,235 @@ def get_path(self, idx):
             speaker_id,
             f"{speaker_id}_{utterance_id}_{self._mic_id}{self._audio_ext}",
         )
+
+
+class VoxCeleb1Dataset(SpeakerDataset, torchaudio.datasets.VoxCeleb1Identification):
+    """
+    VoxCeleb1 identification dataset exposed through the
+    `SpeakerDataset` interface for speaker-related tasks
+    """
+
+    def __init__(self, root, transforms=None, *args, **kwargs):
+        """
+        Initialize both parent classes: extra positional and keyword
+        arguments are forwarded to torchaudio's `VoxCeleb1Identification`,
+        while `transforms` is handed to `SpeakerDataset`.
+
+        When `root` does not exist yet, it is created and the
+        `download` keyword argument is forced to `True`.
+        """
+        root_is_missing = not os.path.exists(root)
+        if root_is_missing:
+            os.makedirs(root, exist_ok=True)
+            kwargs["download"] = True
+        torchaudio_base = torchaudio.datasets.VoxCeleb1Identification
+        torchaudio_base.__init__(self, root, *args, **kwargs)
+        SpeakerDataset.__init__(self, transforms=transforms)
+
+    def get_speakers_utterances(self):
+        """
+        Map each speaker id to the list of indices (into `self._flist`)
+        of its utterances. The speaker id is the third-to-last
+        "/"-separated component of each file path.
+        """
+        utterances_by_speaker = defaultdict(list)
+        for index, path in enumerate(self._flist):
+            speaker, _video, _utterance = path.split("/")[-3:]
+            utterances_by_speaker[speaker].append(index)
+        return utterances_by_speaker
+
+    def get_sample(self, idx):
+        """
+        Return the `(waveform, sample_rate, speaker)` triple of the
+        sample at the given index, discarding the fourth field
+        returned by torchaudio's `__getitem__`
+        """
+        sample = torchaudio.datasets.VoxCeleb1Identification.__getitem__(self, idx)
+        waveform, sample_rate, speaker, _discarded = sample
+        return waveform, sample_rate, speaker
+
+    def get_path(self, idx):
+        """
+        Return the file list entry of the sample at the given index
+        """
+        path = self._flist[idx]
+        return path
+
+
+class VoxCeleb2(Dataset):
+    """
+    VoxCeleb2 dataset following torchaudio's implementation of VoxCeleb1.
+
+    Samples are `(waveform, sample_rate, speaker_id, file_id)` tuples,
+    where `speaker_id` is the integer part of the `idXXXXX` folder name
+    and `file_id` is `<speaker>-<youtube video>-<utterance>`.
+
+    References:
+    - https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html
+    - https://pytorch.org/audio/stable/_modules/torchaudio/datasets/voxceleb1.html
+    """
+
+    SAMPLE_RATE = 16000
+    # Credentials from https://github.com/UoA-CARES-Student/VoxCeleb2-Dataset
+    # NOTE(review): these credentials are committed in plain text; consider
+    # loading them from the environment or a config file instead.
+    _USERNAME = "voxceleb1912"
+    _PASSWORD = "0s42xuw6"
+    _ARCHIVE_CONFIGS = {
+        "dev": {
+            "archive_name": "vox2_dev_aac.zip",
+            "urls": [
+                "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partaa",
+                "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partab",
+                "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partac",
+                "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partad",
+                "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partae",
+                "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partaf",
+                "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partag",
+                "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_dev_aac_partah",
+            ],
+            "checksums": [None, None, None, None, None, None, None, None],
+        },
+        "test": {
+            "archive_name": "vox2_test_aac.zip",
+            "url": "http://cnode01.mm.kaist.ac.kr/voxceleb/vox1a/vox2_test_aac.zip",
+            "checksum": "e4d9200107a7bc60f0b620d5dc04c3aab66681b649f9c218380ac43c6c722079",
+        },
+    }
+    _IDEN_SPLIT_URL = "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/vox2_meta.csv"
+    _ext_audio = ".wav"
+
+    def __init__(self, root, subset="dev", meta_url=_IDEN_SPLIT_URL, download=False):
+        """
+        Build the file list of the requested subset, downloading and
+        converting the audio archives first when needed.
+
+        Args:
+            root: directory holding (or that will hold) the `wav` folder
+                and the metadata CSV
+            subset: either "dev" or "test"
+            meta_url: URL of the metadata CSV; it is only fetched when a
+                file with the same base name is not already in `root`
+            download: whether to download the dataset when `<root>/wav`
+                does not exist
+
+        Raises:
+            ValueError: if `subset` is not "dev" or "test"
+            RuntimeError: if `<root>/wav` is missing and `download` is False
+        """
+        if subset not in ["dev", "test"]:
+            raise ValueError("`subset` must be one of ['dev', 'test']")
+        root = os.fspath(root)
+        self._path = os.path.join(root, "wav")
+        if not os.path.isdir(self._path):
+            if not download:
+                raise RuntimeError(
+                    f"Dataset not found at {self._path}. Please set `download=True` to download the dataset."
+                )
+            self._download_extract_wavs(root)
+
+        # Download the `vox2_meta.csv` file to get the dev and test lists
+        meta_list_path = os.path.join(root, os.path.basename(meta_url))
+        if not os.path.exists(meta_list_path):
+            download_url_to_file(meta_url, meta_list_path)
+        self._flist = self._get_flist(root, meta_list_path, subset)
+
+    def _convert_to_wav(self, root, paths):
+        """
+        Convert .m4a files in the given paths (relative to `root`) to .wav
+        """
+
+        def _to_wav(path):
+            """
+            Decode one .m4a file and write it next to the original as .wav.
+            The source file is removed whether or not the conversion worked.
+            """
+            try:
+                waveform, _ = audioread_load(
+                    path, offset=0.0, duration=None, dtype=np.float32
+                )
+                path_wav = os.path.splitext(path)[0] + ".wav"
+                sf.write(path_wav, waveform, self.SAMPLE_RATE)
+            except Exception:
+                # Best effort: a broken file must not abort the whole
+                # conversion, but interrupts are no longer swallowed
+                logging.warning(f"Could not convert file {path} to .wav.")
+            os.remove(path)
+
+        Parallel(n_jobs=-1, backend="threading")(
+            delayed(_to_wav)(os.path.join(root, p))
+            for p in tqdm(paths, desc="Converting audios to .wav")
+            if p.endswith(".m4a")
+        )
+
+    def _download_extract_wavs(self, root):
+        """
+        Download dataset splits, extract zipped archives
+        and convert .m4a files to .wav
+        """
+        os.makedirs(root, exist_ok=True)
+        for split, split_config in self._ARCHIVE_CONFIGS.items():
+            split_name = split_config["archive_name"]
+            split_path = os.path.join(root, split_name)
+            # The zip file of dev data is split into 8 chunks.
+            # Download and combine them into one file before extraction.
+            if split == "dev":
+                urls = split_config["urls"]
+                checksums = split_config["checksums"]
+                with open(split_path, "wb") as f:
+                    for url, checksum in zip(urls, checksums):
+                        file_path = os.path.join(root, os.path.basename(url))
+                        utils.download_auth_url_to_file(
+                            url,
+                            file_path,
+                            self._USERNAME,
+                            self._PASSWORD,
+                            hash_prefix=checksum,
+                        )
+                        with open(file_path, "rb") as f_split:
+                            # Stream the chunk instead of loading
+                            # several gigabytes into memory at once
+                            shutil.copyfileobj(f_split, f)
+            elif split == "test":
+                # The test archive is a single file whose base name
+                # matches `archive_name`, so it lands on `split_path`
+                url = split_config["url"]
+                checksum = split_config["checksum"]
+                file_path = os.path.join(root, os.path.basename(url))
+                utils.download_auth_url_to_file(
+                    url, file_path, self._USERNAME, self._PASSWORD, hash_prefix=checksum
+                )
+            extracted_paths = extract_archive(split_path)
+            self._convert_to_wav(root, extracted_paths)
+        shutil.move(os.path.join(root, "aac"), os.path.join(root, "wav"))
+
+    def _get_flist(self, root, meta_list_path, subset):
+        """
+        Load the full (sorted) list of .wav files in the given split.
+
+        The metadata CSV is read once to collect the speaker ids (first
+        column) whose set (last column) equals `subset`; then `root` is
+        walked a single time, keeping every .wav file that lives below
+        a directory named after one of those speakers.
+        """
+        speaker_ids = set()
+        with open(meta_list_path, "r") as f:
+            for row in csv.reader(f, delimiter=","):
+                if not row:
+                    # Skip blank lines instead of failing on `row[0]`
+                    continue
+                speaker_id, split = row[0].strip(), row[-1].strip()
+                if split == subset:
+                    speaker_ids.add(speaker_id)
+        f_list = [
+            str(wav_path)
+            for wav_path in Path(root).rglob("*.wav")
+            # Only directories below `root` count as speaker folders
+            if not speaker_ids.isdisjoint(wav_path.relative_to(root).parts[:-1])
+        ]
+        return sorted(f_list)
+
+    def _get_file_id(self, file_path, _ext_audio):
+        """
+        Return the file identifier as a combination of speaker id,
+        youtube video id and utterance id
+        """
+        speaker_id, youtube_id, utterance_id = file_path.split("/")[-3:]
+        utterance_id = utterance_id.replace(_ext_audio, "")
+        file_id = "-".join([speaker_id, youtube_id, utterance_id])
+        return file_id
+
+    def get_metadata(self, n):
+        """
+        Get metadata for the n-th sample from the dataset.
+        Returns filepath instead of waveform, but otherwise
+        returns the same fields as `__getitem__`.
+        """
+        file_path = self._flist[n]
+        file_id = self._get_file_id(file_path, self._ext_audio)
+        speaker_id = file_id.split("-")[0]
+        # Drop the leading "id" of the speaker folder name
+        speaker_id = int(speaker_id[3:]) if False else int(speaker_id[3:])
+        return file_path, self.SAMPLE_RATE, speaker_id, file_id
+
+    def __getitem__(self, n):
+        """
+        Load the n-th sample from the dataset.
+
+        Raises:
+            ValueError: if the file's sample rate differs from `SAMPLE_RATE`
+        """
+        metadata = self.get_metadata(n)
+        # Only the path is passed: the second positional parameter of
+        # `torchaudio.load` is `frame_offset`, so also passing the sample
+        # rate would skip the first second of every utterance
+        waveform, sample_rate = torchaudio.load(metadata[0])
+        if sample_rate != self.SAMPLE_RATE:
+            raise ValueError(
+                f"sample rate should be {self.SAMPLE_RATE}, but got {sample_rate}"
+            )
+        return (waveform,) + metadata[1:]
+
+    def __len__(self):
+        """
+        Return the number of files in the selected subset
+        """
+        return len(self._flist)
+
+
+class VoxCeleb2Dataset(SpeakerDataset, VoxCeleb2):
+    """
+    VoxCeleb2 dataset exposed through the `SpeakerDataset`
+    interface for speaker-related tasks
+    """
+
+    def __init__(self, root, transforms=None, *args, **kwargs):
+        """
+        Initialize both parent classes: extra positional and keyword
+        arguments are forwarded to `VoxCeleb2`, while `transforms`
+        is handed to `SpeakerDataset`.
+
+        When `root` does not exist yet, it is created and the
+        `download` keyword argument is forced to `True`.
+        """
+        root_is_missing = not os.path.exists(root)
+        if root_is_missing:
+            os.makedirs(root, exist_ok=True)
+            kwargs["download"] = True
+        VoxCeleb2.__init__(self, root, *args, **kwargs)
+        SpeakerDataset.__init__(self, transforms=transforms)
+
+    def get_speakers_utterances(self):
+        """
+        Map each speaker id to the list of indices (into `self._flist`)
+        of its utterances. The speaker id is the third-to-last
+        "/"-separated component of each file path.
+        """
+        utterances_by_speaker = defaultdict(list)
+        for index, path in enumerate(self._flist):
+            speaker, _video, _utterance = path.split("/")[-3:]
+            utterances_by_speaker[speaker].append(index)
+        return utterances_by_speaker
+
+    def get_sample(self, idx):
+        """
+        Return the `(waveform, sample_rate, speaker)` triple of the
+        sample at the given index, discarding the fourth field
+        returned by `VoxCeleb2.__getitem__`
+        """
+        sample = VoxCeleb2.__getitem__(self, idx)
+        waveform, sample_rate, speaker, _discarded = sample
+        return waveform, sample_rate, speaker
+
+    def get_path(self, idx):
+        """
+        Return the file list entry of the sample at the given index
+        """
+        path = self._flist[idx]
+        return path
diff --git a/src/utils.py b/src/utils.py
@@ -2,6 +2,8 @@
 import datetime
 import os
 import string
+import hashlib
+import requests
 
 import torch
 import numpy as np
@@ -10,6 +12,7 @@
 import IPython.display as ipd
 import wandb
 import umap
+from tqdm import tqdm
 from sklearn.manifold import TSNE
 from sklearn.decomposition import TruncatedSVD
 from scipy.spatial import ConvexHull
@@ -463,3 +466,40 @@ def chart_dependencies(model, n_mels=80, device="cpu"):
     ).all() and (
         inputs.grad[random_index] != 0
     ).any(), f"Only index {random_index} should have non-zero gradients"
+
+
+def download_auth_url_to_file(
+    url, file_path, username, password, hash_prefix=None, progress=True
+):
+    """
+    Download the file at the given URL using the given credentials
+    (HTTP basic auth), and finally double check the checksum of the
+    downloaded file.
+
+    Args:
+        url: address of the file to download
+        file_path: where to store the downloaded file
+        username: user name for authentication
+        password: password for authentication
+        hash_prefix: expected leading characters of the SHA256 hex
+            digest of the file, or None to skip the check
+        progress: whether to show a progress bar
+
+    Returns:
+        True when the download (and the optional check) succeeded
+
+    Raises:
+        RuntimeError: if the response status is not 200 or if the
+            digest does not start with `hash_prefix`
+    """
+    sha256 = hashlib.sha256() if hash_prefix is not None else None
+    # The context manager releases the streamed connection on every path
+    with requests.get(url, auth=(username, password), stream=True) as response:
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Couldn't download from url {url}, got response status code {response.status_code}"
+            )
+        file_size = int(response.headers.get("content-length", 0))
+        with open(file_path, "wb") as out, tqdm(
+            total=file_size,
+            disable=not progress,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as pbar:
+            # `iter_content` defaults to 1-byte chunks, which is far too
+            # slow for multi-gigabyte archives: read 1 MiB at a time
+            for buffer in response.iter_content(chunk_size=1024 * 1024):
+                out.write(buffer)
+                if sha256 is not None:
+                    sha256.update(buffer)
+                pbar.update(len(buffer))
+    if sha256 is not None:
+        digest = sha256.hexdigest()
+        if digest[: len(hash_prefix)] != hash_prefix:
+            raise RuntimeError(
+                f'invalid hash value (expected "{hash_prefix}", got "{digest}")'
+            )
+    return True