Compute total duration of an audio dataset
from librosa import get_duration, load
from pathlib import Path
def total_duration(audio_dir: Path | str, extension: str = "wav") -> tuple[float, float]:
    """Return the combined playing time of every matching audio file in a directory tree.

    ``audio_dir`` is searched recursively for files named ``*.<extension>``.
    Files are decoded one at a time, so at most one waveform is held in
    memory at any moment.

    Args:
        audio_dir: Root directory of the audio dataset.
        extension: File extension to match, with or without a leading dot.

    Returns:
        ``divmod(total_seconds, 60)``: the whole minutes and the remaining
        seconds, both as floats. ``(0.0, 0.0)`` when no file matches.
    """
    # Accept both "wav" and ".wav" so the glob never becomes "*..wav".
    pattern = f"*.{extension.lstrip('.')}"

    total_seconds = 0.0
    for audio_path in Path(audio_dir).rglob(pattern):
        # sr=None keeps the file's native sampling rate: the duration is the
        # same, but the needless resampling pass of the default is skipped.
        samples, native_sr = load(audio_path, sr=None)
        total_seconds += get_duration(y=samples, sr=native_sr)

    return divmod(total_seconds, 60)
# Example: measure one dataset directory and report its length.
dataset_dir = "/home/ubuntu/nvidia/ttron/datasets/cus-it-it-f-chiara-s-001-22kHz-ttron"
m, s = total_duration(dataset_dir)
print(f"Audio dataset duration: {m:.0f} minutes, {s:.2f} seconds.")
# Example output:
# Audio dataset duration: 103 minutes, 56.48 seconds.
Formulae for spectrogram-to-audio length conversion and vice versa
Audio length to number of spectrogram frames
To compute the number of mel-spectrogram frames for a given audio clip, you can use the following formula:
Number of frames = 1 + floor((audio_length_in_seconds * sampling_rate - window_size) / hop_length)
Where:
- audio_length_in_seconds: Length of the audio clip in seconds
- sampling_rate: Sampling rate of the audio clip in Hz
- window_size: Window size used for the Short-Time Fourier Transform (STFT) in samples
- hop_length: Hop length used for the STFT in samples
- floor(): The floor function, which rounds a number down to the nearest integer
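This formula calculates the total number of frames by dividing the total number of samples in the audio clip (minus the window size) by the hop length and adding 1. The floor function ensures that the result is an integer value. It assumes the STFT does not pad the signal (for example, `center=False` in librosa); with centered (padded) framing the count is 1 + floor(audio_length_in_seconds * sampling_rate / hop_length) instead. For a clip shorter than one window the formula yields zero or a negative number, which should be read as "no complete frame".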
Number of spectrogram frames to audio length
If you want to compute the length of an audio clip in seconds from the number of mel-spectrogram frames, you can use the following formula:
audio_length_in_seconds = (number_of_frames - 1) * hop_length / sampling_rate
Where:
- number_of_frames: The number of mel-spectrogram frames
- hop_length: Hop length used for the Short-Time Fourier Transform (STFT) in samples
- sampling_rate: Sampling rate of the audio clip in Hz
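This formula calculates the audio length in seconds by multiplying the number of frames minus 1 by the hop length, and then dividing by the sampling rate. Note that this is the start time of the last frame: because it omits the window size, it is not the exact inverse of the frame-count formula and underestimates the clip length by at least window_size / sampling_rate seconds (and by less than (window_size + hop_length) / sampling_rate seconds).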
Python Code
def compute_num_frames(audio_length_in_seconds: float, sampling_rate: int, window_size: int, hop_length: int) -> int:
    """Return how many complete STFT frames fit in an audio clip.

    Implements ``1 + floor((num_samples - window_size) / hop_length)``, where
    ``num_samples = audio_length_in_seconds * sampling_rate``. No padding of
    the signal is assumed.

    Args:
        audio_length_in_seconds: Length of the audio clip in seconds.
        sampling_rate: Sampling rate of the clip in Hz.
        window_size: STFT window size in samples.
        hop_length: STFT hop length in samples.

    Returns:
        The frame count as an ``int``; ``0`` when the clip is shorter than
        one window.

    Raises:
        ZeroDivisionError: If ``hop_length`` is zero.
    """
    num_samples = audio_length_in_seconds * sampling_rate
    # Floor division on a float operand yields a float, so convert explicitly
    # to honour the declared int return type.
    full_hops = int((num_samples - window_size) // hop_length)
    # A clip shorter than the window would otherwise give a negative count.
    return max(0, 1 + full_hops)
def compute_audio_length(number_of_frames: int, sampling_rate: int, hop_length: int, window_size: int = 0) -> float:
    """Return the audio length in seconds covered by a number of STFT frames.

    Implements ``((number_of_frames - 1) * hop_length + window_size) / sampling_rate``.

    Args:
        number_of_frames: Number of spectrogram frames.
        sampling_rate: Sampling rate of the audio in Hz.
        hop_length: STFT hop length in samples.
        window_size: STFT window size in samples. With the default of ``0``
            the result is the start time of the last frame. Passing the
            real window size gives the shortest clip that produces
            ``number_of_frames`` frames when the signal is not padded.

    Returns:
        The audio length in seconds.

    Raises:
        ZeroDivisionError: If ``sampling_rate`` is zero.
    """
    # Samples from the start of the first frame to the end of the last one
    # (or to the start of the last one when window_size is left at 0).
    covered_samples = (number_of_frames - 1) * hop_length + window_size
    return covered_samples / sampling_rate
# Worked example: a 15-second clip at 22.05 kHz, analysed with a 1024-sample
# window and a 256-sample hop.
audio_length_in_seconds = 15
sampling_rate = 22050
window_size = 1024
hop_length = 256

# Forward direction: clip length -> frame count.
number_of_frames = compute_num_frames(
    audio_length_in_seconds=audio_length_in_seconds,
    sampling_rate=sampling_rate,
    window_size=window_size,
    hop_length=hop_length,
)
print(f"Number of mel-spectrogram frames: {number_of_frames}")

# Reverse direction: frame count -> clip length.
computed_audio_length = compute_audio_length(
    number_of_frames=number_of_frames,
    sampling_rate=sampling_rate,
    hop_length=hop_length,
)
print(f"Computed audio length in seconds: {computed_audio_length}")