Source code for pykanto.signal.analysis

# ─── DESCRIPTION ──────────────────────────────────────────────────────────────

"""
Basic audio feature calculations (spectral centroids, peak frequencies, etc.)
"""

# ──── IMPORTS ─────────────────────────────────────────────────────────────────

from __future__ import annotations

import pickle
from typing import TYPE_CHECKING, List, Tuple

import librosa
import numpy as np
from pykanto.plot import show_minmax_frequency, show_spec_centroid_bandwidth
from pykanto.signal.filter import mels_to_hzs
from pykanto.signal.spectrogram import retrieve_spectrogram

if TYPE_CHECKING:
    from pykanto.dataset import KantoData

# ──── FUNCTIONS ───────────────────────────────────────────────────────────────


[docs]def get_peak_freqs( dataset: KantoData, spectrograms: np.ndarray, melscale: bool = True, threshold: float = 0.3, ) -> np.ndarray: """ Return the peak frequencies of each spectrogram in an array of spectrograms. Args: dataset (KantoData): Vocalisation dataset. spectrograms (np.ndarray): Array of np.ndarray spectrograms. melscale (bool, optional): Are the spectrograms in the mel scale? Defaults to True. threshold (float, optional): Threshold for peak detection. Defaults to 0.3. Returns: np.ndarray: Array with peak frequencies. """ minfreq = dataset.parameters.lowcut min_db = -dataset.parameters.top_dB if melscale: hz_freq = mels_to_hzs(dataset) result = np.array( [ hz_freq[np.argmax(np.max(w, axis=1))] if (max(np.max(w, axis=1)) > min_db * (1 - threshold)) else -1 for w in spectrograms ] ) return result else: return np.array( [minfreq + np.argmax(np.max(w, axis=1)) for w in spectrograms] )
# REVIEW did not test for melscale = False
[docs]def spec_centroid_bandwidth( dataset: KantoData, key: None | str = None, spec: None | np.ndarray = None, plot: bool = False, ) -> Tuple[np.ndarray, np.ndarray]: """ Calculate a vocalisation's spectral centroid and bandwidth from a mel spectrogram. You can either provide a key string for a vocalisation or its mel spectrogram directly. Args: dataset (KantoData): Dataset object with your data. key (None | str = None): Key of a vocalisation. Defaults to None. spec (spec: None | np.ndarray): Mel spectrogram. Defaults to None. plot (bool, optional): Whether to show the result. Defaults to False. Returns: Tuple[np.ndarray, np.ndarray]: A tuple with the centroids and bandwidths. """ if not key and not isinstance(spec, np.ndarray): raise KeyError("You need to provide either a key or a spectrogram") if not isinstance(spec, np.ndarray): spec = retrieve_spectrogram(dataset.files.at[key, "spectrogram"]) offset = 0 if np.min(spec) < 0: offset = abs(np.min(spec)) centroid = librosa.feature.spectral_centroid( S=spec + offset, freq=mels_to_hzs(dataset) )[0] spec_bw = librosa.feature.spectral_bandwidth( S=spec + offset, freq=mels_to_hzs(dataset) )[0] centroid[centroid <= dataset.parameters.lowcut] = np.nan spec_bw[spec_bw <= dataset.parameters.lowcut] = np.nan if plot: show_spec_centroid_bandwidth( dataset, centroid, spec_bw, key=key, spec=spec ) return centroid, spec_bw
[docs]def get_mean_sd_mfcc(S: np.ndarray, n_mfcc: int) -> np.ndarray: """ Extract the mean and SD of n Mel-frequency cepstral coefficients (MFCCs) calculated ffrom a log-power Mel spectrogram. Args: S (np.ndarray): A log-power Mel spectrogram. n_mfcc (int): Number of coefficients to return. Returns: np.ndarray: Array containing mean and std of each coefficient (len = n_mfcc*2). """ mfcc = librosa.feature.mfcc(S=S, n_mfcc=n_mfcc) mean_sd = np.hstack((np.mean(mfcc, axis=1), np.std(mfcc, axis=1))) return mean_sd
[docs]def approximate_minmax_frequency( dataset: KantoData, key: None | str = None, spec: None | np.ndarray = None, roll_percents: list[float] = [0.95, 0.1], plot: bool = False, ) -> Tuple[np.ndarray, np.ndarray]: """ Calculate approximate minimum and maximum frequencies from a mel spectrogram. You can either provide a key string for a vocalisation or its mel spectrogram directly. Args: dataset (KantoData): Dataset object with your data. key (None | str = None): Key of a vocalisation. Defaults to None. spec (spec: None | np.ndarray): Mel spectrogram. Defaults to None. roll_percents (list[float, float], optional): Percentage of energy contained in bin. Defaults to [0.95, 0.1]. plot (bool, optional): Whether to show the result. Defaults to False. Returns: Tuple[np.ndarray, np.ndarray]: A tuple with the approximate minimum and maximum frequencies, in this order. """ if not key and not isinstance(spec, np.ndarray): raise KeyError("You need to provide either a key or a spectrogram") if not isinstance(spec, np.ndarray): spec = retrieve_spectrogram(dataset.files.at[key, "spectrogram"]) offset = 0 if np.min(spec) < 0: offset = abs(np.min(spec)) maxfreqs, minfreqs = [ librosa.feature.spectral_rolloff( S=spec + offset, sr=dataset.parameters.sr, roll_percent=p, freq=mels_to_hzs(dataset), )[0] for p in roll_percents ] maxfreqs[maxfreqs <= dataset.parameters.lowcut] = np.nan minfreqs[minfreqs <= dataset.parameters.lowcut] = np.nan if plot: show_minmax_frequency( dataset, maxfreqs, minfreqs, roll_percents, key=key, spec=spec ) return minfreqs, maxfreqs