# Source code for pykanto.signal.analysis

# ─── DESCRIPTION ──────────────────────────────────────────────────────────────

"""
Basic audio feature calculations (spectral centroids, peak frequencies, etc.)
"""

# ──── IMPORTS ─────────────────────────────────────────────────────────────────

from __future__ import annotations

import pickle
from typing import TYPE_CHECKING, List, Tuple

import librosa
import numpy as np
from pykanto.plot import show_minmax_frequency, show_spec_centroid_bandwidth
from pykanto.signal.filter import mels_to_hzs
from pykanto.signal.spectrogram import retrieve_spectrogram

if TYPE_CHECKING:
    from pykanto.dataset import KantoData

# ──── FUNCTIONS ───────────────────────────────────────────────────────────────


def get_peak_freqs(
    dataset: KantoData,
    spectrograms: np.ndarray,
    melscale: bool = True,
    threshold: float = 0.3,
) -> np.ndarray:
    """
    Return the peak frequency of each spectrogram in an array of spectrograms.

    For every spectrogram the peak is the frequency bin (row) that holds the
    largest value found along axis 1.

    Args:
        dataset (KantoData): Vocalisation dataset. ``parameters.top_dB`` is
            read when ``melscale`` is True, ``parameters.lowcut`` otherwise.
        spectrograms (np.ndarray): Array of np.ndarray spectrograms, each
            indexed as [frequency bin, frame].
        melscale (bool, optional): Are the spectrograms in the mel scale?
            Defaults to True.
        threshold (float, optional): Threshold for peak detection. A peak is
            only reported if the spectrogram maximum is greater than
            ``-top_dB * (1 - threshold)``. Only used when ``melscale`` is
            True. Defaults to 0.3.

    Returns:
        np.ndarray: Array with one peak frequency per spectrogram. In the mel
        branch, spectrograms whose maximum does not exceed the threshold get
        -1 instead of a frequency.
    """

    if melscale:
        hz_freq = mels_to_hzs(dataset)
        # Loop-invariant: smallest value that still counts as a peak.
        min_peak_db = -dataset.parameters.top_dB * (1 - threshold)

        peaks = []
        for spec in spectrograms:
            # Largest value reached by each frequency bin; computed once and
            # reused for both locating and thresholding the peak.
            bin_maxima = np.max(spec, axis=1)
            peak_bin = np.argmax(bin_maxima)
            if bin_maxima[peak_bin] > min_peak_db:
                peaks.append(hz_freq[peak_bin])
            else:
                peaks.append(-1)
        return np.array(peaks)

    # REVIEW did not test for melscale = False
    # NOTE(review): this branch adds the peak bin index directly to `lowcut`
    # and does not apply `threshold` — confirm this is the intended behaviour
    # for linear-scale spectrograms.
    minfreq = dataset.parameters.lowcut
    return np.array(
        [minfreq + np.argmax(np.max(spec, axis=1)) for spec in spectrograms]
    )


def spec_centroid_bandwidth(
    dataset: KantoData,
    key: None | str = None,
    spec: None | np.ndarray = None,
    plot: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculate a vocalisation's spectral centroid and bandwidth from a mel
    spectrogram. You can either provide a key string for a vocalisation or its
    mel spectrogram directly; if both are given, ``spec`` is used.

    Args:
        dataset (KantoData): Dataset object with your data.
        key (None | str = None): Key of a vocalisation. Defaults to None.
        spec (spec: None | np.ndarray): Mel spectrogram. Defaults to None.
        plot (bool, optional): Whether to show the result. Defaults to False.

    Raises:
        KeyError: If neither a key nor a spectrogram array is provided.

    Returns:
        Tuple[np.ndarray, np.ndarray]: A tuple with the centroids
            and bandwidths. Values at or below ``dataset.parameters.lowcut``
            are replaced with NaN.
    """

    if not key and not isinstance(spec, np.ndarray):
        raise KeyError("You need to provide either a key or a spectrogram")
    if not isinstance(spec, np.ndarray):
        spec = retrieve_spectrogram(dataset.files.at[key, "spectrogram"])

    # Shift the spectrogram so that its minimum is zero: librosa's spectral
    # features expect non-negative input.
    offset = 0
    if np.min(spec) < 0:
        offset = abs(np.min(spec))
    shifted_spec = spec + offset

    # Both features share the same inputs, so build them only once.
    hz_freqs = mels_to_hzs(dataset)
    lowcut = dataset.parameters.lowcut

    centroid = librosa.feature.spectral_centroid(S=shifted_spec, freq=hz_freqs)[
        0
    ]
    spec_bw = librosa.feature.spectral_bandwidth(
        S=shifted_spec, freq=hz_freqs
    )[0]

    centroid[centroid <= lowcut] = np.nan
    # NOTE(review): the bandwidth is also masked against the `lowcut`
    # frequency — confirm this is intended rather than masking wherever the
    # centroid is NaN.
    spec_bw[spec_bw <= lowcut] = np.nan

    if plot:
        show_spec_centroid_bandwidth(
            dataset, centroid, spec_bw, key=key, spec=spec
        )

    return centroid, spec_bw


def get_mean_sd_mfcc(S: np.ndarray, n_mfcc: int) -> np.ndarray:
    """
    Extract the mean and SD of n Mel-frequency cepstral coefficients (MFCCs)
    calculated from a log-power Mel spectrogram.

    Args:
        S (np.ndarray): A log-power Mel spectrogram.
        n_mfcc (int): Number of coefficients to return.

    Returns:
        np.ndarray: Array containing the means of all coefficients followed
        by their standard deviations (len = n_mfcc*2).
    """
    mfcc = librosa.feature.mfcc(S=S, n_mfcc=n_mfcc)
    # Summarise each coefficient along axis 1, then lay the two summaries
    # out side by side: all means first, all SDs second.
    coef_means = np.mean(mfcc, axis=1)
    coef_sds = np.std(mfcc, axis=1)
    return np.hstack((coef_means, coef_sds))


def approximate_minmax_frequency(
    dataset: KantoData,
    key: None | str = None,
    spec: None | np.ndarray = None,
    roll_percents: list[float] | None = None,
    plot: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculate approximate minimum and maximum frequencies from a mel
    spectrogram. You can either provide a key string for a vocalisation or its
    mel spectrogram directly; if both are given, ``spec`` is used.

    Args:
        dataset (KantoData): Dataset object with your data.
        key (None | str = None): Key of a vocalisation. Defaults to None.
        spec (spec: None | np.ndarray): Mel spectrogram. Defaults to None.
        roll_percents (list[float, float], optional): Percentage of energy
            contained in bin, as exactly two roll-off values: the first is
            used for the maximum frequency and the second for the minimum
            frequency. Defaults to [0.95, 0.1].
        plot (bool, optional): Whether to show the result. Defaults to False.

    Raises:
        KeyError: If neither a key nor a spectrogram array is provided.
        ValueError: If ``roll_percents`` does not contain exactly two values.

    Returns:
        Tuple[np.ndarray, np.ndarray]: A tuple with the approximate minimum and
            maximum frequencies, in this order. Values at or below
            ``dataset.parameters.lowcut`` are replaced with NaN.
    """

    if not key and not isinstance(spec, np.ndarray):
        raise KeyError("You need to provide either a key or a spectrogram")
    if not isinstance(spec, np.ndarray):
        spec = retrieve_spectrogram(dataset.files.at[key, "spectrogram"])

    # Build the default per call: a list literal in the signature would be a
    # single object shared by every call (and by the plotting helper below).
    if roll_percents is None:
        roll_percents = [0.95, 0.1]
    # Fail early with a clear message instead of an unpacking error later.
    if len(roll_percents) != 2:
        raise ValueError(
            "roll_percents must contain exactly two values "
            f"(maximum, minimum); got {len(roll_percents)}"
        )

    # Shift the spectrogram so that its minimum is zero: librosa's spectral
    # features expect non-negative input.
    offset = 0
    if np.min(spec) < 0:
        offset = abs(np.min(spec))
    shifted_spec = spec + offset

    # Shared by both roll-off computations, so computed only once.
    hz_freqs = mels_to_hzs(dataset)
    lowcut = dataset.parameters.lowcut

    maxfreqs, minfreqs = [
        librosa.feature.spectral_rolloff(
            S=shifted_spec,
            sr=dataset.parameters.sr,
            roll_percent=p,
            freq=hz_freqs,
        )[0]
        for p in roll_percents
    ]

    maxfreqs[maxfreqs <= lowcut] = np.nan
    minfreqs[minfreqs <= lowcut] = np.nan

    if plot:
        show_minmax_frequency(
            dataset, maxfreqs, minfreqs, roll_percents, key=key, spec=spec
        )

    return minfreqs, maxfreqs