Multimedia_AAC_Project/source/core/aac_utils.py

# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Utilities
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   Shared utility functions used across AAC encoder/decoder levels.
#
#   This module currently provides:
#   - MDCT / IMDCT conversions
#   - Signal-to-Noise Ratio (SNR) computation in dB
#   - Loading and access helpers for psychoacoustic band tables
#     (TableB219.mat, Tables B.2.1.9a / B.2.1.9b of the AAC specification)
# ------------------------------------------------------------
from __future__ import annotations

import numpy as np
from pathlib import Path

from scipy.io import loadmat
from core.aac_types import *


# -----------------------------------------------------------------------------
# Global cached data
# -----------------------------------------------------------------------------
# Cached contents of TableB219.mat to avoid repeated disk I/O.
# Keys:
#   - "B219a": long-window psychoacoustic bands (69 bands, FFT size 2048)
#   - "B219b": short-window psychoacoustic bands (42 bands, FFT size 256)
B219_CACHE: dict[str, BarkTable] | None = None


# -----------------------------------------------------------------------------
# MDCT / IMDCT
# -----------------------------------------------------------------------------
def mdct(s: TimeSignal) -> MdctCoeffs:
    """
    MDCT (direct form) as specified in the assignment.

    Parameters
    ----------
    s : TimeSignal
        Windowed time samples, 1-D array of length N (N = 2048 or 256).

    Returns
    -------
    MdctCoeffs
        MDCT coefficients, 1-D array of length N/2.

    Definition
    ----------
    X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
    where n0 = (N/2 + 1)/2.
    """
    s = np.asarray(s, dtype=np.float64).reshape(-1)
    N = int(s.shape[0])
    if N not in (2048, 256):
        raise ValueError("MDCT input length must be 2048 or 256.")

    n0 = (N / 2.0 + 1.0) / 2.0
    n = np.arange(N, dtype=np.float64) + n0
    k = np.arange(N // 2, dtype=np.float64) + 0.5

    C = np.cos((2.0 * np.pi / N) * np.outer(n, k))  # (N, N/2)
    X = 2.0 * (s @ C)  # (N/2,)
    return X


def imdct(X: MdctCoeffs) -> TimeSignal:
    """
    IMDCT (direct form) as specified in the assignment.

    Parameters
    ----------
    X : MdctCoeffs
        MDCT coefficients, 1-D array of length K (K = 1024 or 128).

    Returns
    -------
    TimeSignal
        Reconstructed time samples, 1-D array of length N = 2K.

    Definition
    ----------
    s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
    where n0 = (N/2 + 1)/2.
    """
    X = np.asarray(X, dtype=np.float64).reshape(-1)
    K = int(X.shape[0])
    if K not in (1024, 128):
        raise ValueError("IMDCT input length must be 1024 or 128.")

    N = 2 * K
    n0 = (N / 2.0 + 1.0) / 2.0

    n = np.arange(N, dtype=np.float64) + n0
    k = np.arange(K, dtype=np.float64) + 0.5

    C = np.cos((2.0 * np.pi / N) * np.outer(n, k))  # (N, K)
    s = (2.0 / N) * (C @ X)  # (N,)
    return s


# -----------------------------------------------------------------------------
# Signal quality metrics
# -----------------------------------------------------------------------------

def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
    """
    Compute the overall Signal-to-Noise Ratio (SNR) in dB.

    The SNR is computed over all available samples and channels,
    after conservatively aligning the two signals to their common
    length and channel count.

    Parameters
    ----------
    x_ref : StereoSignal
        Reference (original) signal.
        Typical shape: (N, 2) for stereo.
    x_hat : StereoSignal
        Reconstructed or processed signal.
        Typical shape: (M, 2) for stereo.

    Returns
    -------
    float
        SNR in dB.
        - +inf if the noise power is zero (perfect reconstruction).
        - -inf if the reference signal power is zero.
    """
    x_ref = np.asarray(x_ref, dtype=np.float64)
    x_hat = np.asarray(x_hat, dtype=np.float64)

    # Ensure 2-D shape: (samples, channels)
    if x_ref.ndim == 1:
        x_ref = x_ref.reshape(-1, 1)
    if x_hat.ndim == 1:
        x_hat = x_hat.reshape(-1, 1)

    # Align lengths and channel count conservatively
    n = min(x_ref.shape[0], x_hat.shape[0])
    c = min(x_ref.shape[1], x_hat.shape[1])

    x_ref = x_ref[:n, :c]
    x_hat = x_hat[:n, :c]

    err = x_ref - x_hat
    ps = float(np.sum(x_ref * x_ref))     # signal power
    pn = float(np.sum(err * err))          # noise power

    if pn <= 0.0:
        return float("inf")
    if ps <= 0.0:
        return float("-inf")

    return float(10.0 * np.log10(ps / pn))


# -----------------------------------------------------------------------------
# Psychoacoustic band tables (TableB219.mat)
# -----------------------------------------------------------------------------

def load_b219_tables() -> dict[str, BarkTable]:
    """
    Load and cache psychoacoustic band tables from TableB219.mat.

    The assignment/project layout assumes that a 'material' directory
    is available in the current working directory when running:
    - tests
    - level_1 / level_2 / level_3 entrypoints

    This function loads the tables once and caches them for subsequent calls.

    Returns
    -------
    dict[str, BarkTable]
        Dictionary with the following entries:
        - "B219a": long-window psychoacoustic table
                   (69 bands, FFT size 2048 / 1024 spectral lines)
        - "B219b": short-window psychoacoustic table
                   (42 bands, FFT size 256 / 128 spectral lines)
    """
    global B219_CACHE
    if B219_CACHE is not None:
        return B219_CACHE

    mat_path = Path("material") / "TableB219.mat"
    if not mat_path.exists():
        raise FileNotFoundError(
            "Could not locate material/TableB219.mat in the current working directory."
        )

    data = loadmat(str(mat_path))
    if "B219a" not in data or "B219b" not in data:
        raise ValueError(
            "TableB219.mat missing required variables 'B219a' and/or 'B219b'."
        )

    B219_CACHE = {
        "B219a": np.asarray(data["B219a"], dtype=np.float64),
        "B219b": np.asarray(data["B219b"], dtype=np.float64),
    }
    return B219_CACHE


def get_table(frame_type: FrameType) -> tuple[BarkTable, int]:
    """
    Select the appropriate psychoacoustic band table and FFT size
    based on the AAC frame type.

    Parameters
    ----------
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    table : BarkTable
        Psychoacoustic band table:
        - B219a for long frames
        - B219b for ESH short subframes
    N : int
        FFT size corresponding to the table:
        - 2048 for long frames
        - 256 for short frames (ESH)
    """
    tables = load_b219_tables()
    if frame_type == "ESH":
        return tables["B219b"], 256
    return tables["B219a"], 2048


def band_limits(
    table: BarkTable,
) -> tuple[BandIndexArray, BandIndexArray, BandValueArray, BandValueArray]:
    """
    Extract per-band metadata from a TableB2.1.9 psychoacoustic table.

    The column layout follows the provided TableB219.mat file and the
    AAC specification tables B.2.1.9a / B.2.1.9b.

    Parameters
    ----------
    table : BarkTable
        Psychoacoustic band table (B219a or B219b).

    Returns
    -------
    wlow : BandIndexArray
        Lower FFT bin index (inclusive) for each band.
    whigh : BandIndexArray
        Upper FFT bin index (inclusive) for each band.
    bval : BandValueArray
        Bark-scale (or equivalent) band position values.
        Used in the spreading function.
    qthr_db : BandValueArray
        Threshold in quiet for each band, in dB.
    """
    wlow = table[:, 1].astype(int)
    whigh = table[:, 2].astype(int)
    bval = table[:, 4].astype(np.float64)
    qthr_db = table[:, 5].astype(np.float64)
    return wlow, whigh, bval, qthr_db