271 lines
8.0 KiB
Python
271 lines
8.0 KiB
Python
# ------------------------------------------------------------
|
||
# AAC Coder/Decoder - AAC Utilities
|
||
#
|
||
# Multimedia course at Aristotle University of
|
||
# Thessaloniki (AUTh)
|
||
#
|
||
# Author:
|
||
# Christos Choutouridis (ΑΕΜ 8997)
|
||
# cchoutou@ece.auth.gr
|
||
#
|
||
# Description:
|
||
# Shared utility functions used across AAC encoder/decoder levels.
|
||
#
|
||
# This module currently provides:
|
||
# - MDCT / IMDCT conversions
|
||
# - Signal-to-Noise Ratio (SNR) computation in dB
|
||
# - Loading and access helpers for psychoacoustic band tables
|
||
# (TableB219.mat, Tables B.2.1.9a / B.2.1.9b of the AAC specification)
|
||
# ------------------------------------------------------------
|
||
from __future__ import annotations
|
||
|
||
import numpy as np
|
||
from pathlib import Path
|
||
|
||
from scipy.io import loadmat
|
||
from core.aac_types import *
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
# Global cached data
|
||
# -----------------------------------------------------------------------------
|
||
# Cached contents of TableB219.mat to avoid repeated disk I/O.
|
||
# Keys:
|
||
# - "B219a": long-window psychoacoustic bands (69 bands, FFT size 2048)
|
||
# - "B219b": short-window psychoacoustic bands (42 bands, FFT size 256)
|
||
# Stays None until load_b219_tables() populates it on its first successful call;
# later calls return this same dictionary instead of re-reading the file.
B219_CACHE: dict[str, BarkTable] | None = None
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
# MDCT / IMDCT
|
||
# -----------------------------------------------------------------------------
|
||
# Cosine bases keyed by transform length N, built lazily on first use.
# MDCT and IMDCT use the very same (N, N/2) matrix, so it is stored once and
# shared. For N = 2048 the matrix holds 2048 x 1024 float64 values (16 MiB).
_MDCT_BASIS_CACHE: dict[int, np.ndarray] = {}


def _mdct_basis(N: int) -> np.ndarray:
    """
    Return the cosine basis shared by mdct() and imdct() for length N.

    Parameters
    ----------
    N : int
        Time-domain transform length.

    Returns
    -------
    np.ndarray
        Read-only array of shape (N, N/2) with entries
        cos((2*pi/N) * (n + n0) * (k + 1/2)), where n0 = (N/2 + 1)/2.
    """
    basis = _MDCT_BASIS_CACHE.get(N)
    if basis is None:
        n0 = (N / 2.0 + 1.0) / 2.0
        n = np.arange(N, dtype=np.float64) + n0
        k = np.arange(N // 2, dtype=np.float64) + 0.5
        basis = np.cos((2.0 * np.pi / N) * np.outer(n, k))
        # The array is shared between calls; make accidental writes fail loudly.
        basis.setflags(write=False)
        _MDCT_BASIS_CACHE[N] = basis
    return basis


def mdct(s: TimeSignal) -> MdctCoeffs:
    """
    MDCT (direct form) as specified in the assignment.

    Parameters
    ----------
    s : TimeSignal
        Windowed time samples. The input is flattened to 1-D and must then
        have length N (N = 2048 or 256).

    Returns
    -------
    MdctCoeffs
        MDCT coefficients, 1-D array of length N/2.

    Raises
    ------
    ValueError
        If the flattened input length is neither 2048 nor 256.

    Definition
    ----------
    X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
    where n0 = (N/2 + 1)/2.
    """
    s = np.asarray(s, dtype=np.float64).reshape(-1)
    N = int(s.shape[0])
    if N not in (2048, 256):
        raise ValueError("MDCT input length must be 2048 or 256.")

    # The basis depends only on N, so it is fetched from the cache instead of
    # being recomputed for every frame.
    C = _mdct_basis(N)  # (N, N/2)
    X = 2.0 * (s @ C)  # (N/2,)
    return X


def imdct(X: MdctCoeffs) -> TimeSignal:
    """
    IMDCT (direct form) as specified in the assignment.

    Parameters
    ----------
    X : MdctCoeffs
        MDCT coefficients. The input is flattened to 1-D and must then
        have length K (K = 1024 or 128).

    Returns
    -------
    TimeSignal
        Reconstructed time samples, 1-D array of length N = 2K.

    Raises
    ------
    ValueError
        If the flattened input length is neither 1024 nor 128.

    Definition
    ----------
    s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
    where n0 = (N/2 + 1)/2.
    """
    X = np.asarray(X, dtype=np.float64).reshape(-1)
    K = int(X.shape[0])
    if K not in (1024, 128):
        raise ValueError("IMDCT input length must be 1024 or 128.")

    N = 2 * K
    # Same matrix as the forward transform; only the multiplication side differs.
    C = _mdct_basis(N)  # (N, K)
    s = (2.0 / N) * (C @ X)  # (N,)
    return s
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
# Signal quality metrics
|
||
# -----------------------------------------------------------------------------
|
||
|
||
def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
    """
    Overall Signal-to-Noise Ratio (SNR) between two signals, in dB.

    Both inputs are converted to float64. A 1-D input is treated as a single
    channel. The signals are then cropped to the number of samples and the
    number of channels they have in common, and the ratio is taken over every
    remaining sample of every remaining channel.

    Parameters
    ----------
    x_ref : StereoSignal
        Reference (original) signal, typically of shape (N, 2).
    x_hat : StereoSignal
        Reconstructed or processed signal, typically of shape (M, 2).

    Returns
    -------
    float
        10 * log10(signal energy / noise energy), with two special cases:
        - +inf when the noise energy is zero (checked first);
        - -inf when the reference energy is zero.
    """
    reference = np.asarray(x_ref, dtype=np.float64)
    estimate = np.asarray(x_hat, dtype=np.float64)

    # Promote mono vectors to a (samples, 1) column.
    if reference.ndim == 1:
        reference = reference[:, np.newaxis]
    if estimate.ndim == 1:
        estimate = estimate[:, np.newaxis]

    # Keep only the region both signals cover.
    rows = min(reference.shape[0], estimate.shape[0])
    cols = min(reference.shape[1], estimate.shape[1])
    reference = reference[:rows, :cols]
    estimate = estimate[:rows, :cols]

    residual = reference - estimate
    signal_energy = float(np.sum(reference * reference))
    noise_energy = float(np.sum(residual * residual))

    # The noise check comes first, so two all-zero signals report +inf.
    if noise_energy <= 0.0:
        return float("inf")
    if signal_energy <= 0.0:
        return float("-inf")

    ratio = signal_energy / noise_energy
    return float(10.0 * np.log10(ratio))
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
# Psychoacoustic band tables (TableB219.mat)
|
||
# -----------------------------------------------------------------------------
|
||
|
||
def load_b219_tables() -> dict[str, BarkTable]:
    """
    Return the psychoacoustic band tables stored in TableB219.mat.

    The file is looked up at 'material/TableB219.mat' relative to the current
    working directory, which is the layout the tests and the
    level_1 / level_2 / level_3 entrypoints are expected to run from.
    The file is read only on the first successful call; the result is kept in
    the module-level B219_CACHE and returned as-is afterwards.

    Returns
    -------
    dict[str, BarkTable]
        Dictionary with the following entries, each converted to float64:
        - "B219a": long-window psychoacoustic table
                   (69 bands, FFT size 2048 / 1024 spectral lines)
        - "B219b": short-window psychoacoustic table
                   (42 bands, FFT size 256 / 128 spectral lines)

    Raises
    ------
    FileNotFoundError
        If material/TableB219.mat does not exist.
    ValueError
        If the file lacks the 'B219a' or 'B219b' variable.
    """
    global B219_CACHE

    if B219_CACHE is None:
        table_file = Path("material", "TableB219.mat")
        if not table_file.exists():
            raise FileNotFoundError(
                "Could not locate material/TableB219.mat in the current working directory."
            )

        contents = loadmat(str(table_file))
        required = ("B219a", "B219b")
        if any(name not in contents for name in required):
            raise ValueError(
                "TableB219.mat missing required variables 'B219a' and/or 'B219b'."
            )

        # The cache is assigned only after every check has passed, so a failed
        # load leaves it at None and the next call retries.
        B219_CACHE = {
            name: np.asarray(contents[name], dtype=np.float64) for name in required
        }

    return B219_CACHE
|
||
|
||
|
||
def get_table(frame_type: FrameType) -> tuple[BarkTable, int]:
    """
    Pick the psychoacoustic band table and FFT size for an AAC frame type.

    Parameters
    ----------
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").
        Only "ESH" selects the short table; every other value gets the
        long table.

    Returns
    -------
    table : BarkTable
        Psychoacoustic band table:
        - B219b for ESH short subframes
        - B219a for long frames
    N : int
        FFT size corresponding to the table:
        - 256 for short frames (ESH)
        - 2048 for long frames
    """
    tables = load_b219_tables()
    key, fft_size = ("B219b", 256) if frame_type == "ESH" else ("B219a", 2048)
    return tables[key], fft_size
|
||
|
||
|
||
def band_limits(
    table: BarkTable,
) -> tuple[BandIndexArray, BandIndexArray, BandValueArray, BandValueArray]:
    """
    Split a TableB2.1.9 psychoacoustic table into its per-band columns.

    The column positions follow the provided TableB219.mat file and the
    AAC specification tables B.2.1.9a / B.2.1.9b: columns 1 and 2 hold the
    band edges, column 4 the band position value and column 5 the threshold
    in quiet.

    Parameters
    ----------
    table : BarkTable
        Psychoacoustic band table (B219a or B219b), one row per band.

    Returns
    -------
    wlow : BandIndexArray
        Lower FFT bin index (inclusive) for each band, cast to int.
    whigh : BandIndexArray
        Upper FFT bin index (inclusive) for each band, cast to int.
    bval : BandValueArray
        Bark-scale (or equivalent) band position values, as float64.
        Used in the spreading function.
    qthr_db : BandValueArray
        Threshold in quiet for each band, in dB, as float64.
    """
    wlow, whigh = (table[:, column].astype(int) for column in (1, 2))
    bval, qthr_db = (table[:, column].astype(np.float64) for column in (4, 5))
    return wlow, whigh, bval, qthr_db
|