# ------------------------------------------------------------
# AAC Coder/Decoder - Psychoacoustic Model
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   Psychoacoustic model for ONE channel, based on the assignment notes (Section 2.4).
#
# Public API:
#   SMR = aac_psycho(frame_T, frame_type, frame_T_prev_1, frame_T_prev_2)
#
# Output:
#   - For long frames ("OLS", "LSS", "LPS"): SMR has shape (69,)
#   - For short frames ("ESH"): SMR has shape (42, 8) (one column per subframe)
#
# Notes:
#   - Uses Bark band tables from material/TableB219.mat:
#     * B219a for long windows (69 bands, N=2048 FFT, N/2=1024 bins)
#     * B219b for short windows (42 bands, N=256 FFT, N/2=128 bins)
#   - Applies a Hann window in time domain before FFT magnitude/phase extraction.
#   - Implements:
#     spreading function -> band spreading -> tonality index -> masking thresholds -> SMR.
# ------------------------------------------------------------
|
||
from __future__ import annotations
|
||
|
||
import numpy as np
|
||
|
||
from core.aac_utils import band_limits, get_table
|
||
from core.aac_configuration import NMT_DB, TMN_DB
|
||
from core.aac_types import *
|
||
|
||
# -----------------------------------------------------------------------------
# Spreading function
# -----------------------------------------------------------------------------
|
||
|
||
def _spreading_matrix(bval: BandValueArray) -> FloatArray:
|
||
"""
|
||
Compute the spreading function matrix between psychoacoustic bands.
|
||
|
||
The spreading function describes how energy in one critical band masks
|
||
nearby bands. The formula follows the assignment pseudo-code.
|
||
|
||
Parameters
|
||
----------
|
||
bval : BandValueArray
|
||
Bark value per band, shape (B,).
|
||
|
||
Returns
|
||
-------
|
||
FloatArray
|
||
Spreading matrix S of shape (B, B), where:
|
||
S[bb, b] quantifies the contribution of band bb masking band b.
|
||
"""
|
||
bval = np.asarray(bval, dtype=np.float64).reshape(-1)
|
||
B = int(bval.shape[0])
|
||
|
||
spread = np.zeros((B, B), dtype=np.float64)
|
||
|
||
for b in range(B):
|
||
for bb in range(B):
|
||
# tmpx depends on direction (asymmetric spreading)
|
||
if bb >= b:
|
||
tmpx = 3.0 * (bval[bb] - bval[b])
|
||
else:
|
||
tmpx = 1.5 * (bval[bb] - bval[b])
|
||
|
||
# tmpz uses the "min(..., 0)" nonlinearity exactly as in the notes
|
||
tmpz = 8.0 * min((tmpx - 0.5) ** 2 - 2.0 * (tmpx - 0.5), 0.0)
|
||
tmpy = 15.811389 + 7.5 * (tmpx + 0.474) - 17.5 * np.sqrt(1.0 + (tmpx + 0.474) ** 2)
|
||
|
||
# Clamp very small values (below -100 dB) to 0 contribution
|
||
if tmpy < -100.0:
|
||
spread[bb, b] = 0.0
|
||
else:
|
||
spread[bb, b] = 10.0 ** ((tmpz + tmpy) / 10.0)
|
||
|
||
return spread
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Windowing + FFT feature extraction
# -----------------------------------------------------------------------------
|
||
|
||
def _hann_window(N: int) -> FloatArray:
|
||
"""
|
||
Hann window as specified in the notes:
|
||
w[n] = 0.5 - 0.5*cos(2*pi*(n + 0.5)/N)
|
||
|
||
Parameters
|
||
----------
|
||
N : int
|
||
Window length.
|
||
|
||
Returns
|
||
-------
|
||
FloatArray
|
||
1-D array of shape (N,), dtype float64.
|
||
"""
|
||
n = np.arange(N, dtype=np.float64)
|
||
return 0.5 - 0.5 * np.cos((2.0 * np.pi / N) * (n + 0.5))
|
||
|
||
|
||
def _r_phi_from_time(x: FrameChannelT, N: int) -> tuple[FloatArray, FloatArray]:
|
||
"""
|
||
Compute FFT magnitude r(w) and phase phi(w) for bins w = 0 .. N/2-1.
|
||
|
||
Processing:
|
||
1) Apply Hann window in time domain.
|
||
2) Compute N-point FFT.
|
||
3) Keep only the positive-frequency bins [0 .. N/2-1].
|
||
|
||
Parameters
|
||
----------
|
||
x : FrameChannelT
|
||
Time-domain samples, shape (N,).
|
||
N : int
|
||
FFT size (2048 or 256).
|
||
|
||
Returns
|
||
-------
|
||
r : FloatArray
|
||
Magnitude spectrum for bins 0 .. N/2-1, shape (N/2,).
|
||
phi : FloatArray
|
||
Phase spectrum for bins 0 .. N/2-1, shape (N/2,).
|
||
"""
|
||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||
if x.shape[0] != N:
|
||
raise ValueError(f"Expected time vector of length {N}, got {x.shape[0]}.")
|
||
|
||
w = _hann_window(N)
|
||
X = np.fft.fft(x * w, n=N)
|
||
|
||
Xp = X[: N // 2]
|
||
r = np.abs(Xp).astype(np.float64, copy=False)
|
||
phi = np.angle(Xp).astype(np.float64, copy=False)
|
||
return r, phi
|
||
|
||
|
||
def _predictability(
|
||
r: FloatArray,
|
||
phi: FloatArray,
|
||
r_m1: FloatArray,
|
||
phi_m1: FloatArray,
|
||
r_m2: FloatArray,
|
||
phi_m2: FloatArray,
|
||
) -> FloatArray:
|
||
"""
|
||
Compute predictability c(w) per spectral bin.
|
||
|
||
The notes define:
|
||
r_pred(w) = 2*r_{-1}(w) - r_{-2}(w)
|
||
phi_pred(w) = 2*phi_{-1}(w) - phi_{-2}(w)
|
||
|
||
c(w) = |X(w) - X_pred(w)| / (r(w) + |r_pred(w)|)
|
||
|
||
where X(w) is represented in polar form using r(w), phi(w).
|
||
|
||
Parameters
|
||
----------
|
||
r, phi : FloatArray
|
||
Current magnitude and phase, shape (N/2,).
|
||
r_m1, phi_m1 : FloatArray
|
||
Previous magnitude and phase, shape (N/2,).
|
||
r_m2, phi_m2 : FloatArray
|
||
Pre-previous magnitude and phase, shape (N/2,).
|
||
|
||
Returns
|
||
-------
|
||
FloatArray
|
||
Predictability c(w), shape (N/2,).
|
||
"""
|
||
r_pred = 2.0 * r_m1 - r_m2
|
||
phi_pred = 2.0 * phi_m1 - phi_m2
|
||
|
||
num = np.sqrt(
|
||
(r * np.cos(phi) - r_pred * np.cos(phi_pred)) ** 2
|
||
+ (r * np.sin(phi) - r_pred * np.sin(phi_pred)) ** 2
|
||
)
|
||
den = r + np.abs(r_pred) + 1e-12 # avoid division-by-zero without altering behavior
|
||
return (num / den).astype(np.float64, copy=False)
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Band-domain aggregation
# -----------------------------------------------------------------------------
|
||
|
||
def _band_energy_and_pred(
|
||
r: FloatArray,
|
||
c: FloatArray,
|
||
wlow: BandIndexArray,
|
||
whigh: BandIndexArray,
|
||
) -> tuple[FloatArray, FloatArray]:
|
||
"""
|
||
Aggregate spectral bin quantities into psychoacoustic bands.
|
||
|
||
Definitions (notes):
|
||
e(b) = sum_{w=wlow(b)..whigh(b)} r(w)^2
|
||
c_num(b) = sum_{w=wlow(b)..whigh(b)} c(w) * r(w)^2
|
||
|
||
The band predictability c(b) is later computed after spreading as:
|
||
cb(b) = ct(b) / ecb(b)
|
||
|
||
Parameters
|
||
----------
|
||
r : FloatArray
|
||
Magnitude spectrum, shape (N/2,).
|
||
c : FloatArray
|
||
Predictability per bin, shape (N/2,).
|
||
wlow, whigh : BandIndexArray
|
||
Band limits (inclusive indices), shape (B,).
|
||
|
||
Returns
|
||
-------
|
||
e_b : FloatArray
|
||
Band energies e(b), shape (B,).
|
||
c_num_b : FloatArray
|
||
Weighted predictability numerators c_num(b), shape (B,).
|
||
"""
|
||
r2 = (r * r).astype(np.float64, copy=False)
|
||
|
||
B = int(wlow.shape[0])
|
||
e_b = np.zeros(B, dtype=np.float64)
|
||
c_num_b = np.zeros(B, dtype=np.float64)
|
||
|
||
for b in range(B):
|
||
a = int(wlow[b])
|
||
z = int(whigh[b])
|
||
|
||
seg_r2 = r2[a : z + 1]
|
||
e_b[b] = float(np.sum(seg_r2))
|
||
c_num_b[b] = float(np.sum(c[a : z + 1] * seg_r2))
|
||
|
||
return e_b, c_num_b
|
||
|
||
|
||
def _psycho_window(
    time_x: FrameChannelT,
    prev1_x: FrameChannelT,
    prev2_x: FrameChannelT,
    *,
    N: int,
    table: BarkTable,
) -> FloatArray:
    """
    Compute SMR for one FFT analysis window (N=2048 for long, N=256 for short).

    Pipeline (assignment notes):
        FFT magnitude/phase -> per-bin predictability -> band energies and
        predictability -> band spreading -> tonality index tb(b) ->
        masking threshold (noise + threshold in quiet) -> SMR(b) = e(b)/np(b).

    Parameters
    ----------
    time_x : FrameChannelT
        Current time-domain samples, shape (N,).
    prev1_x : FrameChannelT
        Previous time-domain samples, shape (N,).
    prev2_x : FrameChannelT
        Pre-previous time-domain samples, shape (N,).
    N : int
        FFT size.
    table : BarkTable
        Psychoacoustic band table (B219a or B219b).

    Returns
    -------
    FloatArray
        SMR per band, shape (B,).
    """
    wlow, whigh, bval, qthr_db = band_limits(table)
    spread = _spreading_matrix(bval)

    # Spectral features of the current window and its two predecessors.
    mag, ang = _r_phi_from_time(time_x, N)
    mag1, ang1 = _r_phi_from_time(prev1_x, N)
    mag2, ang2 = _r_phi_from_time(prev2_x, N)

    # Per-bin predictability, then aggregation into psycho bands.
    bin_pred = _predictability(mag, ang, mag1, ang1, mag2, ang2)
    band_e, band_c_num = _band_energy_and_pred(mag, bin_pred, wlow, whigh)

    # Spread energies and predictability across bands:
    #   ecb(b) = sum_bb e(bb)     * S(bb, b)
    #   ct(b)  = sum_bb c_num(bb) * S(bb, b)
    e_spread = spread.T @ band_e
    c_spread = spread.T @ band_c_num

    # Band predictability after spreading: cb(b) = ct(b) / ecb(b)
    band_pred = c_spread / (e_spread + 1e-12)

    # Normalized energy term: en(b) = ecb(b) / sum_bb S(bb, b)
    col_mass = np.sum(spread, axis=0)
    e_norm = e_spread / (col_mass + 1e-12)

    # Tonality index, clamped to [0, 1].
    tonality = -0.299 - 0.43 * np.log(np.maximum(band_pred, 1e-12))
    tonality = np.clip(tonality, 0.0, 1.0)

    # Required SNR per band (dB): interpolate between TMN and NMT limits,
    # then convert to a power-domain attenuation factor.
    snr_db = tonality * TMN_DB + (1.0 - tonality) * NMT_DB
    snr_factor = 10.0 ** (-snr_db / 10.0)

    # Noise masking threshold estimate (power domain).
    noise_thr = e_norm * snr_factor

    # Threshold in quiet (dB -> power domain):
    #   qthr_power = eps * (N/2) * 10^(qthr_db/10)
    quiet_thr = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0))

    # Final masking threshold np(b) = max(nb(b), qthr(b)),
    # then SMR(b) = e(b) / np(b).
    mask_thr = np.maximum(noise_thr, quiet_thr)
    smr = band_e / (mask_thr + 1e-12)
    return smr.astype(np.float64, copy=False)
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# ESH window slicing (match filterbank conventions)
# -----------------------------------------------------------------------------
|
||
|
||
def _esh_subframes(x_2048: FrameChannelT) -> list[FrameChannelT]:
|
||
"""
|
||
Extract the 8 overlapping 256-sample short windows used by AAC ESH.
|
||
|
||
The project convention (matching the filterbank) is:
|
||
start_j = 448 + 128*j, for j = 0..7
|
||
subframe_j = x[start_j : start_j + 256]
|
||
|
||
This selects the central 1152-sample region [448, 1600) and produces
|
||
8 windows with 50% overlap.
|
||
|
||
Parameters
|
||
----------
|
||
x_2048 : FrameChannelT
|
||
Time-domain channel frame, shape (2048,).
|
||
|
||
Returns
|
||
-------
|
||
list[FrameChannelT]
|
||
List of 8 subframes, each of shape (256,).
|
||
"""
|
||
x_2048 = np.asarray(x_2048, dtype=np.float64).reshape(-1)
|
||
if x_2048.shape[0] != 2048:
|
||
raise ValueError("ESH requires 2048-sample input frames.")
|
||
|
||
subs: list[FrameChannelT] = []
|
||
for j in range(8):
|
||
start = 448 + 128 * j
|
||
subs.append(x_2048[start : start + 256])
|
||
return subs
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Public API
# -----------------------------------------------------------------------------
|
||
|
||
def aac_psycho(
    frame_T: FrameChannelT,
    frame_type: FrameType,
    frame_T_prev_1: FrameChannelT,
    frame_T_prev_2: FrameChannelT,
) -> FloatArray:
    """
    Psychoacoustic model for ONE channel.

    Parameters
    ----------
    frame_T : FrameChannelT
        Current time-domain channel frame, shape (2048,).
        For "ESH", the 8 short windows are derived internally.
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").
    frame_T_prev_1 : FrameChannelT
        Previous time-domain channel frame, shape (2048,).
    frame_T_prev_2 : FrameChannelT
        Pre-previous time-domain channel frame, shape (2048,).

    Returns
    -------
    FloatArray
        Signal-to-Mask Ratio (SMR), per psychoacoustic band:
        shape (42, 8) if frame_type == "ESH", shape (69,) otherwise.

    Raises
    ------
    ValueError
        If any of the three frames is not 2048 samples long.
    """
    cur = np.asarray(frame_T, dtype=np.float64).reshape(-1)
    prev1 = np.asarray(frame_T_prev_1, dtype=np.float64).reshape(-1)
    prev2 = np.asarray(frame_T_prev_2, dtype=np.float64).reshape(-1)

    if any(f.shape[0] != 2048 for f in (cur, prev1, prev2)):
        raise ValueError("aac_psycho expects 2048-sample frames for current/prev1/prev2.")

    table, N = get_table(frame_type)

    # Long frame types ("OLS", "LSS", "LPS"): a single 69-band SMR vector.
    if frame_type != "ESH":
        return _psycho_window(cur, prev1, prev2, N=N, table=table)

    # ESH: 8 SMR columns (42 bands each), one per short subframe.
    #
    # Short-window history used for predictability (notes convention):
    #   j == 0 : previous frame's subframes (7, 6)
    #   j == 1 : current subframe 0 and previous frame's subframe 7
    #   j >= 2 : current subframes (j-1, j-2)
    #
    # Chaining the last two previous-frame subframes in front of the
    # current ones expresses all three cases as one sliding window:
    # chain[j+2] is subframe j, chain[j+1] and chain[j] its predecessors.
    cur_subs = _esh_subframes(cur)
    prev1_subs = _esh_subframes(prev1)
    chain = [prev1_subs[6], prev1_subs[7]] + cur_subs

    n_bands = int(table.shape[0])  # expected 42
    smr_out = np.zeros((n_bands, 8), dtype=np.float64)

    for j in range(8):
        smr_out[:, j] = _psycho_window(chain[j + 2], chain[j + 1], chain[j], N=256, table=table)

    return smr_out
|