# ------------------------------------------------------------ # AAC Coder/Decoder - Quantizer / iQuantizer (Level 3) # # Multimedia course at Aristotle University of # Thessaloniki (AUTh) # # Author: # Christos Choutouridis (ΑΕΜ 8997) # cchoutou@ece.auth.gr # # Description: # Implements AAC quantizer and inverse quantizer for one channel. # Based on assignment section 2.6 (Eq. 12-15). # # Notes: # - Bit reservoir is not implemented (assignment simplification). # - Scalefactor bands are assumed equal to psychoacoustic bands # (Table B.2.1.9a / B.2.1.9b from TableB219.mat). # ------------------------------------------------------------ from __future__ import annotations import numpy as np from core.aac_utils import get_table, band_limits from core.aac_types import * # ----------------------------------------------------------------------------- # Constants (assignment) # ----------------------------------------------------------------------------- MAGIC_NUMBER: float = 0.4054 EPS: float = 1e-12 MAX_SF_DELTA:int = 60 # ----------------------------------------------------------------------------- # Helpers: ESH packing/unpacking (128x8 <-> 1024x1) # ----------------------------------------------------------------------------- def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray: """ Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1). Packing order: Columns are concatenated in subframe order (0..7), column-major. Parameters ---------- x_128x8 : FloatArray ESH coefficients, shape (128, 8). Returns ------- FloatArray Packed coefficients, shape (1024, 1). """ x_128x8 = np.asarray(x_128x8, dtype=np.float64) if x_128x8.shape != (128, 8): raise ValueError("ESH pack expects shape (128, 8).") return x_128x8.reshape(1024, 1, order="F") def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray: """ Unpack a packed ESH vector (1024 elements) back to shape (128, 8). Parameters ---------- x_1024x1 : FloatArray Packed ESH vector, shape (1024,) or (1024, 1) after flattening. Returns ------- FloatArray Unpacked ESH coefficients, shape (128, 8). """ x_1024x1 = np.asarray(x_1024x1, dtype=np.float64).reshape(-1) if x_1024x1.shape[0] != 1024: raise ValueError("ESH unpack expects 1024 elements.") return x_1024x1.reshape(128, 8, order="F") # ----------------------------------------------------------------------------- # Core quantizer formulas (Eq. 12, Eq. 13) # ----------------------------------------------------------------------------- def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols: """ Quantize MDCT coefficients to integer symbols S(k). Implements Eq. (12): S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER ) Parameters ---------- x : FloatArray MDCT coefficients for a contiguous set of spectral lines. Shape: (N,) alpha : float Scalefactor gain for the corresponding scalefactor band. Returns ------- QuantizedSymbols Quantized symbols S(k) as int64, shape (N,). """ x = np.asarray(x, dtype=np.float64) scale = 2.0 ** (-0.25 * float(alpha)) ax = np.abs(x) * scale y = np.power(ax, 0.75, dtype=np.float64) # "int" in the assignment corresponds to truncation. q = np.floor(y + MAGIC_NUMBER).astype(np.int64) return (np.sign(x).astype(np.int64) * q).astype(np.int64) def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray: """ Inverse quantizer (dequantization of symbols). Implements Eq. (13): Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4) Parameters ---------- S : QuantizedSymbols Quantized symbols S(k), int64, shape (N,). alpha : float Scalefactor gain for the corresponding scalefactor band. Returns ------- FloatArray Reconstructed MDCT coefficients Xhat(k), float64, shape (N,). """ S = np.asarray(S, dtype=np.int64) scale = 2.0 ** (0.25 * float(alpha)) aS = np.abs(S).astype(np.float64) y = np.power(aS, 4.0 / 3.0, dtype=np.float64) return (np.sign(S).astype(np.float64) * y * scale).astype(np.float64) # ----------------------------------------------------------------------------- # Alpha initialization (Eq. 14) # ----------------------------------------------------------------------------- def _initial_alpha_hat(X: "FloatArray", MQ: int = 8191) -> int: """ Compute the initial scalefactor estimate alpha_hat for a frame. The assignment proposes the following first approximation (Equation 14): alpha_hat = (16/3) * log2( max_k(|X(k)|)^(3/4) / MQ ) where max_k runs over all MDCT coefficients of the frame (not per band), and MQ is the maximum quantization level parameter (2*MQ + 1 levels). Parameters ---------- X : FloatArray MDCT coefficients of one frame (or one ESH subframe), shape (N,). MQ : int Quantizer parameter (default 8191, as per assignment). Returns ------- int Integer alpha_hat (rounded to nearest integer). """ x_max = float(np.max(np.abs(X))) if x_max <= 0.0: return 0 alpha_hat = (16.0 / 3.0) * np.log2((x_max ** (3.0 / 4.0)) / float(MQ)) return int(np.round(alpha_hat)) # ----------------------------------------------------------------------------- # Band utilities # ----------------------------------------------------------------------------- def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]: """ Return scalefactor band ranges [wlow, whigh] (inclusive) for the given frame type. These are derived from the psychoacoustic tables (TableB219), and map directly to MDCT indices: - long: 0..1023 - short (ESH subframe): 0..127 Parameters ---------- frame_type : FrameType Frame type ("OLS", "LSS", "ESH", "LPS"). Returns ------- list[tuple[int, int]] List of (lo, hi) inclusive index pairs for each band. """ table, _Nfft = get_table(frame_type) wlow, whigh, _bval, _qthr_db = band_limits(table) bands: list[tuple[int, int]] = [] for lo, hi in zip(wlow, whigh): bands.append((int(lo), int(hi))) return bands def _band_energy(x: FloatArray, lo: int, hi: int) -> float: """ Compute energy of a spectral segment x[lo:hi+1]. Parameters ---------- x : FloatArray MDCT coefficient vector. lo, hi : int Inclusive index range. Returns ------- float Sum of squares (energy) within the band. """ sec = x[lo : hi + 1] return float(np.sum(sec * sec)) def _threshold_T_from_SMR( X: FloatArray, SMR_col: FloatArray, bands: list[tuple[int, int]], ) -> FloatArray: """ Compute psychoacoustic thresholds T(b) per band. Uses: P(b) = sum_{k in band} X(k)^2 T(b) = P(b) / SMR(b) Parameters ---------- X : FloatArray MDCT coefficients for a frame (long) or one ESH subframe (short). SMR_col : FloatArray SMR values for this frame/subframe, shape (NB,). bands : list[tuple[int, int]] Band index ranges. Returns ------- FloatArray Threshold vector T(b), shape (NB,). """ nb = len(bands) T = np.zeros((nb,), dtype=np.float64) for b, (lo, hi) in enumerate(bands): P = _band_energy(X, lo, hi) smr = float(SMR_col[b]) if smr <= EPS: T[b] = 0.0 else: T[b] = P / smr return T # ----------------------------------------------------------------------------- # Alpha selection per band + neighbor-difference constraint # ----------------------------------------------------------------------------- def _best_alpha_for_band( X: "FloatArray", lo: int, hi: int, T_b: float, alpha_hat: int, alpha_prev: int, alpha_min: int, alpha_max: int, ) -> int: """ Determine the band-wise scalefactor alpha(b) following the assignment. Procedure: - Start from a frame-wise initial estimate alpha_hat. - Iteratively increase alpha(b) by 1 as long as the quantization error power stays below the psychoacoustic threshold T(b): P_e(b) = sum_{k in band} ( X(k) - Xhat(k) )^2 - Stop increasing alpha(b) if the neighbor constraint would be violated: |alpha(b) - alpha(b-1)| <= 60 When processing bands sequentially (low -> high), this becomes: alpha(b) <= alpha_prev + 60 Notes: - This function does not decrease alpha if the initial value already violates the threshold; the assignment only specifies iterative increase. Parameters ---------- X : FloatArray Full MDCT vector of the current (sub)frame, shape (N,). lo, hi : int Band index bounds (inclusive), defining the band slice. T_b : float Threshold T(b) for this band. alpha_hat : int Initial frame-wise estimate (Equation 14). alpha_prev : int Previously selected alpha for band b-1 (neighbor constraint reference). alpha_min, alpha_max : int Safeguard bounds for alpha. Returns ------- int Selected integer alpha(b). """ if T_b <= 0.0: return int(alpha_hat) Xsec = X[lo : hi + 1] # Neighbor constraint (sequential processing): alpha(b) <= alpha_prev + 60 alpha_limit = min(int(alpha_max), int(alpha_prev) + MAX_SF_DELTA) # Start from alpha_hat, clamped to feasible range alpha = int(alpha_hat) alpha = max(int(alpha_min), min(alpha, int(alpha_limit))) # Evaluate at current alpha Ssec = _quantize_symbol(Xsec, alpha) Xhat = _dequantize_symbol(Ssec, alpha) Pe = float(np.sum((Xsec - Xhat) ** 2)) # If already above threshold, return current alpha (no decrease step specified) if Pe > T_b: return alpha # Increase alpha while still under threshold and within constraints while True: alpha_next = alpha + 1 if alpha_next > alpha_limit: break Ssec = _quantize_symbol(Xsec, alpha_next) Xhat = _dequantize_symbol(Ssec, alpha_next) Pe_next = float(np.sum((Xsec - Xhat) ** 2)) if Pe_next > T_b: break alpha = alpha_next return alpha # ----------------------------------------------------------------------------- # Public API # ----------------------------------------------------------------------------- def aac_quantizer( frame_F: FrameChannelF, frame_type: FrameType, SMR: FloatArray, ) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]: """ AAC quantizer for one channel (Level 3). Quantizes MDCT coefficients (after TNS) using band-wise scalefactors derived from psychoacoustic thresholds computed via SMR. The implementation follows the assignment procedure: - Compute an initial frame-wise alpha_hat using Equation (14), based on the maximum MDCT coefficient magnitude of the (sub)frame. - For each band b, increase alpha(b) by 1 while the quantization error power P_e(b) stays below the threshold T(b). - Enforce the neighbor constraint |alpha(b) - alpha(b-1)| <= 60 during the band-by-band search (no post-processing needed). Parameters ---------- frame_F : FrameChannelF MDCT coefficients after TNS, one channel. Shapes: - Long frames: (1024,) or (1024, 1) - ESH: (128, 8) frame_type : FrameType AAC frame type ("OLS", "LSS", "ESH", "LPS"). SMR : FloatArray Signal-to-Mask Ratio per band. Shapes: - Long: (NB,) or (NB, 1) - ESH: (NB, 8) Returns ------- S : QuantizedSymbols Quantized symbols S(k), packed as shape (1024, 1) for all frame types. For ESH, the 8 subframes are packed in column-major subframe layout. sfc : ScaleFactors DPCM-coded scalefactors: sfc(0) = alpha(0) = G sfc(b) = alpha(b) - alpha(b-1), for b > 0 Shapes: - Long: (NB, 1) - ESH: (NB, 8) G : GlobalGain Global gain G = alpha(0). - Long: scalar float - ESH: array shape (1, 8), dtype float64 """ bands = _band_slices(frame_type) NB = len(bands) X = np.asarray(frame_F, dtype=np.float64) SMR = np.asarray(SMR, dtype=np.float64) # ------------------------------------------------------------------------- # ESH: 8 short subframes, each of length 128 # ------------------------------------------------------------------------- if frame_type == "ESH": if X.shape != (128, 8): raise ValueError("For ESH, frame_F must have shape (128, 8).") if SMR.shape != (NB, 8): raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).") S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64) sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64) G_arr = np.zeros((1, 8), dtype=np.float64) # Packed output view: (128, 8) with column-major layout S_pack = S_out[:, 0].reshape(128, 8, order="F") for j in range(8): Xj = X[:, j].reshape(128) SMRj = SMR[:, j].reshape(NB) # Compute psychoacoustic threshold T(b) for this subframe T = _threshold_T_from_SMR(Xj, SMRj, bands) # Frame-wise initial estimate alpha_hat (Equation 14) alpha_hat = _initial_alpha_hat(Xj) # Band-wise scalefactors alpha(b) alpha = np.zeros((NB,), dtype=np.int64) alpha_prev = int(alpha_hat) for b, (lo, hi) in enumerate(bands): alpha_b = _best_alpha_for_band( X=Xj, lo=lo, hi=hi, T_b=float(T[b]), alpha_hat=int(alpha_hat), alpha_prev=int(alpha_prev), alpha_min=-4096, alpha_max=4096, ) alpha[b] = int(alpha_b) alpha_prev = int(alpha_b) # DPCM-coded scalefactors G_arr[0, j] = float(alpha[0]) sfc[0, j] = int(alpha[0]) for b in range(1, NB): sfc[b, j] = int(alpha[b] - alpha[b - 1]) # Quantize MDCT coefficients band-by-band Sj = np.zeros((128,), dtype=np.int64) for b, (lo, hi) in enumerate(bands): Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b])) # Store subframe in packed output S_pack[:, j] = Sj return S_out, sfc, G_arr # ------------------------------------------------------------------------- # Long frames: OLS / LSS / LPS, length 1024 # ------------------------------------------------------------------------- if X.shape == (1024,): Xv = X elif X.shape == (1024, 1): Xv = X[:, 0] else: raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).") if SMR.shape == (NB,): SMRv = SMR elif SMR.shape == (NB, 1): SMRv = SMR[:, 0] else: raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).") # Compute psychoacoustic threshold T(b) for the long frame T = _threshold_T_from_SMR(Xv, SMRv, bands) # Frame-wise initial estimate alpha_hat (Equation 14) alpha_hat = _initial_alpha_hat(Xv) # Band-wise scalefactors alpha(b) alpha = np.zeros((NB,), dtype=np.int64) alpha_prev = int(alpha_hat) for b, (lo, hi) in enumerate(bands): alpha_b = _best_alpha_for_band( X=Xv, lo=lo, hi=hi, T_b=float(T[b]), alpha_hat=int(alpha_hat), alpha_prev=int(alpha_prev), alpha_min=-4096, alpha_max=4096, ) alpha[b] = int(alpha_b) alpha_prev = int(alpha_b) # DPCM-coded scalefactors sfc_out: ScaleFactors = np.zeros((NB, 1), dtype=np.int64) sfc_out[0, 0] = int(alpha[0]) for b in range(1, NB): sfc_out[b, 0] = int(alpha[b] - alpha[b - 1]) G: float = float(alpha[0]) # Quantize MDCT coefficients band-by-band S_vec = np.zeros((1024,), dtype=np.int64) for b, (lo, hi) in enumerate(bands): S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b])) return S_vec.reshape(1024, 1), sfc_out, G def aac_i_quantizer( S: QuantizedSymbols, sfc: ScaleFactors, G: GlobalGain, frame_type: FrameType, ) -> FrameChannelF: """ Inverse quantizer (iQuantizer) for one channel. Reconstructs MDCT coefficients from quantized symbols and DPCM scalefactors. Parameters ---------- S : QuantizedSymbols Quantized symbols, shape (1024, 1) (or any array with 1024 elements). sfc : ScaleFactors DPCM-coded scalefactors. Shapes: - Long: (NB, 1) - ESH: (NB, 8) G : GlobalGain Global gain (not strictly required if sfc includes sfc(0)=alpha(0)). Present for API compatibility with the assignment. frame_type : FrameType AAC frame type. Returns ------- FrameChannelF Reconstructed MDCT coefficients: - ESH: (128, 8) - Long: (1024, 1) """ bands = _band_slices(frame_type) NB = len(bands) S_flat = np.asarray(S, dtype=np.int64).reshape(-1) if S_flat.shape[0] != 1024: raise ValueError("S must contain 1024 symbols.") if frame_type == "ESH": sfc = np.asarray(sfc, dtype=np.int64) if sfc.shape != (NB, 8): raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).") S_128x8 = _esh_unpack_from_1024(S_flat) Xrec = np.zeros((128, 8), dtype=np.float64) for j in range(8): alpha = np.zeros((NB,), dtype=np.int64) alpha[0] = int(sfc[0, j]) for b in range(1, NB): alpha[b] = int(alpha[b - 1] + sfc[b, j]) Xj = np.zeros((128,), dtype=np.float64) for b, (lo, hi) in enumerate(bands): Xj[lo : hi + 1] = _dequantize_symbol(S_128x8[lo : hi + 1, j].astype(np.int64), float(alpha[b])) Xrec[:, j] = Xj return Xrec sfc = np.asarray(sfc, dtype=np.int64) if sfc.shape != (NB, 1): raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).") alpha = np.zeros((NB,), dtype=np.int64) alpha[0] = int(sfc[0, 0]) for b in range(1, NB): alpha[b] = int(alpha[b - 1] + sfc[b, 0]) Xrec = np.zeros((1024,), dtype=np.float64) for b, (lo, hi) in enumerate(bands): Xrec[lo : hi + 1] = _dequantize_symbol(S_flat[lo : hi + 1], float(alpha[b])) return Xrec.reshape(1024, 1)