# ------------------------------------------------------------
# AAC Coder/Decoder - Quantizer / iQuantizer (Level 3)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   Implements AAC quantizer and inverse quantizer for one channel.
#   Based on assignment section 2.6 (Eq. 12-15).
#
# Notes:
#   - Bit reservoir is not implemented (assignment simplification).
#   - Scalefactor bands are assumed equal to psychoacoustic bands
#     (Table B.2.1.9a / B.2.1.9b from TableB219.mat).
# ------------------------------------------------------------
|
||
from __future__ import annotations
|
||
|
||
import numpy as np
|
||
|
||
from core.aac_utils import get_table, band_limits
|
||
from core.aac_types import *
|
||
|
||
# -----------------------------------------------------------------------------
# Constants (assignment)
# -----------------------------------------------------------------------------
|
||
# Rounding offset added before truncation in the quantizer (Eq. 12).
MAGIC_NUMBER: float = 0.4054
# Divisor of the frame maximum in the initial scalefactor estimate (Eq. 14).
MQ: int = 8191

# SMR values at or below this are treated as zero when deriving thresholds.
EPS: float = 1e-12
# Largest allowed |alpha(b) - alpha(b-1)| between neighbouring bands.
MAX_SFC_DIFF: int = 60

# Hard cap on the per-band alpha search, so pathological input cannot loop forever.
MAX_ALPHA_ITERS: int = 2000
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Helpers: ESH packing/unpacking (128x8 <-> 1024x1)
# -----------------------------------------------------------------------------
|
||
def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray:
    """
    Flatten an ESH coefficient matrix (128 x 8) into one column (1024 x 1).

    The eight subframe columns are laid out one after another (column-major),
    so subframe ``j`` occupies rows ``128*j .. 128*j + 127`` of the result.

    Parameters
    ----------
    x_128x8 : FloatArray
        ESH coefficients, shape (128, 8).

    Returns
    -------
    FloatArray
        Packed coefficients as float64, shape (1024, 1).

    Raises
    ------
    ValueError
        If the input does not have shape (128, 8).
    """
    coeffs = np.asarray(x_128x8, dtype=np.float64)
    if coeffs.shape == (128, 8):
        # Fortran order walks down each column first, i.e. subframe by subframe.
        return coeffs.reshape(1024, 1, order="F")
    raise ValueError("ESH pack expects shape (128, 8).")
|
||
|
||
|
||
def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray:
    """
    Restore the (128, 8) ESH layout from a packed vector of 1024 values.

    Parameters
    ----------
    x_1024x1 : FloatArray
        Packed ESH vector; any shape is accepted as long as it holds exactly
        1024 elements (e.g. (1024,) or (1024, 1)).

    Returns
    -------
    FloatArray
        Unpacked coefficients as float64, shape (128, 8); column ``j`` holds
        elements ``128*j .. 128*j + 127`` of the flattened input.

    Raises
    ------
    ValueError
        If the input does not contain exactly 1024 elements.
    """
    flat = np.asarray(x_1024x1, dtype=np.float64).ravel()
    if flat.size != 1024:
        raise ValueError("ESH unpack expects 1024 elements.")
    # Fortran order fills column by column, i.e. one subframe after another.
    return np.reshape(flat, (128, 8), order="F")
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Core quantizer formulas (Eq. 12, Eq. 13)
# -----------------------------------------------------------------------------
|
||
def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols:
    """
    Map MDCT coefficients to integer quantization symbols S(k).

    Implements Eq. (12):
        S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER )

    Parameters
    ----------
    x : FloatArray
        MDCT coefficients of a contiguous run of spectral lines, shape (N,).
    alpha : float
        Scalefactor gain of the scalefactor band the lines belong to.

    Returns
    -------
    QuantizedSymbols
        Quantized symbols S(k) as int64, same shape as ``x``.
    """
    coeffs = np.asarray(x, dtype=np.float64)

    gain = 2.0 ** (-0.25 * float(alpha))
    compressed = np.power(np.abs(coeffs) * gain, 0.75, dtype=np.float64)

    # The argument is never negative, so floor() is the same as the
    # truncating "int" of the assignment formula.
    magnitude = np.floor(compressed + MAGIC_NUMBER).astype(np.int64)
    signs = np.sign(coeffs).astype(np.int64)
    return (signs * magnitude).astype(np.int64)
|
||
|
||
|
||
def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray:
    """
    Reconstruct MDCT coefficients from quantization symbols.

    Implements Eq. (13):
        Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4)

    Parameters
    ----------
    S : QuantizedSymbols
        Quantized symbols S(k), converted to int64, shape (N,).
    alpha : float
        Scalefactor gain of the scalefactor band the symbols belong to.

    Returns
    -------
    FloatArray
        Reconstructed coefficients Xhat(k) as float64, same shape as ``S``.
    """
    symbols = np.asarray(S, dtype=np.int64)

    gain = 2.0 ** (0.25 * float(alpha))
    magnitude = np.power(
        np.abs(symbols).astype(np.float64), 4.0 / 3.0, dtype=np.float64
    )
    signs = np.sign(symbols).astype(np.float64)

    return (signs * magnitude * gain).astype(np.float64)
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Alpha initialization (Eq. 14)
# -----------------------------------------------------------------------------
|
||
def _alpha_initial_from_frame_max(x_all: FloatArray) -> int:
    """
    Estimate the starting scalefactor gain alpha_hat of a frame.

    Implements Eq. (14):
        alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ )

    Every band starts from this value before the per-band refinement.

    Parameters
    ----------
    x_all : FloatArray
        All MDCT coefficients of the frame (one channel); any shape, it is
        flattened internally.

    Returns
    -------
    int
        Floor of alpha_hat, or 0 for an empty or all-zero frame.
    """
    flat = np.asarray(x_all, dtype=np.float64).reshape(-1)
    if flat.size == 0:
        return 0

    peak = float(np.abs(flat).max())
    if peak <= 0.0:
        # All-zero frame: log2 would be undefined, fall back to 0.
        return 0

    ratio = (peak ** 0.75) / float(MQ)
    if ratio <= 0.0:
        return 0

    return int(np.floor((16.0 / 3.0) * np.log2(ratio)))
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Band utilities
# -----------------------------------------------------------------------------
|
||
def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]:
    """
    Build the scalefactor band index ranges for a frame type.

    The limits come from the table returned by ``get_table(frame_type)``
    (psychoacoustic TableB219) and are used directly as MDCT indices:
      - long:  0..1023
      - short (ESH subframe): 0..127

    Parameters
    ----------
    frame_type : FrameType
        Frame type ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    list[tuple[int, int]]
        One ``(lo, hi)`` pair per band, both ends inclusive.
    """
    table, _nfft = get_table(frame_type)
    lower, upper, _bval, _qthr_db = band_limits(table)
    return [(int(lo), int(hi)) for lo, hi in zip(lower, upper)]
|
||
|
||
|
||
def _band_energy(x: FloatArray, lo: int, hi: int) -> float:
    """
    Return the energy (sum of squares) of the spectral lines ``x[lo..hi]``.

    Parameters
    ----------
    x : FloatArray
        MDCT coefficient vector.
    lo, hi : int
        First and last index of the band, both inclusive.

    Returns
    -------
    float
        Sum of the squared coefficients inside the band.
    """
    band = slice(lo, hi + 1)  # +1 because ``hi`` is inclusive
    segment = x[band]
    squared = segment * segment
    return float(squared.sum())
|
||
|
||
|
||
def _threshold_T_from_SMR(
    X: FloatArray,
    SMR_col: FloatArray,
    bands: list[tuple[int, int]],
) -> FloatArray:
    """
    Derive the per-band psychoacoustic thresholds T(b) from the SMR.

    For each band:
        P(b) = sum_{k in band} X(k)^2
        T(b) = P(b) / SMR(b)

    Bands whose SMR is at or below ``EPS`` get a threshold of 0.

    Parameters
    ----------
    X : FloatArray
        MDCT coefficients of a long frame or of one ESH subframe.
    SMR_col : FloatArray
        SMR values for this frame/subframe, shape (NB,).
    bands : list[tuple[int, int]]
        Inclusive ``(lo, hi)`` index range of every band.

    Returns
    -------
    FloatArray
        Threshold vector T(b) as float64, shape (NB,).
    """
    thresholds = np.zeros((len(bands),), dtype=np.float64)

    for idx, (lo, hi) in enumerate(bands):
        energy = _band_energy(X, lo, hi)
        ratio = float(SMR_col[idx])
        # A vanishing SMR would blow the division up; use a zero threshold.
        thresholds[idx] = 0.0 if ratio <= EPS else energy / ratio

    return thresholds
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Alpha selection per band + neighbor-difference constraint
# -----------------------------------------------------------------------------
|
||
def _best_alpha_for_band(
    X: FloatArray,
    lo: int,
    hi: int,
    T_b: float,
    alpha0: int,
    alpha_min: int,
    alpha_max: int,
) -> int:
    """
    Pick the scalefactor alpha(b) of one band by stepping alpha upwards.

    Starting from ``alpha0`` (clamped to the given bounds), alpha is raised
    one step at a time while the quantization error power

        P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2

    stays at or below the threshold ``T_b``. The last alpha that satisfied
    the condition is returned.

    Parameters
    ----------
    X : FloatArray
        Full MDCT vector (one frame or one subframe), shape (N,).
    lo, hi : int
        Inclusive MDCT index range of the band.
    T_b : float
        Psychoacoustic threshold of the band.
    alpha0 : int
        Starting alpha for the band.
    alpha_min, alpha_max : int
        Safeguard bounds for alpha.

    Returns
    -------
    int
        Selected alpha(b). When ``T_b <= 0`` this is ``alpha0`` unchanged;
        when the starting alpha already violates the threshold it is the
        clamped starting value.
    """
    if T_b <= 0.0:
        return int(alpha0)

    segment = X[lo : hi + 1]

    def error_power(a: int) -> float:
        # Quantize and reconstruct the band at gain ``a`` and measure the error.
        rebuilt = _dequantize_symbol(_quantize_symbol(segment, a), a)
        return float(np.sum((segment - rebuilt) ** 2))

    best = max(alpha_min, min(int(alpha0), alpha_max))

    if error_power(best) > T_b:
        # Already above the threshold: the search only moves upwards, so stop.
        return best

    for _ in range(MAX_ALPHA_ITERS):
        candidate = best + 1
        if candidate > alpha_max or error_power(candidate) > T_b:
            break
        best = candidate

    return best
|
||
|
||
|
||
def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray:
    """
    Clamp alpha so that neighbouring bands differ by at most ``max_diff``.

    A forward pass clamps each band against its left neighbour, then a
    backward pass clamps each band against its right neighbour. The input
    array is not modified.

    Parameters
    ----------
    alpha : np.ndarray
        Alpha vector, shape (NB,).
    max_diff : int
        Maximum allowed absolute difference between adjacent bands.

    Returns
    -------
    np.ndarray
        Clamped copy of alpha as int64, shape (NB,).
    """
    clamped = np.array(alpha, dtype=np.int64, copy=True)
    count = clamped.shape[0]

    def clamp(index: int, anchor: int) -> None:
        # Pull clamped[index] into [clamped[anchor] - max_diff, clamped[anchor] + max_diff].
        lower = int(clamped[anchor] - max_diff)
        upper = int(clamped[anchor] + max_diff)
        if clamped[index] < lower:
            clamped[index] = lower
        elif clamped[index] > upper:
            clamped[index] = upper

    for b in range(1, count):
        clamp(b, b - 1)

    for b in reversed(range(count - 1)):
        clamp(b, b + 1)

    return clamped
|
||
|
||
|
||
# -----------------------------------------------------------------------------
# Public API
# -----------------------------------------------------------------------------
|
||
def _quantize_vector(
    X_vec: FloatArray,
    SMR_vec: FloatArray,
    bands: list[tuple[int, int]],
) -> tuple[np.ndarray, np.ndarray]:
    """
    Quantize one spectral vector (a long frame or a single ESH subframe).

    Runs the full per-vector pipeline: thresholds from SMR, initial alpha,
    per-band alpha search, neighbour-difference clamping, DPCM coding of the
    scalefactors and finally quantization of every band.

    Parameters
    ----------
    X_vec : FloatArray
        MDCT coefficients, 1-D.
    SMR_vec : FloatArray
        SMR per band, shape (NB,).
    bands : list[tuple[int, int]]
        Inclusive ``(lo, hi)`` index range of every band.

    Returns
    -------
    symbols : np.ndarray
        Quantized symbols as int64, same shape as ``X_vec``. Lines not covered
        by any band stay 0.
    sfc_col : np.ndarray
        int64 vector of shape (NB,): ``sfc_col[0] = alpha(0)`` and
        ``sfc_col[b] = alpha(b) - alpha(b-1)`` for ``b >= 1``.
    """
    nb = len(bands)
    thresholds = _threshold_T_from_SMR(X_vec, SMR_vec, bands)

    # Every band starts from the same frame-wide estimate (Eq. 14).
    alpha0 = _alpha_initial_from_frame_max(X_vec)
    alpha = np.full((nb,), alpha0, dtype=np.int64)
    for b, (lo, hi) in enumerate(bands):
        alpha[b] = _best_alpha_for_band(
            X=X_vec,
            lo=lo,
            hi=hi,
            T_b=float(thresholds[b]),
            alpha0=alpha0,
            alpha_min=-4096,
            alpha_max=4096,
        )

    alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)

    # DPCM coding: first entry carries alpha(0), the rest carry differences.
    sfc_col = np.empty((nb,), dtype=np.int64)
    sfc_col[0] = alpha[0]
    sfc_col[1:] = np.diff(alpha)

    symbols = np.zeros(X_vec.shape, dtype=np.int64)
    for b, (lo, hi) in enumerate(bands):
        symbols[lo : hi + 1] = _quantize_symbol(X_vec[lo : hi + 1], float(alpha[b]))

    return symbols, sfc_col


def aac_quantizer(
    frame_F: FrameChannelF,
    frame_type: FrameType,
    SMR: FloatArray,
) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]:
    """
    AAC quantizer for one channel (Level 3).

    Quantizes MDCT coefficients using band-wise scalefactors derived from SMR.

    Parameters
    ----------
    frame_F : FrameChannelF
        MDCT coefficients after TNS, one channel.
        Shapes:
          - Long frames: (1024,) or (1024, 1)
          - ESH: (128, 8)
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").
    SMR : FloatArray
        Signal-to-Mask Ratio per band.
        Shapes:
          - Long: (NB,) or (NB, 1)
          - ESH: (NB, 8)

    Returns
    -------
    S : QuantizedSymbols
        Quantized symbols S(k) as int64, shape (1024, 1) for all frame types.
        For ESH, subframe ``j`` occupies rows ``128*j .. 128*j + 127``.
    sfc : ScaleFactors
        DPCM-coded scalefactors as int64: ``sfc(0) = alpha(0)`` and
        ``sfc(b) = alpha(b) - alpha(b-1)`` for ``b >= 1``.
        Shapes:
          - Long: (NB, 1)
          - ESH: (NB, 8)
    G : GlobalGain
        Global gain G = alpha(0).
          - Long: scalar float
          - ESH: float64 array of shape (1, 8)

    Raises
    ------
    ValueError
        If ``frame_F`` or ``SMR`` do not have a shape matching ``frame_type``.
    """
    bands = _band_slices(frame_type)
    NB = len(bands)

    X = np.asarray(frame_F, dtype=np.float64)
    SMR = np.asarray(SMR, dtype=np.float64)

    if frame_type == "ESH":
        if X.shape != (128, 8):
            raise ValueError("For ESH, frame_F must have shape (128, 8).")
        if SMR.shape != (NB, 8):
            raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")

        S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
        sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)

        for j in range(8):
            Sj, sfc_col = _quantize_vector(X[:, j], SMR[:, j], bands)
            sfc[:, j] = sfc_col
            # Write straight into the packed output with an explicit slice.
            # Assigning through a reshaped temporary would only work if
            # reshape happened to return a view.
            S_out[128 * j : 128 * (j + 1), 0] = Sj

        # G(j) = alpha_j(0), which is exactly the first DPCM row.
        G_arr = sfc[:1, :].astype(np.float64)
        return S_out, sfc, G_arr

    # Long frames
    if X.shape == (1024,):
        Xv = X
    elif X.shape == (1024, 1):
        Xv = X[:, 0]
    else:
        raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).")

    if SMR.shape == (NB,):
        SMRv = SMR
    elif SMR.shape == (NB, 1):
        SMRv = SMR[:, 0]
    else:
        raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")

    S_vec, sfc_col = _quantize_vector(Xv, SMRv, bands)
    G: float = float(sfc_col[0])

    return S_vec.reshape(1024, 1), sfc_col.reshape(NB, 1), G
|
||
|
||
|
||
def _dequantize_vector(
    S_vec: np.ndarray,
    sfc_col: np.ndarray,
    bands: list[tuple[int, int]],
) -> np.ndarray:
    """
    Reconstruct one spectral vector (a long frame or a single ESH subframe).

    Parameters
    ----------
    S_vec : np.ndarray
        Quantized symbols, int64, 1-D.
    sfc_col : np.ndarray
        DPCM-coded scalefactors, shape (NB,), with ``sfc_col[0] = alpha(0)``.
    bands : list[tuple[int, int]]
        Inclusive ``(lo, hi)`` index range of every band.

    Returns
    -------
    np.ndarray
        Reconstructed MDCT coefficients as float64, same shape as ``S_vec``.
        Lines not covered by any band stay 0.
    """
    # Undo the DPCM coding: alpha(b) = sfc(0) + sfc(1) + ... + sfc(b).
    alpha = np.cumsum(sfc_col, dtype=np.int64)

    X_vec = np.zeros(S_vec.shape, dtype=np.float64)
    for b, (lo, hi) in enumerate(bands):
        X_vec[lo : hi + 1] = _dequantize_symbol(S_vec[lo : hi + 1], float(alpha[b]))
    return X_vec


def aac_i_quantizer(
    S: QuantizedSymbols,
    sfc: ScaleFactors,
    G: GlobalGain,
    frame_type: FrameType,
) -> FrameChannelF:
    """
    Inverse quantizer (iQuantizer) for one channel.

    Reconstructs MDCT coefficients from quantized symbols and DPCM scalefactors.

    Parameters
    ----------
    S : QuantizedSymbols
        Quantized symbols, shape (1024, 1) (or any array with 1024 elements).
        For ESH, subframe ``j`` is expected in elements ``128*j .. 128*j + 127``.
    sfc : ScaleFactors
        DPCM-coded scalefactors with ``sfc(0) = alpha(0)``.
        Shapes:
          - Long: (NB, 1)
          - ESH: (NB, 8)
    G : GlobalGain
        Global gain. Not read by this function because ``sfc`` already carries
        alpha(0) in its first row; kept for API compatibility with the
        assignment.
    frame_type : FrameType
        AAC frame type.

    Returns
    -------
    FrameChannelF
        Reconstructed MDCT coefficients (float64):
          - ESH: (128, 8)
          - Long: (1024, 1)

    Raises
    ------
    ValueError
        If ``S`` does not hold 1024 symbols or ``sfc`` has the wrong shape
        for ``frame_type``.
    """
    bands = _band_slices(frame_type)
    NB = len(bands)

    S_flat = np.asarray(S, dtype=np.int64).reshape(-1)
    if S_flat.shape[0] != 1024:
        raise ValueError("S must contain 1024 symbols.")

    sfc = np.asarray(sfc, dtype=np.int64)

    if frame_type == "ESH":
        if sfc.shape != (NB, 8):
            raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).")

        # Column-major unpacking, kept in int64 so symbols never take a
        # detour through float64.
        S_128x8 = S_flat.reshape(128, 8, order="F")

        Xrec = np.zeros((128, 8), dtype=np.float64)
        for j in range(8):
            Xrec[:, j] = _dequantize_vector(S_128x8[:, j], sfc[:, j], bands)
        return Xrec

    if sfc.shape != (NB, 1):
        raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).")

    return _dequantize_vector(S_flat, sfc[:, 0], bands).reshape(1024, 1)
|