diff --git a/source/core/aac_coder.py b/source/core/aac_coder.py index 3e9cc96..07034e6 100644 --- a/source/core/aac_coder.py +++ b/source/core/aac_coder.py @@ -485,46 +485,16 @@ def aac_coder_3( sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...] - # Codebook 11: - # maxAbsCodeVal = 16 is RESERVED for ESCAPE. - # We must stay strictly within [-15, +15] to avoid escape decoding. - # sf_cb = 11 - # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 - # - # sfc_L_dpcm = np.clip( - # sfc_L_dpcm, - # -sf_max_abs, - # sf_max_abs, - # ).astype(np.int64, copy=False) - # - # sfc_R_dpcm = np.clip( - # sfc_R_dpcm, - # -sf_max_abs, - # sf_max_abs, - # ).astype(np.int64, copy=False) - - sfc_L_stream, cb_sfc_L = aac_encode_huff( - sfc_L_dpcm.reshape(-1, order="F"), - huff_LUT_list, - # force_codebook=11, - ) - sfc_R_stream, cb_sfc_R = aac_encode_huff( - sfc_R_dpcm.reshape(-1, order="F"), - huff_LUT_list, - # force_codebook=11, - ) + # sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11) + # sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11) + sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list) + sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list) if cb_sfc_L != 11 or cb_sfc_R != 11: - print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}") + raise ValueError(f"Illegal codebook value for frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}.") - mdct_L_stream, cb_L = aac_encode_huff( - np.asarray(S_L, dtype=np.int64).reshape(-1), - huff_LUT_list, - ) - mdct_R_stream, cb_R = aac_encode_huff( - np.asarray(S_R, dtype=np.int64).reshape(-1), - huff_LUT_list, - ) + mdct_L_stream, cb_L = aac_encode_huff(np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list) + mdct_R_stream, cb_R = aac_encode_huff(np.asarray(S_R, dtype=np.int64).reshape(-1), huff_LUT_list) # Typed dict construction helps static analyzers validate the schema. frame_out: AACSeq3Frame = { diff --git a/source/core/aac_quantizer.py b/source/core/aac_quantizer.py index addcfe8..5bc8526 100644 --- a/source/core/aac_quantizer.py +++ b/source/core/aac_quantizer.py @@ -28,13 +28,8 @@ from core.aac_types import * # Constants (assignment) # ----------------------------------------------------------------------------- MAGIC_NUMBER: float = 0.4054 -MQ: int = 8191 - EPS: float = 1e-12 -MAX_SFC_DIFF: int = 60 - -# Safeguard: prevents infinite loops in pathological cases -MAX_ALPHA_ITERS: int = 2000 +MAX_SF_DELTA:int = 60 # ----------------------------------------------------------------------------- @@ -149,39 +144,35 @@ def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray: # ----------------------------------------------------------------------------- # Alpha initialization (Eq. 14) # ----------------------------------------------------------------------------- -def _alpha_initial_from_frame_max(x_all: FloatArray) -> int: +def _initial_alpha_hat(X: "FloatArray", MQ: int = 8191) -> int: """ - Compute the initial scalefactor gain alpha_hat for a frame. + Compute the initial scalefactor estimate alpha_hat for a frame. - Implements Eq. (14): - alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ ) + The assignment proposes the following first approximation (Equation 14): - The same initial value is used for all bands before the per-band refinement. + alpha_hat = (16/3) * log2( max_k(|X(k)|)^(3/4) / MQ ) + + where max_k runs over all MDCT coefficients of the frame (not per band), + and MQ is the maximum quantization level parameter (2*MQ + 1 levels). Parameters ---------- - x_all : FloatArray - All MDCT coefficients of a frame (one channel), flattened. + X : FloatArray + MDCT coefficients of one frame (or one ESH subframe), shape (N,). + MQ : int + Quantizer parameter (default 8191, as per assignment). Returns ------- int - Initial alpha value (integer). + Integer alpha_hat (rounded to nearest integer). """ - x_all = np.asarray(x_all, dtype=np.float64).reshape(-1) - if x_all.size == 0: + x_max = float(np.max(np.abs(X))) + if x_max <= 0.0: return 0 - max_abs = float(np.max(np.abs(x_all))) - if max_abs <= 0.0: - return 0 - - val = (max_abs ** 0.75) / float(MQ) - if val <= 0.0: - return 0 - - alpha_hat = (16.0 / 3.0) * np.log2(val) - return int(np.floor(alpha_hat)) + alpha_hat = (16.0 / 3.0) * np.log2((x_max ** (3.0 / 4.0)) / float(MQ)) + return int(np.round(alpha_hat)) # ----------------------------------------------------------------------------- @@ -279,35 +270,38 @@ def _threshold_T_from_SMR( # Alpha selection per band + neighbor-difference constraint # ----------------------------------------------------------------------------- def _best_alpha_for_band( - X: FloatArray, - lo: int, - hi: int, - T_b: float, - alpha0: int, - alpha_min: int, - alpha_max: int, + X: "FloatArray", lo: int, hi: int, T_b: float, + alpha_hat: int, alpha_prev: int, alpha_min: int, alpha_max: int, ) -> int: """ - Determine the band-wise scalefactor alpha(b) by iteratively increasing alpha. + Determine the band-wise scalefactor alpha(b) following the assignment. - The algorithm increases alpha until the quantization error power exceeds - the band threshold: + Procedure: + - Start from a frame-wise initial estimate alpha_hat. + - Iteratively increase alpha(b) by 1 as long as the quantization error power + stays below the psychoacoustic threshold T(b): P_e(b) = sum_{k in band} ( X(k) - Xhat(k) )^2 - P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2 - select the largest alpha such that P_e(b) <= T(b) + - Stop increasing alpha(b) if the neighbor constraint would be violated: |alpha(b) - alpha(b-1)| <= 60 + When processing bands sequentially (low -> high), this becomes: alpha(b) <= alpha_prev + 60 + + Notes: + - This function does not decrease alpha if the initial value already violates + the threshold; the assignment only specifies iterative increase. Parameters ---------- X : FloatArray - Full MDCT vector (one frame or one subframe), shape (N,). + Full MDCT vector of the current (sub)frame, shape (N,). lo, hi : int - Inclusive MDCT index range defining the band. + Band index bounds (inclusive), defining the band slice. T_b : float - Psychoacoustic threshold for this band. - alpha0 : int - Initial alpha value for the band. + Threshold T(b) for this band. + alpha_hat : int + Initial frame-wise estimate (Equation 14). + alpha_prev : int + Previously selected alpha for band b-1 (neighbor constraint reference). alpha_min, alpha_max : int - Bounds for alpha, used as a safeguard. + Safeguard bounds for alpha. Returns ------- @@ -315,81 +309,41 @@ def _best_alpha_for_band( Selected integer alpha(b). """ if T_b <= 0.0: - return int(alpha0) - + return int(alpha_hat) Xsec = X[lo : hi + 1] - alpha = int(alpha0) - alpha = max(alpha_min, min(alpha, alpha_max)) + # Neighbor constraint (sequential processing): alpha(b) <= alpha_prev + 60 + alpha_limit = min(int(alpha_max), int(alpha_prev) + MAX_SF_DELTA) + + # Start from alpha_hat, clamped to feasible range + alpha = int(alpha_hat) + alpha = max(int(alpha_min), min(alpha, int(alpha_limit))) # Evaluate at current alpha Ssec = _quantize_symbol(Xsec, alpha) Xhat = _dequantize_symbol(Ssec, alpha) Pe = float(np.sum((Xsec - Xhat) ** 2)) + # If already above threshold, return current alpha (no decrease step specified) if Pe > T_b: return alpha - iters = 0 - while iters < MAX_ALPHA_ITERS: - iters += 1 + # Increase alpha while still under threshold and within constraints + while True: alpha_next = alpha + 1 - if alpha_next > alpha_max: + if alpha_next > alpha_limit: break - Ssec = _quantize_symbol(Xsec, alpha_next) Xhat = _dequantize_symbol(Ssec, alpha_next) Pe_next = float(np.sum((Xsec - Xhat) ** 2)) if Pe_next > T_b: break - alpha = alpha_next - Pe = Pe_next return alpha -def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray: - """ - Enforce neighbor constraint |alpha[b] - alpha[b-1]| <= max_diff by clamping. - - Uses a forward pass and a backward pass to reduce drift. - - Parameters - ---------- - alpha : np.ndarray - Alpha vector, shape (NB,). - max_diff : int - Maximum allowed absolute difference between adjacent bands. - - Returns - ------- - np.ndarray - Clamped alpha vector, int64, shape (NB,). - """ - a = np.asarray(alpha, dtype=np.int64).copy() - nb = a.shape[0] - - for b in range(1, nb): - lo = int(a[b - 1] - max_diff) - hi = int(a[b - 1] + max_diff) - if a[b] < lo: - a[b] = lo - elif a[b] > hi: - a[b] = hi - - for b in range(nb - 2, -1, -1): - lo = int(a[b + 1] - max_diff) - hi = int(a[b + 1] + max_diff) - if a[b] < lo: - a[b] = lo - elif a[b] > hi: - a[b] = hi - - return a - - # ----------------------------------------------------------------------------- # Public API # ----------------------------------------------------------------------------- @@ -401,7 +355,16 @@ def aac_quantizer( """ AAC quantizer for one channel (Level 3). - Quantizes MDCT coefficients using band-wise scalefactors derived from SMR. + Quantizes MDCT coefficients (after TNS) using band-wise scalefactors derived + from psychoacoustic thresholds computed via SMR. + + The implementation follows the assignment procedure: + - Compute an initial frame-wise alpha_hat using Equation (14), based on the + maximum MDCT coefficient magnitude of the (sub)frame. + - For each band b, increase alpha(b) by 1 while the quantization error power + P_e(b) stays below the threshold T(b). + - Enforce the neighbor constraint |alpha(b) - alpha(b-1)| <= 60 during the + band-by-band search (no post-processing needed). Parameters ---------- @@ -421,16 +384,19 @@ def aac_quantizer( Returns ------- S : QuantizedSymbols - Quantized symbols S(k), shape (1024, 1) for all frame types. + Quantized symbols S(k), packed as shape (1024, 1) for all frame types. + For ESH, the 8 subframes are packed in column-major subframe layout. sfc : ScaleFactors - DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1). + DPCM-coded scalefactors: + sfc(0) = alpha(0) = G + sfc(b) = alpha(b) - alpha(b-1), for b > 0 Shapes: - Long: (NB, 1) - ESH: (NB, 8) G : GlobalGain Global gain G = alpha(0). - Long: scalar float - - ESH: array shape (1, 8) + - ESH: array shape (1, 8), dtype float64 """ bands = _band_slices(frame_type) NB = len(bands) @@ -438,6 +404,9 @@ def aac_quantizer( X = np.asarray(frame_F, dtype=np.float64) SMR = np.asarray(SMR, dtype=np.float64) + # ------------------------------------------------------------------------- + # ESH: 8 short subframes, each of length 128 + # ------------------------------------------------------------------------- if frame_type == "ESH": if X.shape != (128, 8): raise ValueError("For ESH, frame_F must have shape (128, 8).") @@ -446,42 +415,58 @@ def aac_quantizer( S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64) sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64) - G_arr = np.zeros((1, 8), dtype=np.int64) + G_arr = np.zeros((1, 8), dtype=np.float64) + + # Packed output view: (128, 8) with column-major layout + S_pack = S_out[:, 0].reshape(128, 8, order="F") for j in range(8): Xj = X[:, j].reshape(128) SMRj = SMR[:, j].reshape(NB) + # Compute psychoacoustic threshold T(b) for this subframe T = _threshold_T_from_SMR(Xj, SMRj, bands) - alpha0 = _alpha_initial_from_frame_max(Xj) - alpha = np.full((NB,), alpha0, dtype=np.int64) + # Frame-wise initial estimate alpha_hat (Equation 14) + alpha_hat = _initial_alpha_hat(Xj) + + # Band-wise scalefactors alpha(b) + alpha = np.zeros((NB,), dtype=np.int64) + alpha_prev = int(alpha_hat) for b, (lo, hi) in enumerate(bands): - alpha[b] = _best_alpha_for_band( - X=Xj, lo=lo, hi=hi, T_b=float(T[b]), - alpha0=int(alpha[b]), + alpha_b = _best_alpha_for_band( + X=Xj, + lo=lo, + hi=hi, + T_b=float(T[b]), + alpha_hat=int(alpha_hat), + alpha_prev=int(alpha_prev), alpha_min=-4096, alpha_max=4096, ) + alpha[b] = int(alpha_b) + alpha_prev = int(alpha_b) - alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF) - - G_arr[0, j] = int(alpha[0]) + # DPCM-coded scalefactors + G_arr[0, j] = float(alpha[0]) sfc[0, j] = int(alpha[0]) for b in range(1, NB): sfc[b, j] = int(alpha[b] - alpha[b - 1]) + # Quantize MDCT coefficients band-by-band Sj = np.zeros((128,), dtype=np.int64) for b, (lo, hi) in enumerate(bands): Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b])) - # Place this subframe in the packed output (column-major subframe layout) - S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj + # Store subframe in packed output + S_pack[:, j] = Sj - return S_out, sfc, G_arr.astype(np.float64) + return S_out, sfc, G_arr - # Long frames + # ------------------------------------------------------------------------- + # Long frames: OLS / LSS / LPS, length 1024 + # ------------------------------------------------------------------------- if X.shape == (1024,): Xv = X elif X.shape == (1024, 1): @@ -496,33 +481,44 @@ def aac_quantizer( else: raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).") + # Compute psychoacoustic threshold T(b) for the long frame T = _threshold_T_from_SMR(Xv, SMRv, bands) - alpha0 = _alpha_initial_from_frame_max(Xv) - alpha = np.full((NB,), alpha0, dtype=np.int64) + # Frame-wise initial estimate alpha_hat (Equation 14) + alpha_hat = _initial_alpha_hat(Xv) + + # Band-wise scalefactors alpha(b) + alpha = np.zeros((NB,), dtype=np.int64) + alpha_prev = int(alpha_hat) for b, (lo, hi) in enumerate(bands): - alpha[b] = _best_alpha_for_band( - X=Xv, lo=lo, hi=hi, T_b=float(T[b]), - alpha0=int(alpha[b]), + alpha_b = _best_alpha_for_band( + X=Xv, + lo=lo, + hi=hi, + T_b=float(T[b]), + alpha_hat=int(alpha_hat), + alpha_prev=int(alpha_prev), alpha_min=-4096, alpha_max=4096, ) + alpha[b] = int(alpha_b) + alpha_prev = int(alpha_b) - alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF) - - sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64) - sfc[0, 0] = int(alpha[0]) + # DPCM-coded scalefactors + sfc_out: ScaleFactors = np.zeros((NB, 1), dtype=np.int64) + sfc_out[0, 0] = int(alpha[0]) for b in range(1, NB): - sfc[b, 0] = int(alpha[b] - alpha[b - 1]) + sfc_out[b, 0] = int(alpha[b] - alpha[b - 1]) G: float = float(alpha[0]) + # Quantize MDCT coefficients band-by-band S_vec = np.zeros((1024,), dtype=np.int64) for b, (lo, hi) in enumerate(bands): S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b])) - return S_vec.reshape(1024, 1), sfc, G + return S_vec.reshape(1024, 1), sfc_out, G def aac_i_quantizer( diff --git a/source/core/tests/test_aac_coder_decoder.py b/source/core/tests/test_aac_coder_decoder.py index 7241902..54207fc 100644 --- a/source/core/tests/test_aac_coder_decoder.py +++ b/source/core/tests/test_aac_coder_decoder.py @@ -310,5 +310,5 @@ def test_end_to_end_level_3_high_snr(mk_actual_stereo_wav: Path, tmp_path: Path) n = min(x_ref.shape[0], y_hat.shape[0]) s = snr_db(x_ref[:n, :], y_hat[:n, :]) - print(f"SNR={s}") - assert s > 2.0 + # print(f" with SNR={s}") + assert s > 7.0 diff --git a/source/level_1/core/aac_coder.py b/source/level_1/core/aac_coder.py index 3e9cc96..07034e6 100644 --- a/source/level_1/core/aac_coder.py +++ b/source/level_1/core/aac_coder.py @@ -485,46 +485,16 @@ def aac_coder_3( sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...] - # Codebook 11: - # maxAbsCodeVal = 16 is RESERVED for ESCAPE. - # We must stay strictly within [-15, +15] to avoid escape decoding. - # sf_cb = 11 - # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 - # - # sfc_L_dpcm = np.clip( - # sfc_L_dpcm, - # -sf_max_abs, - # sf_max_abs, - # ).astype(np.int64, copy=False) - # - # sfc_R_dpcm = np.clip( - # sfc_R_dpcm, - # -sf_max_abs, - # sf_max_abs, - # ).astype(np.int64, copy=False) - - sfc_L_stream, cb_sfc_L = aac_encode_huff( - sfc_L_dpcm.reshape(-1, order="F"), - huff_LUT_list, - # force_codebook=11, - ) - sfc_R_stream, cb_sfc_R = aac_encode_huff( - sfc_R_dpcm.reshape(-1, order="F"), - huff_LUT_list, - # force_codebook=11, - ) + # sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11) + # sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11) + sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list) + sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list) if cb_sfc_L != 11 or cb_sfc_R != 11: - print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}") + raise ValueError(f"Illegal codebook value for frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}.") - mdct_L_stream, cb_L = aac_encode_huff( - np.asarray(S_L, dtype=np.int64).reshape(-1), - huff_LUT_list, - ) - mdct_R_stream, cb_R = aac_encode_huff( - np.asarray(S_R, dtype=np.int64).reshape(-1), - huff_LUT_list, - ) + mdct_L_stream, cb_L = aac_encode_huff(np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list) + mdct_R_stream, cb_R = aac_encode_huff(np.asarray(S_R, dtype=np.int64).reshape(-1), huff_LUT_list) # Typed dict construction helps static analyzers validate the schema. frame_out: AACSeq3Frame = { diff --git a/source/level_1/material/LicorDeCalandraca_out1.wav b/source/level_1/material/LicorDeCalandraca_out1.wav deleted file mode 100644 index 2630d3a..0000000 Binary files a/source/level_1/material/LicorDeCalandraca_out1.wav and /dev/null differ diff --git a/source/level_2/core/aac_coder.py b/source/level_2/core/aac_coder.py index 3e9cc96..07034e6 100644 --- a/source/level_2/core/aac_coder.py +++ b/source/level_2/core/aac_coder.py @@ -485,46 +485,16 @@ def aac_coder_3( sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...] - # Codebook 11: - # maxAbsCodeVal = 16 is RESERVED for ESCAPE. - # We must stay strictly within [-15, +15] to avoid escape decoding. - # sf_cb = 11 - # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 - # - # sfc_L_dpcm = np.clip( - # sfc_L_dpcm, - # -sf_max_abs, - # sf_max_abs, - # ).astype(np.int64, copy=False) - # - # sfc_R_dpcm = np.clip( - # sfc_R_dpcm, - # -sf_max_abs, - # sf_max_abs, - # ).astype(np.int64, copy=False) - - sfc_L_stream, cb_sfc_L = aac_encode_huff( - sfc_L_dpcm.reshape(-1, order="F"), - huff_LUT_list, - # force_codebook=11, - ) - sfc_R_stream, cb_sfc_R = aac_encode_huff( - sfc_R_dpcm.reshape(-1, order="F"), - huff_LUT_list, - # force_codebook=11, - ) + # sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11) + # sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11) + sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list) + sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list) if cb_sfc_L != 11 or cb_sfc_R != 11: - print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}") + raise ValueError(f"Illegal codebook value for frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}.") - mdct_L_stream, cb_L = aac_encode_huff( - np.asarray(S_L, dtype=np.int64).reshape(-1), - huff_LUT_list, - ) - mdct_R_stream, cb_R = aac_encode_huff( - np.asarray(S_R, dtype=np.int64).reshape(-1), - huff_LUT_list, - ) + mdct_L_stream, cb_L = aac_encode_huff(np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list) + mdct_R_stream, cb_R = aac_encode_huff(np.asarray(S_R, dtype=np.int64).reshape(-1), huff_LUT_list) # Typed dict construction helps static analyzers validate the schema. frame_out: AACSeq3Frame = { diff --git a/source/level_2/material/LicorDeCalandraca_out2.wav b/source/level_2/material/LicorDeCalandraca_out2.wav deleted file mode 100644 index 2630d3a..0000000 Binary files a/source/level_2/material/LicorDeCalandraca_out2.wav and /dev/null differ diff --git a/source/level_3/core/aac_coder.py b/source/level_3/core/aac_coder.py index 3e9cc96..07034e6 100644 --- a/source/level_3/core/aac_coder.py +++ b/source/level_3/core/aac_coder.py @@ -485,46 +485,16 @@ def aac_coder_3( sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...] - # Codebook 11: - # maxAbsCodeVal = 16 is RESERVED for ESCAPE. - # We must stay strictly within [-15, +15] to avoid escape decoding. - # sf_cb = 11 - # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 - # - # sfc_L_dpcm = np.clip( - # sfc_L_dpcm, - # -sf_max_abs, - # sf_max_abs, - # ).astype(np.int64, copy=False) - # - # sfc_R_dpcm = np.clip( - # sfc_R_dpcm, - # -sf_max_abs, - # sf_max_abs, - # ).astype(np.int64, copy=False) - - sfc_L_stream, cb_sfc_L = aac_encode_huff( - sfc_L_dpcm.reshape(-1, order="F"), - huff_LUT_list, - # force_codebook=11, - ) - sfc_R_stream, cb_sfc_R = aac_encode_huff( - sfc_R_dpcm.reshape(-1, order="F"), - huff_LUT_list, - # force_codebook=11, - ) + # sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11) + # sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11) + sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list) + sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list) if cb_sfc_L != 11 or cb_sfc_R != 11: - print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}") + raise ValueError(f"Illegal codebook value for frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}.") - mdct_L_stream, cb_L = aac_encode_huff( - np.asarray(S_L, dtype=np.int64).reshape(-1), - huff_LUT_list, - ) - mdct_R_stream, cb_R = aac_encode_huff( - np.asarray(S_R, dtype=np.int64).reshape(-1), - huff_LUT_list, - ) + mdct_L_stream, cb_L = aac_encode_huff(np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list) + mdct_R_stream, cb_R = aac_encode_huff(np.asarray(S_R, dtype=np.int64).reshape(-1), huff_LUT_list) # Typed dict construction helps static analyzers validate the schema. frame_out: AACSeq3Frame = { diff --git a/source/level_3/core/aac_quantizer.py b/source/level_3/core/aac_quantizer.py index addcfe8..5bc8526 100644 --- a/source/level_3/core/aac_quantizer.py +++ b/source/level_3/core/aac_quantizer.py @@ -28,13 +28,8 @@ from core.aac_types import * # Constants (assignment) # ----------------------------------------------------------------------------- MAGIC_NUMBER: float = 0.4054 -MQ: int = 8191 - EPS: float = 1e-12 -MAX_SFC_DIFF: int = 60 - -# Safeguard: prevents infinite loops in pathological cases -MAX_ALPHA_ITERS: int = 2000 +MAX_SF_DELTA:int = 60 # ----------------------------------------------------------------------------- @@ -149,39 +144,35 @@ def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray: # ----------------------------------------------------------------------------- # Alpha initialization (Eq. 14) # ----------------------------------------------------------------------------- -def _alpha_initial_from_frame_max(x_all: FloatArray) -> int: +def _initial_alpha_hat(X: "FloatArray", MQ: int = 8191) -> int: """ - Compute the initial scalefactor gain alpha_hat for a frame. + Compute the initial scalefactor estimate alpha_hat for a frame. - Implements Eq. (14): - alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ ) + The assignment proposes the following first approximation (Equation 14): - The same initial value is used for all bands before the per-band refinement. + alpha_hat = (16/3) * log2( max_k(|X(k)|)^(3/4) / MQ ) + + where max_k runs over all MDCT coefficients of the frame (not per band), + and MQ is the maximum quantization level parameter (2*MQ + 1 levels). Parameters ---------- - x_all : FloatArray - All MDCT coefficients of a frame (one channel), flattened. + X : FloatArray + MDCT coefficients of one frame (or one ESH subframe), shape (N,). + MQ : int + Quantizer parameter (default 8191, as per assignment). Returns ------- int - Initial alpha value (integer). + Integer alpha_hat (rounded to nearest integer). """ - x_all = np.asarray(x_all, dtype=np.float64).reshape(-1) - if x_all.size == 0: + x_max = float(np.max(np.abs(X))) + if x_max <= 0.0: return 0 - max_abs = float(np.max(np.abs(x_all))) - if max_abs <= 0.0: - return 0 - - val = (max_abs ** 0.75) / float(MQ) - if val <= 0.0: - return 0 - - alpha_hat = (16.0 / 3.0) * np.log2(val) - return int(np.floor(alpha_hat)) + alpha_hat = (16.0 / 3.0) * np.log2((x_max ** (3.0 / 4.0)) / float(MQ)) + return int(np.round(alpha_hat)) # ----------------------------------------------------------------------------- @@ -279,35 +270,38 @@ def _threshold_T_from_SMR( # Alpha selection per band + neighbor-difference constraint # ----------------------------------------------------------------------------- def _best_alpha_for_band( - X: FloatArray, - lo: int, - hi: int, - T_b: float, - alpha0: int, - alpha_min: int, - alpha_max: int, + X: "FloatArray", lo: int, hi: int, T_b: float, + alpha_hat: int, alpha_prev: int, alpha_min: int, alpha_max: int, ) -> int: """ - Determine the band-wise scalefactor alpha(b) by iteratively increasing alpha. + Determine the band-wise scalefactor alpha(b) following the assignment. - The algorithm increases alpha until the quantization error power exceeds - the band threshold: + Procedure: + - Start from a frame-wise initial estimate alpha_hat. + - Iteratively increase alpha(b) by 1 as long as the quantization error power + stays below the psychoacoustic threshold T(b): P_e(b) = sum_{k in band} ( X(k) - Xhat(k) )^2 - P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2 - select the largest alpha such that P_e(b) <= T(b) + - Stop increasing alpha(b) if the neighbor constraint would be violated: |alpha(b) - alpha(b-1)| <= 60 + When processing bands sequentially (low -> high), this becomes: alpha(b) <= alpha_prev + 60 + + Notes: + - This function does not decrease alpha if the initial value already violates + the threshold; the assignment only specifies iterative increase. Parameters ---------- X : FloatArray - Full MDCT vector (one frame or one subframe), shape (N,). + Full MDCT vector of the current (sub)frame, shape (N,). lo, hi : int - Inclusive MDCT index range defining the band. + Band index bounds (inclusive), defining the band slice. T_b : float - Psychoacoustic threshold for this band. - alpha0 : int - Initial alpha value for the band. + Threshold T(b) for this band. + alpha_hat : int + Initial frame-wise estimate (Equation 14). + alpha_prev : int + Previously selected alpha for band b-1 (neighbor constraint reference). alpha_min, alpha_max : int - Bounds for alpha, used as a safeguard. + Safeguard bounds for alpha. Returns ------- @@ -315,81 +309,41 @@ def _best_alpha_for_band( Selected integer alpha(b). """ if T_b <= 0.0: - return int(alpha0) - + return int(alpha_hat) Xsec = X[lo : hi + 1] - alpha = int(alpha0) - alpha = max(alpha_min, min(alpha, alpha_max)) + # Neighbor constraint (sequential processing): alpha(b) <= alpha_prev + 60 + alpha_limit = min(int(alpha_max), int(alpha_prev) + MAX_SF_DELTA) + + # Start from alpha_hat, clamped to feasible range + alpha = int(alpha_hat) + alpha = max(int(alpha_min), min(alpha, int(alpha_limit))) # Evaluate at current alpha Ssec = _quantize_symbol(Xsec, alpha) Xhat = _dequantize_symbol(Ssec, alpha) Pe = float(np.sum((Xsec - Xhat) ** 2)) + # If already above threshold, return current alpha (no decrease step specified) if Pe > T_b: return alpha - iters = 0 - while iters < MAX_ALPHA_ITERS: - iters += 1 + # Increase alpha while still under threshold and within constraints + while True: alpha_next = alpha + 1 - if alpha_next > alpha_max: + if alpha_next > alpha_limit: break - Ssec = _quantize_symbol(Xsec, alpha_next) Xhat = _dequantize_symbol(Ssec, alpha_next) Pe_next = float(np.sum((Xsec - Xhat) ** 2)) if Pe_next > T_b: break - alpha = alpha_next - Pe = Pe_next return alpha -def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray: - """ - Enforce neighbor constraint |alpha[b] - alpha[b-1]| <= max_diff by clamping. - - Uses a forward pass and a backward pass to reduce drift. - - Parameters - ---------- - alpha : np.ndarray - Alpha vector, shape (NB,). - max_diff : int - Maximum allowed absolute difference between adjacent bands. - - Returns - ------- - np.ndarray - Clamped alpha vector, int64, shape (NB,). - """ - a = np.asarray(alpha, dtype=np.int64).copy() - nb = a.shape[0] - - for b in range(1, nb): - lo = int(a[b - 1] - max_diff) - hi = int(a[b - 1] + max_diff) - if a[b] < lo: - a[b] = lo - elif a[b] > hi: - a[b] = hi - - for b in range(nb - 2, -1, -1): - lo = int(a[b + 1] - max_diff) - hi = int(a[b + 1] + max_diff) - if a[b] < lo: - a[b] = lo - elif a[b] > hi: - a[b] = hi - - return a - - # ----------------------------------------------------------------------------- # Public API # ----------------------------------------------------------------------------- @@ -401,7 +355,16 @@ def aac_quantizer( """ AAC quantizer for one channel (Level 3). - Quantizes MDCT coefficients using band-wise scalefactors derived from SMR. + Quantizes MDCT coefficients (after TNS) using band-wise scalefactors derived + from psychoacoustic thresholds computed via SMR. + + The implementation follows the assignment procedure: + - Compute an initial frame-wise alpha_hat using Equation (14), based on the + maximum MDCT coefficient magnitude of the (sub)frame. + - For each band b, increase alpha(b) by 1 while the quantization error power + P_e(b) stays below the threshold T(b). + - Enforce the neighbor constraint |alpha(b) - alpha(b-1)| <= 60 during the + band-by-band search (no post-processing needed). Parameters ---------- @@ -421,16 +384,19 @@ def aac_quantizer( Returns ------- S : QuantizedSymbols - Quantized symbols S(k), shape (1024, 1) for all frame types. + Quantized symbols S(k), packed as shape (1024, 1) for all frame types. + For ESH, the 8 subframes are packed in column-major subframe layout. sfc : ScaleFactors - DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1). + DPCM-coded scalefactors: + sfc(0) = alpha(0) = G + sfc(b) = alpha(b) - alpha(b-1), for b > 0 Shapes: - Long: (NB, 1) - ESH: (NB, 8) G : GlobalGain Global gain G = alpha(0). - Long: scalar float - - ESH: array shape (1, 8) + - ESH: array shape (1, 8), dtype float64 """ bands = _band_slices(frame_type) NB = len(bands) @@ -438,6 +404,9 @@ def aac_quantizer( X = np.asarray(frame_F, dtype=np.float64) SMR = np.asarray(SMR, dtype=np.float64) + # ------------------------------------------------------------------------- + # ESH: 8 short subframes, each of length 128 + # ------------------------------------------------------------------------- if frame_type == "ESH": if X.shape != (128, 8): raise ValueError("For ESH, frame_F must have shape (128, 8).") @@ -446,42 +415,58 @@ def aac_quantizer( S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64) sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64) - G_arr = np.zeros((1, 8), dtype=np.int64) + G_arr = np.zeros((1, 8), dtype=np.float64) + + # Packed output view: (128, 8) with column-major layout + S_pack = S_out[:, 0].reshape(128, 8, order="F") for j in range(8): Xj = X[:, j].reshape(128) SMRj = SMR[:, j].reshape(NB) + # Compute psychoacoustic threshold T(b) for this subframe T = _threshold_T_from_SMR(Xj, SMRj, bands) - alpha0 = _alpha_initial_from_frame_max(Xj) - alpha = np.full((NB,), alpha0, dtype=np.int64) + # Frame-wise initial estimate alpha_hat (Equation 14) + alpha_hat = _initial_alpha_hat(Xj) + + # Band-wise scalefactors alpha(b) + alpha = np.zeros((NB,), dtype=np.int64) + alpha_prev = int(alpha_hat) for b, (lo, hi) in enumerate(bands): - alpha[b] = _best_alpha_for_band( - X=Xj, lo=lo, hi=hi, T_b=float(T[b]), - alpha0=int(alpha[b]), + alpha_b = _best_alpha_for_band( + X=Xj, + lo=lo, + hi=hi, + T_b=float(T[b]), + alpha_hat=int(alpha_hat), + alpha_prev=int(alpha_prev), alpha_min=-4096, alpha_max=4096, ) + alpha[b] = int(alpha_b) + alpha_prev = int(alpha_b) - alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF) - - G_arr[0, j] = int(alpha[0]) + # DPCM-coded scalefactors + G_arr[0, j] = float(alpha[0]) sfc[0, j] = int(alpha[0]) for b in range(1, NB): sfc[b, j] = int(alpha[b] - alpha[b - 1]) + # Quantize MDCT coefficients band-by-band Sj = np.zeros((128,), dtype=np.int64) for b, (lo, hi) in enumerate(bands): Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b])) - # Place this subframe in the packed output (column-major subframe layout) - S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj + # Store subframe in packed output + S_pack[:, j] = Sj - return S_out, sfc, G_arr.astype(np.float64) + return S_out, sfc, G_arr - # Long frames + # ------------------------------------------------------------------------- + # Long frames: OLS / LSS / LPS, length 1024 + # ------------------------------------------------------------------------- if X.shape == (1024,): Xv = X elif X.shape == (1024, 1): @@ -496,33 +481,44 @@ def aac_quantizer( else: raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).") + # Compute psychoacoustic threshold T(b) for the long frame T = _threshold_T_from_SMR(Xv, SMRv, bands) - alpha0 = _alpha_initial_from_frame_max(Xv) - alpha = np.full((NB,), alpha0, dtype=np.int64) + # Frame-wise initial estimate alpha_hat (Equation 14) + alpha_hat = _initial_alpha_hat(Xv) + + # Band-wise scalefactors alpha(b) + alpha = np.zeros((NB,), dtype=np.int64) + alpha_prev = int(alpha_hat) for b, (lo, hi) in enumerate(bands): - alpha[b] = _best_alpha_for_band( - X=Xv, lo=lo, hi=hi, T_b=float(T[b]), - alpha0=int(alpha[b]), + alpha_b = _best_alpha_for_band( + X=Xv, + lo=lo, + hi=hi, + T_b=float(T[b]), + alpha_hat=int(alpha_hat), + alpha_prev=int(alpha_prev), alpha_min=-4096, alpha_max=4096, ) + alpha[b] = int(alpha_b) + alpha_prev = int(alpha_b) - alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF) - - sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64) - sfc[0, 0] = int(alpha[0]) + # DPCM-coded scalefactors + sfc_out: ScaleFactors = np.zeros((NB, 1), dtype=np.int64) + sfc_out[0, 0] = int(alpha[0]) for b in range(1, NB): - sfc[b, 0] = int(alpha[b] - alpha[b - 1]) + sfc_out[b, 0] = int(alpha[b] - alpha[b - 1]) G: float = float(alpha[0]) + # Quantize MDCT coefficients band-by-band S_vec = np.zeros((1024,), dtype=np.int64) for b, (lo, hi) in enumerate(bands): S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b])) - return S_vec.reshape(1024, 1), sfc, G + return S_vec.reshape(1024, 1), sfc_out, G def aac_i_quantizer( diff --git a/source/level_3/material/LicorDeCalandraca_out2.wav b/source/level_3/material/LicorDeCalandraca_out2.wav deleted file mode 100644 index 5d2fe28..0000000 Binary files a/source/level_3/material/LicorDeCalandraca_out2.wav and /dev/null differ diff --git a/source/level_3/material/aac_seq_3.mat b/source/level_3/material/aac_seq_3.mat deleted file mode 100644 index a859043..0000000 Binary files a/source/level_3/material/aac_seq_3.mat and /dev/null differ