diff --git a/source/core/aac_coder.py b/source/core/aac_coder.py index fc3f8ac..4c6158c 100644 --- a/source/core/aac_coder.py +++ b/source/core/aac_coder.py @@ -9,8 +9,16 @@ # cchoutou@ece.auth.gr # # Description: -# - Level 1 AAC encoder orchestration. -# - Level 2 AAC encoder orchestration. +# Level 1 AAC encoder orchestration. +# Keeps the same functional behavior as the original level_1 implementation: +# - Reads WAV via soundfile +# - Validates stereo and 48 kHz +# - Frames into 2048 samples with hop=1024 and zero padding at both ends +# - SSC decision uses next-frame attack detection +# - Filterbank analysis (MDCT) +# - Stores per-channel spectra in AACSeq1 schema: +# * ESH: (128, 8) +# * else: (1024, 1) # ------------------------------------------------------------ from __future__ import annotations @@ -18,14 +26,106 @@ from pathlib import Path from typing import Union import soundfile as sf +from scipy.io import savemat from core.aac_configuration import WIN_TYPE from core.aac_filterbank import aac_filter_bank -from core.aac_ssc import aac_SSC +from core.aac_ssc import aac_ssc from core.aac_tns import aac_tns +from core.aac_psycho import aac_psycho +from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py +from core.aac_huffman import aac_encode_huff +from core.aac_utils import get_table, band_limits +from material.huff_utils import load_LUT + from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Helpers for thresholds (T(b)) +# ----------------------------------------------------------------------------- + +def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]: + """ + Return inclusive (lo, hi) band slices derived from TableB219. + """ + table, _ = get_table(frame_type) + wlow, whigh, _bval, _qthr_db = band_limits(table) + return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)] + + +def _thresholds_from_smr( + frame_F_ch: FrameChannelF, + frame_type: FrameType, + SMR: FloatArray, +) -> FloatArray: + """ + Compute thresholds T(b) = P(b) / SMR(b), where P(b) is band energy. + + Shapes: + - Long: returns (NB, 1) + - ESH: returns (NB, 8) + """ + bands = _band_slices_from_table(frame_type) + NB = len(bands) + + X = np.asarray(frame_F_ch, dtype=np.float64) + SMR = np.asarray(SMR, dtype=np.float64) + + if frame_type == "ESH": + if X.shape != (128, 8): + raise ValueError("For ESH, frame_F_ch must have shape (128, 8).") + if SMR.shape != (NB, 8): + raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).") + + T = np.zeros((NB, 8), dtype=np.float64) + for j in range(8): + Xj = X[:, j] + for b, (lo, hi) in enumerate(bands): + P = float(np.sum(Xj[lo : hi + 1] ** 2)) + smr = float(SMR[b, j]) + T[b, j] = 0.0 if smr <= 1e-12 else (P / smr) + return T + + # Long + if X.shape == (1024,): + Xv = X + elif X.shape == (1024, 1): + Xv = X[:, 0] + else: + raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).") + + if SMR.shape == (NB,): + SMRv = SMR + elif SMR.shape == (NB, 1): + SMRv = SMR[:, 0] + else: + raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).") + + T = np.zeros((NB, 1), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + P = float(np.sum(Xv[lo : hi + 1] ** 2)) + smr = float(SMRv[b]) + T[b, 0] = 0.0 if smr <= 1e-12 else (P / smr) + + return T + +def _normalize_global_gain(G: GlobalGain) -> float | FloatArray: + """ + Normalize GlobalGain to match AACChannelFrameF3["G"] type: + - long: return float + - ESH: return float64 ndarray of shape (1, 8) + """ + if np.isscalar(G): + return float(G) + + G_arr = np.asarray(G) + if G_arr.size == 1: + return float(G_arr.reshape(-1)[0]) + + return np.asarray(G_arr, dtype=np.float64) + # ----------------------------------------------------------------------------- # Public helpers (useful for level_x demo wrappers) # ----------------------------------------------------------------------------- @@ -174,7 +274,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) next_t = np.vstack([next_t, tail]) - frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_f = aac_filter_bank(frame_t, frame_type, win_type) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) @@ -191,10 +291,6 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: return aac_seq -# ----------------------------------------------------------------------------- -# Level 2 encoder -# ----------------------------------------------------------------------------- - def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: """ Level-2 AAC encoder (Level 1 + TNS). @@ -246,7 +342,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) next_t = np.vstack([next_t, tail]) - frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) # Level 1 analysis (packed stereo container) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) @@ -277,4 +373,181 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: prev_frame_type = frame_type - return aac_seq \ No newline at end of file + return aac_seq + + +def aac_coder_3( + filename_in: Union[str, Path], + filename_aac_coded: Union[str, Path] | None = None, +) -> AACSeq3: + """ + Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + filename_aac_coded : Union[str, Path] | None + Optional .mat filename to store aac_seq_3 (assignment convenience). + + Returns + ------- + AACSeq3 + Encoded AAC sequence (Level 3 payload schema). + """ + filename_in = Path(filename_in) + + x, _ = aac_read_wav_stereo_48k(filename_in) + + hop = 1024 + win = 2048 + + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + # Load Huffman LUTs once. + huff_LUT_list = load_LUT() + + aac_seq: AACSeq3 = [] + prev_frame_type: FrameType = "OLS" + + # Pin win_type to the WinType literal for type checkers. + win_type: WinType = WIN_TYPE + + # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. + prev1_L = np.zeros((2048,), dtype=np.float64) + prev2_L = np.zeros((2048,), dtype=np.float64) + prev1_R = np.zeros((2048,), dtype=np.float64) + prev2_R = np.zeros((2048,), dtype=np.float64) + + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start : start + win, :] + if frame_t.shape != (win, 2): + raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop : start + hop + win, :] + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) + + # Analysis filterbank (stereo packed) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) + + # TNS per channel + chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) + chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type) + + # Psychoacoustic model per channel (time-domain) + frame_L = np.asarray(frame_t[:, 0], dtype=np.float64) + frame_R = np.asarray(frame_t[:, 1], dtype=np.float64) + + SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L) + SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R) + + # Thresholds T(b) (stored, not entropy-coded) + T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L) + T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R) + + # Quantizer per channel + S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L) + S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R) + + # Normalize G types for AACSeq3 schema (float | float64 ndarray). + G_Ln = _normalize_global_gain(G_L) + G_Rn = _normalize_global_gain(G_R) + + # Huffman-code ONLY the DPCM differences for b>0. + # sfc[0] corresponds to alpha(0)=G and is stored separately in the frame. + sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] + sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...] + + # Codebook 11: + # maxAbsCodeVal = 16 is RESERVED for ESCAPE. + # We must stay strictly within [-15, +15] to avoid escape decoding. + sf_cb = 11 + sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + + sfc_L_dpcm = np.clip( + sfc_L_dpcm, + -sf_max_abs, + sf_max_abs, + ).astype(np.int64, copy=False) + + sfc_R_dpcm = np.clip( + sfc_R_dpcm, + -sf_max_abs, + sf_max_abs, + ).astype(np.int64, copy=False) + + sfc_L_stream, _ = aac_encode_huff( + sfc_L_dpcm.reshape(-1, order="F"), + huff_LUT_list, + force_codebook=sf_cb, + ) + sfc_R_stream, _ = aac_encode_huff( + sfc_R_dpcm.reshape(-1, order="F"), + huff_LUT_list, + force_codebook=sf_cb, + ) + + mdct_L_stream, cb_L = aac_encode_huff( + np.asarray(S_L, dtype=np.int64).reshape(-1), + huff_LUT_list, + ) + mdct_R_stream, cb_R = aac_encode_huff( + np.asarray(S_R, dtype=np.int64).reshape(-1), + huff_LUT_list, + ) + + # Typed dict construction helps static analyzers validate the schema. + frame_out: AACSeq3Frame = { + "frame_type": frame_type, + "win_type": win_type, + "chl": { + "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), + "T": np.asarray(T_L, dtype=np.float64), + "G": G_Ln, + "sfc": sfc_L_stream, + "stream": mdct_L_stream, + "codebook": int(cb_L), + }, + "chr": { + "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64), + "T": np.asarray(T_R, dtype=np.float64), + "G": G_Rn, + "sfc": sfc_R_stream, + "stream": mdct_R_stream, + "codebook": int(cb_R), + }, + } + aac_seq.append(frame_out) + + # Update psycho history (shift register) + prev2_L = prev1_L + prev1_L = frame_L + prev2_R = prev1_R + prev1_R = frame_R + + prev_frame_type = frame_type + + # Optional: store to .mat for the assignment wrapper + if filename_aac_coded is not None: + filename_aac_coded = Path(filename_aac_coded) + savemat( + str(filename_aac_coded), + {"aac_seq_3": np.array(aac_seq, dtype=object)}, + do_compression=True, + ) + + return aac_seq + diff --git a/source/core/aac_decoder.py b/source/core/aac_decoder.py index 5b366b8..ac05621 100644 --- a/source/core/aac_decoder.py +++ b/source/core/aac_decoder.py @@ -22,9 +22,22 @@ import soundfile as sf from core.aac_filterbank import aac_i_filter_bank from core.aac_tns import aac_i_tns +from core.aac_quantizer import aac_i_quantizer +from core.aac_huffman import aac_decode_huff +from core.aac_utils import get_table, band_limits +from material.huff_utils import load_LUT from core.aac_types import * +# ----------------------------------------------------------------------------- +# Helper for NB +# ----------------------------------------------------------------------------- +def _nbands(frame_type: FrameType) -> int: + table, _ = get_table(frame_type) + wlow, _whigh, _bval, _qthr_db = band_limits(table) + return int(len(wlow)) + + # ----------------------------------------------------------------------------- # Public helpers # ----------------------------------------------------------------------------- @@ -251,4 +264,145 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS y = aac_remove_padding(y_pad, hop=hop) sf.write(str(filename_out), y, 48000) - return y \ No newline at end of file + return y + + + +def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: + """ + Level-3 AAC decoder (inverse of aac_coder_3). + + Steps per frame: + - Huffman decode scalefactors (sfc) using codebook 11 + - Huffman decode MDCT symbols (stream) using stored codebook + - iQuantizer -> MDCT coefficients after TNS + - iTNS using stored predictor coefficients + - IMDCT filterbank -> time domain + - Overlap-add, remove padding, write WAV + + Parameters + ---------- + aac_seq_3 : AACSeq3 + Encoded sequence as produced by aac_coder_3. + filename_out : Union[str, Path] + Output WAV filename. + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. + """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_3) + + if K <= 0: + raise ValueError("aac_seq_3 must contain at least one frame.") + + # Load Huffman LUTs once. + huff_LUT_list = load_LUT() + + n_pad = (K - 1) * hop + win + y_pad = np.zeros((n_pad, 2), dtype=np.float64) + + for i, fr in enumerate(aac_seq_3): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + NB = _nbands(frame_type) + # We store G separately, so Huffman stream contains only (NB-1) DPCM differences. + sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1) + + # ------------------------- + # Left channel + # ------------------------- + tns_L = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64) + G_L = fr["chl"]["G"] + sfc_bits_L = fr["chl"]["sfc"] + mdct_bits_L = fr["chl"]["stream"] + cb_L = int(fr["chl"]["codebook"]) + + sfc_dec_L = aac_decode_huff(sfc_bits_L, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False) + if frame_type == "ESH": + sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 8, order="F") + sfc_L = np.zeros((NB, 8), dtype=np.int64) + Gv = np.asarray(G_L, dtype=np.float64).reshape(1, 8) + sfc_L[0, :] = Gv[0, :].astype(np.int64) + sfc_L[1:, :] = sfc_dpcm_L + else: + sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 1, order="F") + sfc_L = np.zeros((NB, 1), dtype=np.int64) + sfc_L[0, 0] = int(float(G_L)) + sfc_L[1:, :] = sfc_dpcm_L + + # MDCT symbols: codebook 0 means "all-zero section" + if cb_L == 0: + S_dec_L = np.zeros((1024,), dtype=np.int64) + else: + S_tmp_L = aac_decode_huff(mdct_bits_L, cb_L, huff_LUT_list).astype(np.int64, copy=False) + + # Tuple coding may produce extra trailing symbols; caller knows the true length (1024). + # Also guard against short outputs by zero-padding. + if S_tmp_L.size < 1024: + S_dec_L = np.zeros((1024,), dtype=np.int64) + S_dec_L[: S_tmp_L.size] = S_tmp_L + else: + S_dec_L = S_tmp_L[:1024] + + S_L = S_dec_L.reshape(1024, 1) + + Xq_L = aac_i_quantizer(S_L, sfc_L, G_L, frame_type) + X_L = aac_i_tns(Xq_L, frame_type, tns_L) + + # ------------------------- + # Right channel + # ------------------------- + tns_R = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64) + G_R = fr["chr"]["G"] + sfc_bits_R = fr["chr"]["sfc"] + mdct_bits_R = fr["chr"]["stream"] + cb_R = int(fr["chr"]["codebook"]) + + sfc_dec_R = aac_decode_huff(sfc_bits_R, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False) + if frame_type == "ESH": + sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 8, order="F") + sfc_R = np.zeros((NB, 8), dtype=np.int64) + Gv = np.asarray(G_R, dtype=np.float64).reshape(1, 8) + sfc_R[0, :] = Gv[0, :].astype(np.int64) + sfc_R[1:, :] = sfc_dpcm_R + else: + sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 1, order="F") + sfc_R = np.zeros((NB, 1), dtype=np.int64) + sfc_R[0, 0] = int(float(G_R)) + sfc_R[1:, :] = sfc_dpcm_R + + if cb_R == 0: + S_dec_R = np.zeros((1024,), dtype=np.int64) + else: + S_tmp_R = aac_decode_huff(mdct_bits_R, cb_R, huff_LUT_list).astype(np.int64, copy=False) + + if S_tmp_R.size < 1024: + S_dec_R = np.zeros((1024,), dtype=np.int64) + S_dec_R[: S_tmp_R.size] = S_tmp_R + else: + S_dec_R = S_tmp_R[:1024] + + S_R = S_dec_R.reshape(1024, 1) + + Xq_R = aac_i_quantizer(S_R, sfc_R, G_R, frame_type) + X_R = aac_i_tns(Xq_R, frame_type, tns_R) + + # Re-pack to stereo container and inverse filterbank + frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R)) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) + + start = i * hop + y_pad[start : start + win, :] += frame_t_hat + + y = aac_remove_padding(y_pad, hop=hop) + + sf.write(str(filename_out), y, 48000) + return y + diff --git a/source/core/aac_huffman.py b/source/core/aac_huffman.py new file mode 100644 index 0000000..3d2eeca --- /dev/null +++ b/source/core/aac_huffman.py @@ -0,0 +1,112 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Huffman wrappers (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Thin wrappers around the provided Huffman utilities (material/huff_utils.py) +# so that the API matches the assignment text. +# +# Exposed API (assignment): +# huff_sec, huff_codebook = aac_encode_huff(coeff_sec, huff_LUT_list, force_codebook) +# dec_coeffs = aac_decode_huff(huff_sec, huff_codebook, huff_LUT_list) +# +# Notes: +# - Huffman coding operates on tuples. Therefore, decode(encode(x)) may return +# extra trailing symbols due to tuple padding. The AAC decoder knows the +# true section length from side information (band limits) and truncates. +# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Any +import numpy as np + +from material.huff_utils import encode_huff, decode_huff + + +def aac_encode_huff( + coeff_sec: np.ndarray, + huff_LUT_list: list[dict[str, Any]], + force_codebook: int | None = None, +) -> tuple[str, int]: + """ + Huffman-encode a section of coefficients (MDCT symbols or scalefactors). + + Parameters + ---------- + coeff_sec : np.ndarray + Coefficient section to be encoded. Any shape is accepted; the input + is flattened and treated as a 1-D sequence of int64 symbols. + huff_LUT_list : list[dict[str, Any]] + List of Huffman Look-Up Tables (LUTs) as returned by material.load_LUT(). + Index corresponds to codebook id (typically 1..11, with 0 reserved). + force_codebook : int | None + If provided, forces the use of this Huffman codebook. In the assignment, + scalefactors are encoded with codebook 11. For MDCT coefficients, this + argument is usually omitted (auto-selection). + + Returns + ------- + tuple[str, int] + (huff_sec, huff_codebook) + - huff_sec: bitstream as a string of '0'/'1' + - huff_codebook: codebook id used by the encoder + """ + coeff_sec_arr = np.asarray(coeff_sec, dtype=np.int64).reshape(-1) + + if force_codebook is None: + # Provided utility returns (bitstream, codebook) in the auto-selection case. + huff_sec, huff_codebook = encode_huff(coeff_sec_arr, huff_LUT_list) + return str(huff_sec), int(huff_codebook) + + # Provided utility returns ONLY the bitstream when force_codebook is set. + cb = int(force_codebook) + huff_sec = encode_huff(coeff_sec_arr, huff_LUT_list, force_codebook=cb) + return str(huff_sec), cb + + +def aac_decode_huff( + huff_sec: str | np.ndarray, + huff_codebook: int, + huff_LUT: list[dict[str, Any]], +) -> np.ndarray: + """ + Huffman-decode a bitstream using the specified codebook. + + Parameters + ---------- + huff_sec : str | np.ndarray + Huffman bitstream. Typically a string of '0'/'1'. If an array is provided, + it is passed through to the provided decoder. + huff_codebook : int + Codebook id that was returned by aac_encode_huff. + Codebook 0 represents an all-zero section. + huff_LUT : list[dict[str, Any]] + Huffman LUT list as returned by material.load_LUT(). + + Returns + ------- + np.ndarray + Decoded coefficients as a 1-D np.int64 array. + + Note: Due to tuple coding, the decoded array may contain extra trailing + padding symbols. The caller must truncate to the known section length. + """ + cb = int(huff_codebook) + + if cb == 0: + # Codebook 0 represents an all-zero section. The decoded length is not + # recoverable from the bitstream alone; the caller must expand/truncate. + return np.zeros((0,), dtype=np.int64) + + if cb < 0 or cb >= len(huff_LUT): + raise ValueError(f"Invalid Huffman codebook index: {cb}") + + lut = huff_LUT[cb] + dec = decode_huff(huff_sec, lut) + return np.asarray(dec, dtype=np.int64).reshape(-1) diff --git a/source/core/aac_quantizer.py b/source/core/aac_quantizer.py new file mode 100644 index 0000000..addcfe8 --- /dev/null +++ b/source/core/aac_quantizer.py @@ -0,0 +1,604 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Quantizer / iQuantizer (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Implements AAC quantizer and inverse quantizer for one channel. +# Based on assignment section 2.6 (Eq. 12-15). +# +# Notes: +# - Bit reservoir is not implemented (assignment simplification). +# - Scalefactor bands are assumed equal to psychoacoustic bands +# (Table B.2.1.9a / B.2.1.9b from TableB219.mat). +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np + +from core.aac_utils import get_table, band_limits +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Constants (assignment) +# ----------------------------------------------------------------------------- +MAGIC_NUMBER: float = 0.4054 +MQ: int = 8191 + +EPS: float = 1e-12 +MAX_SFC_DIFF: int = 60 + +# Safeguard: prevents infinite loops in pathological cases +MAX_ALPHA_ITERS: int = 2000 + + +# ----------------------------------------------------------------------------- +# Helpers: ESH packing/unpacking (128x8 <-> 1024x1) +# ----------------------------------------------------------------------------- +def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray: + """ + Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1). + + Packing order: + Columns are concatenated in subframe order (0..7), column-major. + + Parameters + ---------- + x_128x8 : FloatArray + ESH coefficients, shape (128, 8). + + Returns + ------- + FloatArray + Packed coefficients, shape (1024, 1). + """ + x_128x8 = np.asarray(x_128x8, dtype=np.float64) + if x_128x8.shape != (128, 8): + raise ValueError("ESH pack expects shape (128, 8).") + return x_128x8.reshape(1024, 1, order="F") + + +def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray: + """ + Unpack a packed ESH vector (1024 elements) back to shape (128, 8). + + Parameters + ---------- + x_1024x1 : FloatArray + Packed ESH vector, shape (1024,) or (1024, 1) after flattening. + + Returns + ------- + FloatArray + Unpacked ESH coefficients, shape (128, 8). + """ + x_1024x1 = np.asarray(x_1024x1, dtype=np.float64).reshape(-1) + if x_1024x1.shape[0] != 1024: + raise ValueError("ESH unpack expects 1024 elements.") + return x_1024x1.reshape(128, 8, order="F") + + +# ----------------------------------------------------------------------------- +# Core quantizer formulas (Eq. 12, Eq. 13) +# ----------------------------------------------------------------------------- +def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols: + """ + Quantize MDCT coefficients to integer symbols S(k). + + Implements Eq. (12): + S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER ) + + Parameters + ---------- + x : FloatArray + MDCT coefficients for a contiguous set of spectral lines. + Shape: (N,) + alpha : float + Scalefactor gain for the corresponding scalefactor band. + + Returns + ------- + QuantizedSymbols + Quantized symbols S(k) as int64, shape (N,). + """ + x = np.asarray(x, dtype=np.float64) + + scale = 2.0 ** (-0.25 * float(alpha)) + ax = np.abs(x) * scale + + y = np.power(ax, 0.75, dtype=np.float64) + + # "int" in the assignment corresponds to truncation. + q = np.floor(y + MAGIC_NUMBER).astype(np.int64) + return (np.sign(x).astype(np.int64) * q).astype(np.int64) + + +def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray: + """ + Inverse quantizer (dequantization of symbols). + + Implements Eq. (13): + Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4) + + Parameters + ---------- + S : QuantizedSymbols + Quantized symbols S(k), int64, shape (N,). + alpha : float + Scalefactor gain for the corresponding scalefactor band. + + Returns + ------- + FloatArray + Reconstructed MDCT coefficients Xhat(k), float64, shape (N,). + """ + S = np.asarray(S, dtype=np.int64) + + scale = 2.0 ** (0.25 * float(alpha)) + aS = np.abs(S).astype(np.float64) + y = np.power(aS, 4.0 / 3.0, dtype=np.float64) + + return (np.sign(S).astype(np.float64) * y * scale).astype(np.float64) + + +# ----------------------------------------------------------------------------- +# Alpha initialization (Eq. 14) +# ----------------------------------------------------------------------------- +def _alpha_initial_from_frame_max(x_all: FloatArray) -> int: + """ + Compute the initial scalefactor gain alpha_hat for a frame. + + Implements Eq. (14): + alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ ) + + The same initial value is used for all bands before the per-band refinement. + + Parameters + ---------- + x_all : FloatArray + All MDCT coefficients of a frame (one channel), flattened. + + Returns + ------- + int + Initial alpha value (integer). + """ + x_all = np.asarray(x_all, dtype=np.float64).reshape(-1) + if x_all.size == 0: + return 0 + + max_abs = float(np.max(np.abs(x_all))) + if max_abs <= 0.0: + return 0 + + val = (max_abs ** 0.75) / float(MQ) + if val <= 0.0: + return 0 + + alpha_hat = (16.0 / 3.0) * np.log2(val) + return int(np.floor(alpha_hat)) + + +# ----------------------------------------------------------------------------- +# Band utilities +# ----------------------------------------------------------------------------- +def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]: + """ + Return scalefactor band ranges [wlow, whigh] (inclusive) for the given frame type. + + These are derived from the psychoacoustic tables (TableB219), + and map directly to MDCT indices: + - long: 0..1023 + - short (ESH subframe): 0..127 + + Parameters + ---------- + frame_type : FrameType + Frame type ("OLS", "LSS", "ESH", "LPS"). + + Returns + ------- + list[tuple[int, int]] + List of (lo, hi) inclusive index pairs for each band. + """ + table, _Nfft = get_table(frame_type) + wlow, whigh, _bval, _qthr_db = band_limits(table) + + bands: list[tuple[int, int]] = [] + for lo, hi in zip(wlow, whigh): + bands.append((int(lo), int(hi))) + return bands + + +def _band_energy(x: FloatArray, lo: int, hi: int) -> float: + """ + Compute energy of a spectral segment x[lo:hi+1]. + + Parameters + ---------- + x : FloatArray + MDCT coefficient vector. + lo, hi : int + Inclusive index range. + + Returns + ------- + float + Sum of squares (energy) within the band. + """ + sec = x[lo : hi + 1] + return float(np.sum(sec * sec)) + + +def _threshold_T_from_SMR( + X: FloatArray, + SMR_col: FloatArray, + bands: list[tuple[int, int]], +) -> FloatArray: + """ + Compute psychoacoustic thresholds T(b) per band. + + Uses: + P(b) = sum_{k in band} X(k)^2 + T(b) = P(b) / SMR(b) + + Parameters + ---------- + X : FloatArray + MDCT coefficients for a frame (long) or one ESH subframe (short). + SMR_col : FloatArray + SMR values for this frame/subframe, shape (NB,). + bands : list[tuple[int, int]] + Band index ranges. + + Returns + ------- + FloatArray + Threshold vector T(b), shape (NB,). + """ + nb = len(bands) + T = np.zeros((nb,), dtype=np.float64) + + for b, (lo, hi) in enumerate(bands): + P = _band_energy(X, lo, hi) + smr = float(SMR_col[b]) + if smr <= EPS: + T[b] = 0.0 + else: + T[b] = P / smr + + return T + + +# ----------------------------------------------------------------------------- +# Alpha selection per band + neighbor-difference constraint +# ----------------------------------------------------------------------------- +def _best_alpha_for_band( + X: FloatArray, + lo: int, + hi: int, + T_b: float, + alpha0: int, + alpha_min: int, + alpha_max: int, +) -> int: + """ + Determine the band-wise scalefactor alpha(b) by iteratively increasing alpha. + + The algorithm increases alpha until the quantization error power exceeds + the band threshold: + + P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2 + select the largest alpha such that P_e(b) <= T(b) + + Parameters + ---------- + X : FloatArray + Full MDCT vector (one frame or one subframe), shape (N,). + lo, hi : int + Inclusive MDCT index range defining the band. + T_b : float + Psychoacoustic threshold for this band. + alpha0 : int + Initial alpha value for the band. + alpha_min, alpha_max : int + Bounds for alpha, used as a safeguard. + + Returns + ------- + int + Selected integer alpha(b). + """ + if T_b <= 0.0: + return int(alpha0) + + Xsec = X[lo : hi + 1] + + alpha = int(alpha0) + alpha = max(alpha_min, min(alpha, alpha_max)) + + # Evaluate at current alpha + Ssec = _quantize_symbol(Xsec, alpha) + Xhat = _dequantize_symbol(Ssec, alpha) + Pe = float(np.sum((Xsec - Xhat) ** 2)) + + if Pe > T_b: + return alpha + + iters = 0 + while iters < MAX_ALPHA_ITERS: + iters += 1 + alpha_next = alpha + 1 + if alpha_next > alpha_max: + break + + Ssec = _quantize_symbol(Xsec, alpha_next) + Xhat = _dequantize_symbol(Ssec, alpha_next) + Pe_next = float(np.sum((Xsec - Xhat) ** 2)) + + if Pe_next > T_b: + break + + alpha = alpha_next + Pe = Pe_next + + return alpha + + +def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray: + """ + Enforce neighbor constraint |alpha[b] - alpha[b-1]| <= max_diff by clamping. + + Uses a forward pass and a backward pass to reduce drift. + + Parameters + ---------- + alpha : np.ndarray + Alpha vector, shape (NB,). + max_diff : int + Maximum allowed absolute difference between adjacent bands. + + Returns + ------- + np.ndarray + Clamped alpha vector, int64, shape (NB,). + """ + a = np.asarray(alpha, dtype=np.int64).copy() + nb = a.shape[0] + + for b in range(1, nb): + lo = int(a[b - 1] - max_diff) + hi = int(a[b - 1] + max_diff) + if a[b] < lo: + a[b] = lo + elif a[b] > hi: + a[b] = hi + + for b in range(nb - 2, -1, -1): + lo = int(a[b + 1] - max_diff) + hi = int(a[b + 1] + max_diff) + if a[b] < lo: + a[b] = lo + elif a[b] > hi: + a[b] = hi + + return a + + +# ----------------------------------------------------------------------------- +# Public API +# ----------------------------------------------------------------------------- +def aac_quantizer( + frame_F: FrameChannelF, + frame_type: FrameType, + SMR: FloatArray, +) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]: + """ + AAC quantizer for one channel (Level 3). + + Quantizes MDCT coefficients using band-wise scalefactors derived from SMR. + + Parameters + ---------- + frame_F : FrameChannelF + MDCT coefficients after TNS, one channel. + Shapes: + - Long frames: (1024,) or (1024, 1) + - ESH: (128, 8) + frame_type : FrameType + AAC frame type ("OLS", "LSS", "ESH", "LPS"). + SMR : FloatArray + Signal-to-Mask Ratio per band. + Shapes: + - Long: (NB,) or (NB, 1) + - ESH: (NB, 8) + + Returns + ------- + S : QuantizedSymbols + Quantized symbols S(k), shape (1024, 1) for all frame types. + sfc : ScaleFactors + DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1). + Shapes: + - Long: (NB, 1) + - ESH: (NB, 8) + G : GlobalGain + Global gain G = alpha(0). + - Long: scalar float + - ESH: array shape (1, 8) + """ + bands = _band_slices(frame_type) + NB = len(bands) + + X = np.asarray(frame_F, dtype=np.float64) + SMR = np.asarray(SMR, dtype=np.float64) + + if frame_type == "ESH": + if X.shape != (128, 8): + raise ValueError("For ESH, frame_F must have shape (128, 8).") + if SMR.shape != (NB, 8): + raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).") + + S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64) + sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64) + G_arr = np.zeros((1, 8), dtype=np.int64) + + for j in range(8): + Xj = X[:, j].reshape(128) + SMRj = SMR[:, j].reshape(NB) + + T = _threshold_T_from_SMR(Xj, SMRj, bands) + + alpha0 = _alpha_initial_from_frame_max(Xj) + alpha = np.full((NB,), alpha0, dtype=np.int64) + + for b, (lo, hi) in enumerate(bands): + alpha[b] = _best_alpha_for_band( + X=Xj, lo=lo, hi=hi, T_b=float(T[b]), + alpha0=int(alpha[b]), + alpha_min=-4096, + alpha_max=4096, + ) + + alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF) + + G_arr[0, j] = int(alpha[0]) + sfc[0, j] = int(alpha[0]) + for b in range(1, NB): + sfc[b, j] = int(alpha[b] - alpha[b - 1]) + + Sj = np.zeros((128,), dtype=np.int64) + for b, (lo, hi) in enumerate(bands): + Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b])) + + # Place this subframe in the packed output (column-major subframe layout) + S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj + + return S_out, sfc, G_arr.astype(np.float64) + + # Long frames + if X.shape == (1024,): + Xv = X + elif X.shape == (1024, 1): + Xv = X[:, 0] + else: + raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).") + + if SMR.shape == (NB,): + SMRv = SMR + elif SMR.shape == (NB, 1): + SMRv = SMR[:, 0] + else: + raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).") + + T = _threshold_T_from_SMR(Xv, SMRv, bands) + + alpha0 = _alpha_initial_from_frame_max(Xv) + alpha = np.full((NB,), alpha0, dtype=np.int64) + + for b, (lo, hi) in enumerate(bands): + alpha[b] = _best_alpha_for_band( + X=Xv, lo=lo, hi=hi, T_b=float(T[b]), + alpha0=int(alpha[b]), + alpha_min=-4096, + alpha_max=4096, + ) + + alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF) + + sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64) + sfc[0, 0] = int(alpha[0]) + for b in range(1, NB): + sfc[b, 0] = int(alpha[b] - alpha[b - 1]) + + G: float = float(alpha[0]) + + S_vec = np.zeros((1024,), dtype=np.int64) + for b, (lo, hi) in enumerate(bands): + S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b])) + + return S_vec.reshape(1024, 1), sfc, G + + +def aac_i_quantizer( + S: QuantizedSymbols, + sfc: ScaleFactors, + G: GlobalGain, + frame_type: FrameType, +) -> FrameChannelF: + """ + Inverse quantizer (iQuantizer) for one channel. + + Reconstructs MDCT coefficients from quantized symbols and DPCM scalefactors. + + Parameters + ---------- + S : QuantizedSymbols + Quantized symbols, shape (1024, 1) (or any array with 1024 elements). + sfc : ScaleFactors + DPCM-coded scalefactors. + Shapes: + - Long: (NB, 1) + - ESH: (NB, 8) + G : GlobalGain + Global gain (not strictly required if sfc includes sfc(0)=alpha(0)). + Present for API compatibility with the assignment. + frame_type : FrameType + AAC frame type. + + Returns + ------- + FrameChannelF + Reconstructed MDCT coefficients: + - ESH: (128, 8) + - Long: (1024, 1) + """ + bands = _band_slices(frame_type) + NB = len(bands) + + S_flat = np.asarray(S, dtype=np.int64).reshape(-1) + if S_flat.shape[0] != 1024: + raise ValueError("S must contain 1024 symbols.") + + if frame_type == "ESH": + sfc = np.asarray(sfc, dtype=np.int64) + if sfc.shape != (NB, 8): + raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).") + + S_128x8 = _esh_unpack_from_1024(S_flat) + + Xrec = np.zeros((128, 8), dtype=np.float64) + + for j in range(8): + alpha = np.zeros((NB,), dtype=np.int64) + alpha[0] = int(sfc[0, j]) + for b in range(1, NB): + alpha[b] = int(alpha[b - 1] + sfc[b, j]) + + Xj = np.zeros((128,), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + Xj[lo : hi + 1] = _dequantize_symbol(S_128x8[lo : hi + 1, j].astype(np.int64), float(alpha[b])) + + Xrec[:, j] = Xj + + return Xrec + + sfc = np.asarray(sfc, dtype=np.int64) + if sfc.shape != (NB, 1): + raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).") + + alpha = np.zeros((NB,), dtype=np.int64) + alpha[0] = int(sfc[0, 0]) + for b in range(1, NB): + alpha[b] = int(alpha[b - 1] + sfc[b, 0]) + + Xrec = np.zeros((1024,), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + Xrec[lo : hi + 1] = _dequantize_symbol(S_flat[lo : hi + 1], float(alpha[b])) + + return Xrec.reshape(1024, 1) diff --git a/source/core/aac_ssc.py b/source/core/aac_ssc.py index 8c082b3..323e9e3 100644 --- a/source/core/aac_ssc.py +++ b/source/core/aac_ssc.py @@ -176,7 +176,7 @@ def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType: # Public Function prototypes # ----------------------------------------------------------------------------- -def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: +def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: """ Sequence Segmentation Control (SSC). diff --git a/source/core/aac_tns.py b/source/core/aac_tns.py index 88d9da8..06227b1 100644 --- a/source/core/aac_tns.py +++ b/source/core/aac_tns.py @@ -30,9 +30,6 @@ from __future__ import annotations from pathlib import Path from typing import Tuple -import numpy as np -from scipy.io import loadmat - from core.aac_utils import load_b219_tables from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX from core.aac_types import * @@ -377,7 +374,9 @@ def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: sw = _compute_sw(x) eps = 1e-12 - xw = np.where(sw > eps, x / sw, 0.0) + xw = np.zeros_like(x, dtype=np.float64) + mask = sw > eps + np.divide(x, sw, out=xw, where=mask) a = _lpc_coeffs(xw, PRED_ORDER) a_q = _quantize_coeffs(a) diff --git a/source/core/aac_types.py b/source/core/aac_types.py index 6642e18..00508c6 100644 --- a/source/core/aac_types.py +++ b/source/core/aac_types.py @@ -212,6 +212,42 @@ BandValueArray: TypeAlias = FloatArray Per-band psychoacoustic values (e.g. Bark position, thresholds). """ + +# Quantizer-related semantic aliases + +QuantizedSymbols: TypeAlias = NDArray[np.generic] +""" +Quantized MDCT symbols S(k). + +Shapes: +- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols). +""" + +ScaleFactors: TypeAlias = NDArray[np.generic] +""" +DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1). + +Shapes: +- Long frames: (NB, 1) +- ESH frames: (NB, 8) +""" + +GlobalGain: TypeAlias = float | NDArray[np.generic] +""" +Global gain G = alpha(0). + +- Long frames: scalar float +- ESH frames: array shape (1, 8) +""" + +# Huffman semantic aliases + +HuffmanBitstream: TypeAlias = str +"""Huffman-coded bitstream stored as a string of '0'/'1'.""" + +HuffmanCodebook: TypeAlias = int +"""Huffman codebook id (e.g., 0..11).""" + # ----------------------------------------------------------------------------- # Level 1 AAC sequence payload types # ----------------------------------------------------------------------------- @@ -299,3 +335,77 @@ Level 2 adds: and stores: - per-channel "frame_F" after applying TNS. """ + +# ----------------------------------------------------------------------------- +# Level 3 AAC sequence payload types (Quantizer + Huffman) +# ----------------------------------------------------------------------------- + +class AACChannelFrameF3(TypedDict): + """ + Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3). + + Keys + ---- + tns_coeffs: + Quantized TNS predictor coefficients for ONE channel. + Shapes: + - ESH: (PRED_ORDER, 8) + - else: (PRED_ORDER, 1) + + T: + Psychoacoustic thresholds per band. + Shapes: + - ESH: (NB, 8) + - else: (NB, 1) + Note: Stored for completeness / debugging; not entropy-coded. + + G: + Quantized global gains. + Shapes: + - ESH: (1, 8) (one per short subframe) + - else: scalar (or compatible np scalar) + + sfc: + Huffman-coded scalefactor differences (DPCM sequence). + + stream: + Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols). + + codebook: + Huffman codebook id used for MDCT symbols (stream). + (Scalefactors typically use fixed codebook 11 and do not need to store it.) + """ + tns_coeffs: TnsCoeffs + T: FloatArray + G: FloatArray | float + sfc: HuffmanBitstream + stream: HuffmanBitstream + codebook: HuffmanCodebook + + +class AACSeq3Frame(TypedDict): + """ + One frame dictionary element of aac_seq_3 (Level 3). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF3 + chr: AACChannelFrameF3 + + +AACSeq3: TypeAlias = List[AACSeq3Frame] +""" +AAC sequence for Level 3: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" + +Level 3 adds (per channel): +- "tns_coeffs" +- "T" thresholds (not entropy-coded) +- "G" global gain(s) +- "sfc" Huffman-coded scalefactor differences +- "stream" Huffman-coded MDCT quantized symbols +- "codebook" Huffman codebook for MDCT symbols +""" diff --git a/source/core/tests/test_aac_coder_decoder.py b/source/core/tests/test_aac_coder_decoder.py index 1d83e89..38a8d7c 100644 --- a/source/core/tests/test_aac_coder_decoder.py +++ b/source/core/tests/test_aac_coder_decoder.py @@ -19,10 +19,10 @@ import numpy as np import pytest import soundfile as sf -from core.aac_coder import aac_coder_1, aac_coder_2, aac_read_wav_stereo_48k -from core.aac_decoder import aac_decoder_1, aac_decoder_2, aac_remove_padding -from core.aac_types import * +from core.aac_coder import aac_coder_1, aac_coder_2, aac_coder_3, aac_read_wav_stereo_48k +from core.aac_decoder import aac_decoder_1, aac_decoder_2, aac_decoder_3, aac_remove_padding from core.aac_utils import snr_db +from core.aac_types import * # Helper "fixtures" for aac_coder_1 / i_aac_coder_1 @@ -222,4 +222,153 @@ def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> No assert int(fs_hat) == 48000 snr = snr_db(x_ref, x_hat) - assert snr > 80 \ No newline at end of file + assert snr > 80 + + +# ----------------------------------------------------------------------------- +# Level 3 tests (Quantizer + Huffman) +# ----------------------------------------------------------------------------- + +@pytest.fixture(scope="module") +def wav_in_path() -> Path: + """ + Input WAV used for end-to-end tests. + + This should point to the provided test audio under material/. + Adjust this path if your project layout differs. + """ + # Typical layout in this project: + # source/material/LicorDeCalandraca.wav + return Path(__file__).resolve().parents[2] / "material" / "LicorDeCalandraca.wav" + + +def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None: + """ + Validate Level-3 per-frame schema (keys + basic types only). + """ + assert "frame_type" in frame + assert "win_type" in frame + assert "chl" in frame + assert "chr" in frame + + for ch_key in ("chl", "chr"): + ch = frame[ch_key] # type: ignore[index] + assert "tns_coeffs" in ch + assert "T" in ch + assert "G" in ch + assert "sfc" in ch + assert "stream" in ch + assert "codebook" in ch + + assert isinstance(ch["sfc"], str) + assert isinstance(ch["stream"], str) + assert isinstance(ch["codebook"], int) + + # Arrays: only check they are numpy arrays with expected dtype categories. + assert isinstance(ch["tns_coeffs"], np.ndarray) + assert isinstance(ch["T"], np.ndarray) + + # Global gain: long frames may be scalar float, ESH may be ndarray + assert np.isscalar(ch["G"]) or isinstance(ch["G"], np.ndarray) + + +def test_aac_coder_3_seq_schema_and_shapes(wav_in_path: Path, tmp_path: Path) -> None: + """ + Contract test: + - aac_coder_3 returns AACSeq3 + - Per-frame keys exist and types are consistent + - Basic shape expectations hold for ESH vs non-ESH cases + + Note: + This test uses a short excerpt (a few frames) to keep runtime bounded. + """ + # Use only a few frames to avoid long runtimes in the quantizer loop. + hop = 1024 + win = 2048 + n_frames = 4 + n_samples = win + (n_frames - 1) * hop + + x, fs = aac_read_wav_stereo_48k(wav_in_path) + x_short = x[:n_samples, :] + + short_wav = tmp_path / "input_short.wav" + sf.write(str(short_wav), x_short, fs) + + aac_seq_3: AACSeq3 = aac_coder_3(short_wav) + + assert isinstance(aac_seq_3, list) + assert len(aac_seq_3) > 0 + + for fr in aac_seq_3: + _assert_level3_frame_schema(fr) + + frame_type = fr["frame_type"] + for ch_key in ("chl", "chr"): + ch = fr[ch_key] # type: ignore[index] + + tns = np.asarray(ch["tns_coeffs"]) + if frame_type == "ESH": + assert tns.ndim == 2 + assert tns.shape[1] == 8 + else: + assert tns.ndim == 2 + assert tns.shape[1] == 1 + + T = np.asarray(ch["T"]) + if frame_type == "ESH": + assert T.ndim == 2 + assert T.shape[1] == 8 + else: + assert T.ndim == 2 + assert T.shape[1] == 1 + + G = ch["G"] + if frame_type == "ESH": + assert isinstance(G, np.ndarray) + assert np.asarray(G).shape == (1, 8) + else: + assert np.isscalar(G) + + assert isinstance(ch["sfc"], str) + assert isinstance(ch["stream"], str) + + + +def test_end_to_end_level_3_high_snr(wav_in_path: Path, tmp_path: Path) -> None: + """ + End-to-end test for Level 3 (Quantizer + Huffman): + + coder_3 -> decoder_3 should reconstruct a waveform with acceptable SNR. + + Notes + ----- + - Level 3 includes quantization, so SNR is expected to be lower than Level 1/2. + - We intentionally use a short excerpt (few frames) to keep runtime bounded, + since the reference quantizer implementation is computationally expensive. + """ + # Use only a few frames to avoid long runtimes. + hop = 1024 + win = 2048 + n_frames = 4 + n_samples = win + (n_frames - 1) * hop + + x_ref, fs = aac_read_wav_stereo_48k(wav_in_path) + x_short = x_ref[:n_samples, :] + + short_wav = tmp_path / "input_short_l3.wav" + sf.write(str(short_wav), x_short, fs) + + out_wav = tmp_path / "decoded_level3.wav" + + aac_seq_3: AACSeq3 = aac_coder_3(short_wav) + y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav) + + # Align lengths defensively (padding removal may differ by a few samples) + n = min(x_short.shape[0], y_hat.shape[0]) + x2 = x_short[:n, :] + y2 = y_hat[:n, :] + + s = snr_db(x2, y2) + + # Conservative threshold: Level 3 is lossy by design. + assert s > 10.0 \ No newline at end of file diff --git a/source/core/tests/test_huffman.py b/source/core/tests/test_huffman.py new file mode 100644 index 0000000..882e38f --- /dev/null +++ b/source/core/tests/test_huffman.py @@ -0,0 +1,139 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Huffman Wrapper Tests (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Contract tests for the Huffman coding stage, using the provided +# Huffman utilities (material/huff_utils.py). +# +# The Huffman encoder/decoder itself is GIVEN by the assignment and +# is not re-implemented here. These tests only verify that: +# +# - The wrapper functions (aac_encode_huff / aac_decode_huff) expose +# the API described in the assignment. +# - Forced codebook selection works as expected (e.g. scalefactors). +# - Tuple-based Huffman coding semantics are respected. +# +# Notes on tuple coding: +# Huffman coding operates on tuples of symbols. As a result, +# decode(encode(x)) may return extra trailing symbols due to padding. +# The AAC decoder always knows the true section length (from band limits) +# and truncates accordingly. Therefore, these tests only enforce that +# the decoded PREFIX matches the original data. +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np +import pytest + +from core.aac_huffman import aac_encode_huff, aac_decode_huff +from material.huff_utils import load_LUT + + +# ----------------------------------------------------------------------------- +# Fixtures +# ----------------------------------------------------------------------------- + +@pytest.fixture(scope="module") +def huff_LUT(): + """ + Load Huffman Look-Up Tables (LUTs) once per test module. + + The LUTs are provided by the assignment (huffCodebooks.mat) via + material.huff_utils.load_LUT(). + """ + return load_LUT() + + +# ----------------------------------------------------------------------------- +# Roundtrip (prefix) tests +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize( + "coeff_sec", + [ + np.array([1, -1, 2, -2, 0, 0, 3], dtype=np.int64), + np.array([0, 0, 0, 0], dtype=np.int64), + np.array([5], dtype=np.int64), + np.array([-3, -3, -3, -3], dtype=np.int64), + ], +) +def test_huffman_roundtrip_prefix_matches( + coeff_sec: np.ndarray, + huff_LUT, +) -> None: + """ + Contract test for Huffman encode/decode. + + Guarantees: + - Encoding followed by decoding does not crash. + - The decoded output has at least as many symbols as the input. + - The prefix of the decoded output matches the original coefficients. + + Rationale: + Huffman tuple coding may introduce padding, so exact length equality + is NOT required or expected. + """ + huff_sec, cb = aac_encode_huff(coeff_sec, huff_LUT) + dec = aac_decode_huff(huff_sec, cb, huff_LUT) + + if cb == 0: + # Codebook 0 represents an all-zero section. + assert np.all(coeff_sec == 0) + assert dec.size == 0 + return + + assert dec.size >= coeff_sec.size + np.testing.assert_array_equal(dec[: coeff_sec.size], coeff_sec) + + +# ----------------------------------------------------------------------------- +# Forced codebook tests +# ----------------------------------------------------------------------------- + +def test_huffman_force_codebook_returns_requested_codebook(huff_LUT) -> None: + """ + Verify forced codebook selection. + + According to the assignment, scalefactors must be encoded using + Huffman codebook 11. This test checks that: + - The requested codebook is actually used. + - The decoded prefix matches the original scalefactors. + """ + scalefactors = np.array([10, -2, 1, 0, -1, 3], dtype=np.int64) + + huff_sec, cb = aac_encode_huff( + scalefactors, + huff_LUT, + force_codebook=11, + ) + + assert cb == 11 + assert isinstance(huff_sec, str) + + dec = aac_decode_huff(huff_sec, cb, huff_LUT) + + assert dec.size >= scalefactors.size + np.testing.assert_array_equal(dec[: scalefactors.size], scalefactors) + + +# ----------------------------------------------------------------------------- +# Error handling +# ----------------------------------------------------------------------------- + +def test_huffman_invalid_codebook_raises(huff_LUT) -> None: + """ + Decoding with an invalid Huffman codebook index must raise an error. + """ + with pytest.raises(Exception): + _ = aac_decode_huff( + huff_sec="010101", + huff_codebook=99, + huff_LUT=huff_LUT, + ) diff --git a/source/core/tests/test_quantizer.py b/source/core/tests/test_quantizer.py new file mode 100644 index 0000000..37a1a8a --- /dev/null +++ b/source/core/tests/test_quantizer.py @@ -0,0 +1,395 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Quantizer Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for Quantizer / iQuantizer module. +# +# These tests are deliberately "contract-oriented": +# - They validate shapes, dtypes and invariants that downstream stages +# (e.g., Huffman coding) depend on. +# - They do not attempt to validate psychoacoustic optimality (that would +# require a reference implementation and careful numerical baselines). +# +# Validates: +# - I/O shapes for long and ESH modes +# - DPCM scalefactor coding consistency (sfc) +# - ESH packing order of quantized symbols (128x8 <-> 1024) +# - Edge cases (zeros / near silence) +# - Sanity (finite outputs, no extreme numerical blow-up) +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np +import pytest + +from core.aac_quantizer import aac_quantizer, aac_i_quantizer +from core.aac_utils import get_table, band_limits +from core.aac_types import FrameType + + +# Small epsilon to avoid divisions by zero in sanity ratios +EPS = 1e-12 + + +# ----------------------------------------------------------------------------- +# Helper utilities +# ----------------------------------------------------------------------------- + +def _nbands(frame_type: FrameType) -> int: + """ + Return number of scalefactor bands for the given frame type. + + This is derived from TableB219 (psycho tables) via aac_utils helpers, + so the tests remain consistent even if tables are updated. + """ + table, _nfft = get_table(frame_type) + wlow, _whigh, _bval, _qthr = band_limits(table) + return int(len(wlow)) + + +def _make_smr(frame_type: FrameType, seed: int = 0) -> np.ndarray: + """ + Create a strictly positive SMR array with the correct shape. + + These tests are not about psycho correctness; they only need SMR > 0 + to avoid division by zero and to make the quantizer's threshold logic + behave deterministically. + """ + rng = np.random.default_rng(seed) + NB = _nbands(frame_type) + + if frame_type == "ESH": + # ESH uses 8 short windows, thus SMR has 8 columns. + return (1.0 + np.abs(rng.normal(size=(NB, 8)))).astype(np.float64) + + # Long frames: use a column vector (NB, 1). + return (1.0 + np.abs(rng.normal(size=(NB, 1)))).astype(np.float64) + + +def _reconstruct_alpha_from_sfc(sfc: np.ndarray) -> np.ndarray: + """ + Reconstruct alpha(b) from DPCM-coded scalefactors sfc(b). + + By definition in the assignment: + sfc(0) = alpha(0) + alpha(b) = alpha(b-1) + sfc(b) for b > 0 + + This reconstruction is useful to validate the internal consistency + of the produced scalefactor information. + """ + sfc = np.asarray(sfc, dtype=np.int64) + + # Long frames: sfc shape (NB, 1) + if sfc.ndim == 2 and sfc.shape[1] == 1: + NB = sfc.shape[0] + alpha = np.zeros((NB,), dtype=np.int64) + alpha[0] = int(sfc[0, 0]) + for b in range(1, NB): + alpha[b] = int(alpha[b - 1] + sfc[b, 0]) + return alpha + + # ESH frames: sfc shape (NB, 8) + if sfc.ndim == 2 and sfc.shape[1] == 8: + NB = sfc.shape[0] + alpha = np.zeros((NB, 8), dtype=np.int64) + alpha[0, :] = sfc[0, :] + for b in range(1, NB): + alpha[b, :] = alpha[b - 1, :] + sfc[b, :] + return alpha + + raise ValueError("Unsupported sfc shape.") + + +# ----------------------------------------------------------------------------- +# Shape / contract tests +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_quantizer_shapes_long(frame_type: FrameType) -> None: + """ + Contract test for long frames: + - Input: MDCT coefficients shape (1024, 1) + - Output S: always (1024, 1) + - Output sfc: (NB, 1) + - G: scalar float for long frames + - iQuantizer output: (1024, 1) + """ + NB = _nbands(frame_type) + rng = np.random.default_rng(1) + + X = rng.normal(size=(1024, 1)).astype(np.float64) + SMR = _make_smr(frame_type, seed=2) + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + + assert S.shape == (1024, 1) + assert sfc.shape == (NB, 1) + assert isinstance(G, (float, np.floating)) + + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + assert Xhat.shape == (1024, 1) + + +def test_quantizer_shapes_esh() -> None: + """ + Contract test for ESH frames: + - Input: MDCT coefficients shape (128, 8) + - Output S: packed to (1024, 1) + - Output sfc: (NB, 8) + - G: array shape (1, 8) for ESH (one gain per short window) + - iQuantizer output: (128, 8) + """ + frame_type: FrameType = "ESH" + NB = _nbands(frame_type) + rng = np.random.default_rng(3) + + X = rng.normal(size=(128, 8)).astype(np.float64) + SMR = _make_smr(frame_type, seed=4) + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + + assert S.shape == (1024, 1) + assert sfc.shape == (NB, 8) + assert isinstance(G, np.ndarray) + assert G.shape == (1, 8) + + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + assert Xhat.shape == (128, 8) + + +# ----------------------------------------------------------------------------- +# DPCM consistency tests +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_quantizer_dpcm_reconstructs_alpha_long(frame_type: FrameType) -> None: + """ + Verify the DPCM coding rule for long frames. + + The quantizer returns: + sfc(0) = alpha(0) + sfc(b) = alpha(b) - alpha(b-1), b>0 + + Reconstruct alpha from sfc and check: + alpha(0) == sfc(0) == G + """ + rng = np.random.default_rng(5) + + X = rng.normal(size=(1024, 1)).astype(np.float64) + SMR = _make_smr(frame_type, seed=6) + + _S, sfc, G = aac_quantizer(X, frame_type, SMR) + + alpha = _reconstruct_alpha_from_sfc(sfc) + + assert int(sfc[0, 0]) == int(alpha[0]) + assert float(alpha[0]) == float(G) + + +def test_quantizer_dpcm_reconstructs_alpha_esh() -> None: + """ + Verify the DPCM coding rule for ESH frames. + + For each short window j: + sfc(0, j) = alpha(0, j) == G(0, j) + """ + frame_type: FrameType = "ESH" + rng = np.random.default_rng(7) + + X = rng.normal(size=(128, 8)).astype(np.float64) + SMR = _make_smr(frame_type, seed=8) + + _S, sfc, G = aac_quantizer(X, frame_type, SMR) + + alpha = _reconstruct_alpha_from_sfc(sfc) + + assert np.all(alpha[0, :] == sfc[0, :]) + assert np.all(alpha[0, :] == G.reshape(-1)) + + +# ----------------------------------------------------------------------------- +# ESH packing order test +# ----------------------------------------------------------------------------- + +def test_quantizer_esh_packing_order_matches_iquantizer_layout() -> None: + """ + Verify ESH packing order. + + The quantizer outputs S in packed shape (1024, 1). The expected packing + is column-major concatenation of the 8 short subframes. + + This test constructs a deterministic input where each subframe column + has a distinct constant value. After quantize+inverse-quantize, the + reconstructed columns should remain distinguishable in the same order. + + This primarily tests ordering, not exact numerical values. + """ + frame_type: FrameType = "ESH" + NB = _nbands(frame_type) + + # Create 8 distinct subframes: column j is constant (j+1) + X = np.zeros((128, 8), dtype=np.float64) + for j in range(8): + X[:, j] = float(j + 1) + + # Use very large SMR so thresholds are permissive and alpha changes are + # minimal. This helps keep the ordering signal strong. + SMR = np.ones((NB, 8), dtype=np.float64) * 1e6 + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + + # The average magnitude per column must be increasing with the original order. + col_means = np.mean(Xhat, axis=0) + assert np.all(np.diff(col_means) > 0.0) + + +# ----------------------------------------------------------------------------- +# Edge cases: zeros and near-silence +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_quantizer_zero_input_long_is_finite(frame_type: FrameType) -> None: + """ + Edge case: zero MDCT coefficients should not produce NaN/Inf. + + We do not require identity here (quantizer is lossy), but we require + the pipeline to remain numerically safe and produce finite outputs. + """ + NB = _nbands(frame_type) + + X = np.zeros((1024, 1), dtype=np.float64) + SMR = np.ones((NB, 1), dtype=np.float64) + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + assert np.isfinite(S).all() + assert np.isfinite(sfc).all() + assert isinstance(G, (float, np.floating)) + + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + assert np.isfinite(Xhat).all() + + +def test_quantizer_zero_input_esh_is_finite() -> None: + """ + Edge case: same as above, for ESH mode. + """ + frame_type: FrameType = "ESH" + NB = _nbands(frame_type) + + X = np.zeros((128, 8), dtype=np.float64) + SMR = np.ones((NB, 8), dtype=np.float64) + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + assert np.isfinite(S).all() + assert np.isfinite(sfc).all() + assert np.isfinite(G).all() + + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + assert np.isfinite(Xhat).all() + + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_quantizer_near_silence_long_is_finite(frame_type: FrameType) -> None: + """ + Edge case: extremely small values. + + This stresses numerical guards (EPS usage) and ensures no invalid operations. + """ + NB = _nbands(frame_type) + + X = (1e-15 * np.ones((1024, 1), dtype=np.float64)) + SMR = np.ones((NB, 1), dtype=np.float64) + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + assert np.isfinite(S).all() + assert np.isfinite(sfc).all() + + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + assert np.isfinite(Xhat).all() + + +def test_quantizer_near_silence_esh_is_finite() -> None: + """ + Edge case: extremely small values, ESH mode. + """ + frame_type: FrameType = "ESH" + NB = _nbands(frame_type) + + X = (1e-15 * np.ones((128, 8), dtype=np.float64)) + SMR = np.ones((NB, 8), dtype=np.float64) + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + assert np.isfinite(S).all() + assert np.isfinite(sfc).all() + assert np.isfinite(G).all() + + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + assert np.isfinite(Xhat).all() + + +# ----------------------------------------------------------------------------- +# Sanity: avoid catastrophic numerical blow-up +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_quantizer_sanity_no_extreme_blowup_long(frame_type: FrameType) -> None: + """ + Loose sanity guard. + + The quantizer is lossy, but it should not produce reconstructions with + catastrophic peak/energy growth compared to the input. + """ + NB = _nbands(frame_type) + rng = np.random.default_rng(11) + + X = rng.normal(size=(1024, 1)).astype(np.float64) + SMR = np.ones((NB, 1), dtype=np.float64) * 10.0 + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + + in_peak = float(np.max(np.abs(X))) + out_peak = float(np.max(np.abs(Xhat))) + peak_ratio = out_peak / (in_peak + EPS) + + in_energy = float(np.sum(X * X)) + out_energy = float(np.sum(Xhat * Xhat)) + energy_ratio = out_energy / (in_energy + EPS) + + # Very loose thresholds: only catch severe regressions. + assert peak_ratio < 100.0 + assert energy_ratio < 1e4 + + +def test_quantizer_sanity_no_extreme_blowup_esh() -> None: + """ + Same loose sanity guard for ESH mode. + """ + frame_type: FrameType = "ESH" + NB = _nbands(frame_type) + rng = np.random.default_rng(12) + + X = rng.normal(size=(128, 8)).astype(np.float64) + SMR = np.ones((NB, 8), dtype=np.float64) * 10.0 + + S, sfc, G = aac_quantizer(X, frame_type, SMR) + Xhat = aac_i_quantizer(S, sfc, G, frame_type) + + in_peak = float(np.max(np.abs(X))) + out_peak = float(np.max(np.abs(Xhat))) + peak_ratio = out_peak / (in_peak + EPS) + + in_energy = float(np.sum(X * X)) + out_energy = float(np.sum(Xhat * Xhat)) + energy_ratio = out_energy / (in_energy + EPS) + + assert peak_ratio < 100.0 + assert energy_ratio < 1e4 diff --git a/source/core/tests/test_SSC.py b/source/core/tests/test_ssc.py similarity index 92% rename from source/core/tests/test_SSC.py rename to source/core/tests/test_ssc.py index 91bcf21..a4d6926 100644 --- a/source/core/tests/test_SSC.py +++ b/source/core/tests/test_ssc.py @@ -16,7 +16,7 @@ from __future__ import annotations import numpy as np -from core.aac_ssc import aac_SSC +from core.aac_ssc import aac_ssc from core.aac_types import FrameT # ----------------------------------------------------------------------------- @@ -117,10 +117,10 @@ def test_ssc_fixed_cases_prev_lss_and_lps() -> None: next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True) - out1 = aac_SSC(frame_t, next_attack, "LSS") + out1 = aac_ssc(frame_t, next_attack, "LSS") assert out1 == "ESH" - out2 = aac_SSC(frame_t, next_attack, "LPS") + out2 = aac_ssc(frame_t, next_attack, "LPS") assert out2 == "OLS" @@ -138,7 +138,7 @@ def test_prev_ols_next_not_esh_returns_ols() -> None: frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) next_t = _next_frame_no_attack() - out = aac_SSC(frame_t, next_t, "OLS") + out = aac_ssc(frame_t, next_t, "OLS") assert out == "OLS" @@ -151,7 +151,7 @@ def test_prev_ols_next_esh_both_channels_returns_lss() -> None: frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) - out = aac_SSC(frame_t, next_t, "OLS") + out = aac_ssc(frame_t, next_t, "OLS") assert out == "LSS" @@ -165,11 +165,11 @@ def test_prev_ols_next_esh_one_channel_returns_lss() -> None: frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) - out1 = aac_SSC(frame_t, next1_t, "OLS") + out1 = aac_ssc(frame_t, next1_t, "OLS") assert out1 == "LSS" next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) - out2 = aac_SSC(frame_t, next2_t, "OLS") + out2 = aac_ssc(frame_t, next2_t, "OLS") assert out2 == "LSS" @@ -182,7 +182,7 @@ def test_prev_esh_next_esh_both_channels_returns_esh() -> None: frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) - out = aac_SSC(frame_t, next_t, "ESH") + out = aac_ssc(frame_t, next_t, "ESH") assert out == "ESH" @@ -195,7 +195,7 @@ def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None: frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) next_t = _next_frame_no_attack() - out = aac_SSC(frame_t, next_t, "ESH") + out = aac_ssc(frame_t, next_t, "ESH") assert out == "LPS" @@ -209,11 +209,11 @@ def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None: frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) - out1 = aac_SSC(frame_t, next1_t, "ESH") + out1 = aac_ssc(frame_t, next1_t, "ESH") assert out1 == "ESH" next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) - out2 = aac_SSC(frame_t, next2_t, "ESH") + out2 = aac_ssc(frame_t, next2_t, "ESH") assert out2 == "ESH" @@ -230,5 +230,5 @@ def test_threshold_s_must_exceed_1e_3() -> None: frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) next_t = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01) - out = aac_SSC(frame_t, next_t, "OLS") + out = aac_ssc(frame_t, next_t, "OLS") assert out == "OLS" diff --git a/source/core/tests/test_tns.py b/source/core/tests/test_tns.py index 410b4dd..08407c5 100644 --- a/source/core/tests/test_tns.py +++ b/source/core/tests/test_tns.py @@ -26,6 +26,7 @@ from core.aac_configuration import PRED_ORDER, QUANT_MAX, QUANT_STEP from core.aac_tns import aac_tns, aac_i_tns from core.aac_types import * +EPS = 1e-12 # ----------------------------------------------------------------------------- # Helper utilities @@ -194,3 +195,132 @@ def test_tns_outputs_are_finite() -> None: out_esh, coeffs_esh = aac_tns(frame_F_esh, "ESH") assert np.isfinite(out_esh).all() assert np.isfinite(coeffs_esh).all() + + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_tns_zero_input_is_identity_long(frame_type: FrameType) -> None: + """ + Edge case: zero MDCT coefficients should remain zero after TNS and iTNS. + This checks that no NaN/Inf appears and the pipeline is numerically safe. + """ + frame_F_in = np.zeros((1024, 1), dtype=np.float64) + + frame_F_tns, tns_coeffs = aac_tns(frame_F_in, frame_type) + assert np.isfinite(frame_F_tns).all() + assert np.isfinite(tns_coeffs).all() + assert np.all(frame_F_tns == 0.0) + + frame_F_hat = aac_i_tns(frame_F_tns, frame_type, tns_coeffs) + assert np.isfinite(frame_F_hat).all() + assert np.all(frame_F_hat == 0.0) + + +def test_tns_zero_input_is_identity_esh() -> None: + """ + Edge case: zero MDCT coefficients should remain zero for ESH too. + """ + frame_F_in = np.zeros((128, 8), dtype=np.float64) + + frame_F_tns, tns_coeffs = aac_tns(frame_F_in, "ESH") + assert np.isfinite(frame_F_tns).all() + assert np.isfinite(tns_coeffs).all() + assert np.all(frame_F_tns == 0.0) + + frame_F_hat = aac_i_tns(frame_F_tns, "ESH", tns_coeffs) + assert np.isfinite(frame_F_hat).all() + assert np.all(frame_F_hat == 0.0) + + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_tns_near_silence_is_finite_and_roundtrips(frame_type: FrameType) -> None: + """ + Edge case: extremely small values should not cause NaN/Inf, + and round-trip should remain close. + """ + frame_F_in = (1e-15 * np.ones((1024, 1), dtype=np.float64)) + + frame_F_tns, tns_coeffs = aac_tns(frame_F_in, frame_type) + assert np.isfinite(frame_F_tns).all() + assert np.isfinite(tns_coeffs).all() + + frame_F_hat = aac_i_tns(frame_F_tns, frame_type, tns_coeffs) + assert np.isfinite(frame_F_hat).all() + + np.testing.assert_allclose(frame_F_hat, frame_F_in, rtol=1e-6, atol=1e-12) + + +def test_tns_near_silence_esh_is_finite_and_roundtrips() -> None: + """ + Near-silence test for ESH mode. + """ + frame_F_in = (1e-15 * np.ones((128, 8), dtype=np.float64)) + + frame_F_tns, tns_coeffs = aac_tns(frame_F_in, "ESH") + assert np.isfinite(frame_F_tns).all() + assert np.isfinite(tns_coeffs).all() + + frame_F_hat = aac_i_tns(frame_F_tns, "ESH", tns_coeffs) + assert np.isfinite(frame_F_hat).all() + + np.testing.assert_allclose(frame_F_hat, frame_F_in, rtol=1e-6, atol=1e-12) + + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_tns_accepts_flat_vector_shape_long(frame_type: FrameType) -> None: + """ + Contract test: for non-ESH, aac_tns must accept input shape (1024,) + in addition to (1024, 1), and preserve the shape convention. + """ + rng = np.random.default_rng(7) + frame_F_in = rng.normal(size=(1024,)).astype(np.float64) + + frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type) + + assert frame_F_out.shape == (1024,) + assert tns_coeffs.shape == (PRED_ORDER, 1) + + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_tns_does_not_explode_peak_or_energy_long(frame_type: FrameType) -> None: + """ + Sanity: TNS should not cause extreme peak/energy blow-up on typical inputs. + This is a loose guard to catch regressions. + """ + rng = np.random.default_rng(8) + frame_F_in = rng.normal(size=(1024, 1)).astype(np.float64) + + in_peak = float(np.max(np.abs(frame_F_in))) + in_energy = float(np.sum(frame_F_in * frame_F_in)) + + frame_F_out, _ = aac_tns(frame_F_in, frame_type) + + out_peak = float(np.max(np.abs(frame_F_out))) + out_energy = float(np.sum(frame_F_out * frame_F_out)) + + peak_ratio = out_peak / (in_peak + EPS) + energy_ratio = out_energy / (in_energy + EPS) + + assert peak_ratio < 50.0 + assert energy_ratio < 2500.0 + + +def test_tns_does_not_explode_peak_or_energy_esh() -> None: + """ + Sanity: same blow-up guard for ESH mode. + """ + rng = np.random.default_rng(9) + frame_F_in = rng.normal(size=(128, 8)).astype(np.float64) + + in_peak = float(np.max(np.abs(frame_F_in))) + in_energy = float(np.sum(frame_F_in * frame_F_in)) + + frame_F_out, _ = aac_tns(frame_F_in, "ESH") + + out_peak = float(np.max(np.abs(frame_F_out))) + out_energy = float(np.sum(frame_F_out * frame_F_out)) + + peak_ratio = out_peak / (in_peak + EPS) + energy_ratio = out_energy / (in_energy + EPS) + + assert peak_ratio < 50.0 + assert energy_ratio < 2500.0 diff --git a/source/level_1/core/aac_coder.py b/source/level_1/core/aac_coder.py index fc3f8ac..4c6158c 100644 --- a/source/level_1/core/aac_coder.py +++ b/source/level_1/core/aac_coder.py @@ -9,8 +9,16 @@ # cchoutou@ece.auth.gr # # Description: -# - Level 1 AAC encoder orchestration. -# - Level 2 AAC encoder orchestration. +# Level 1 AAC encoder orchestration. +# Keeps the same functional behavior as the original level_1 implementation: +# - Reads WAV via soundfile +# - Validates stereo and 48 kHz +# - Frames into 2048 samples with hop=1024 and zero padding at both ends +# - SSC decision uses next-frame attack detection +# - Filterbank analysis (MDCT) +# - Stores per-channel spectra in AACSeq1 schema: +# * ESH: (128, 8) +# * else: (1024, 1) # ------------------------------------------------------------ from __future__ import annotations @@ -18,14 +26,106 @@ from pathlib import Path from typing import Union import soundfile as sf +from scipy.io import savemat from core.aac_configuration import WIN_TYPE from core.aac_filterbank import aac_filter_bank -from core.aac_ssc import aac_SSC +from core.aac_ssc import aac_ssc from core.aac_tns import aac_tns +from core.aac_psycho import aac_psycho +from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py +from core.aac_huffman import aac_encode_huff +from core.aac_utils import get_table, band_limits +from material.huff_utils import load_LUT + from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Helpers for thresholds (T(b)) +# ----------------------------------------------------------------------------- + +def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]: + """ + Return inclusive (lo, hi) band slices derived from TableB219. + """ + table, _ = get_table(frame_type) + wlow, whigh, _bval, _qthr_db = band_limits(table) + return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)] + + +def _thresholds_from_smr( + frame_F_ch: FrameChannelF, + frame_type: FrameType, + SMR: FloatArray, +) -> FloatArray: + """ + Compute thresholds T(b) = P(b) / SMR(b), where P(b) is band energy. + + Shapes: + - Long: returns (NB, 1) + - ESH: returns (NB, 8) + """ + bands = _band_slices_from_table(frame_type) + NB = len(bands) + + X = np.asarray(frame_F_ch, dtype=np.float64) + SMR = np.asarray(SMR, dtype=np.float64) + + if frame_type == "ESH": + if X.shape != (128, 8): + raise ValueError("For ESH, frame_F_ch must have shape (128, 8).") + if SMR.shape != (NB, 8): + raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).") + + T = np.zeros((NB, 8), dtype=np.float64) + for j in range(8): + Xj = X[:, j] + for b, (lo, hi) in enumerate(bands): + P = float(np.sum(Xj[lo : hi + 1] ** 2)) + smr = float(SMR[b, j]) + T[b, j] = 0.0 if smr <= 1e-12 else (P / smr) + return T + + # Long + if X.shape == (1024,): + Xv = X + elif X.shape == (1024, 1): + Xv = X[:, 0] + else: + raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).") + + if SMR.shape == (NB,): + SMRv = SMR + elif SMR.shape == (NB, 1): + SMRv = SMR[:, 0] + else: + raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).") + + T = np.zeros((NB, 1), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + P = float(np.sum(Xv[lo : hi + 1] ** 2)) + smr = float(SMRv[b]) + T[b, 0] = 0.0 if smr <= 1e-12 else (P / smr) + + return T + +def _normalize_global_gain(G: GlobalGain) -> float | FloatArray: + """ + Normalize GlobalGain to match AACChannelFrameF3["G"] type: + - long: return float + - ESH: return float64 ndarray of shape (1, 8) + """ + if np.isscalar(G): + return float(G) + + G_arr = np.asarray(G) + if G_arr.size == 1: + return float(G_arr.reshape(-1)[0]) + + return np.asarray(G_arr, dtype=np.float64) + # ----------------------------------------------------------------------------- # Public helpers (useful for level_x demo wrappers) # ----------------------------------------------------------------------------- @@ -174,7 +274,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) next_t = np.vstack([next_t, tail]) - frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_f = aac_filter_bank(frame_t, frame_type, win_type) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) @@ -191,10 +291,6 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: return aac_seq -# ----------------------------------------------------------------------------- -# Level 2 encoder -# ----------------------------------------------------------------------------- - def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: """ Level-2 AAC encoder (Level 1 + TNS). @@ -246,7 +342,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) next_t = np.vstack([next_t, tail]) - frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) # Level 1 analysis (packed stereo container) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) @@ -277,4 +373,181 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: prev_frame_type = frame_type - return aac_seq \ No newline at end of file + return aac_seq + + +def aac_coder_3( + filename_in: Union[str, Path], + filename_aac_coded: Union[str, Path] | None = None, +) -> AACSeq3: + """ + Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + filename_aac_coded : Union[str, Path] | None + Optional .mat filename to store aac_seq_3 (assignment convenience). + + Returns + ------- + AACSeq3 + Encoded AAC sequence (Level 3 payload schema). + """ + filename_in = Path(filename_in) + + x, _ = aac_read_wav_stereo_48k(filename_in) + + hop = 1024 + win = 2048 + + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + # Load Huffman LUTs once. + huff_LUT_list = load_LUT() + + aac_seq: AACSeq3 = [] + prev_frame_type: FrameType = "OLS" + + # Pin win_type to the WinType literal for type checkers. + win_type: WinType = WIN_TYPE + + # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. + prev1_L = np.zeros((2048,), dtype=np.float64) + prev2_L = np.zeros((2048,), dtype=np.float64) + prev1_R = np.zeros((2048,), dtype=np.float64) + prev2_R = np.zeros((2048,), dtype=np.float64) + + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start : start + win, :] + if frame_t.shape != (win, 2): + raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop : start + hop + win, :] + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) + + # Analysis filterbank (stereo packed) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) + + # TNS per channel + chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) + chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type) + + # Psychoacoustic model per channel (time-domain) + frame_L = np.asarray(frame_t[:, 0], dtype=np.float64) + frame_R = np.asarray(frame_t[:, 1], dtype=np.float64) + + SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L) + SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R) + + # Thresholds T(b) (stored, not entropy-coded) + T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L) + T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R) + + # Quantizer per channel + S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L) + S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R) + + # Normalize G types for AACSeq3 schema (float | float64 ndarray). + G_Ln = _normalize_global_gain(G_L) + G_Rn = _normalize_global_gain(G_R) + + # Huffman-code ONLY the DPCM differences for b>0. + # sfc[0] corresponds to alpha(0)=G and is stored separately in the frame. + sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] + sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...] + + # Codebook 11: + # maxAbsCodeVal = 16 is RESERVED for ESCAPE. + # We must stay strictly within [-15, +15] to avoid escape decoding. + sf_cb = 11 + sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + + sfc_L_dpcm = np.clip( + sfc_L_dpcm, + -sf_max_abs, + sf_max_abs, + ).astype(np.int64, copy=False) + + sfc_R_dpcm = np.clip( + sfc_R_dpcm, + -sf_max_abs, + sf_max_abs, + ).astype(np.int64, copy=False) + + sfc_L_stream, _ = aac_encode_huff( + sfc_L_dpcm.reshape(-1, order="F"), + huff_LUT_list, + force_codebook=sf_cb, + ) + sfc_R_stream, _ = aac_encode_huff( + sfc_R_dpcm.reshape(-1, order="F"), + huff_LUT_list, + force_codebook=sf_cb, + ) + + mdct_L_stream, cb_L = aac_encode_huff( + np.asarray(S_L, dtype=np.int64).reshape(-1), + huff_LUT_list, + ) + mdct_R_stream, cb_R = aac_encode_huff( + np.asarray(S_R, dtype=np.int64).reshape(-1), + huff_LUT_list, + ) + + # Typed dict construction helps static analyzers validate the schema. + frame_out: AACSeq3Frame = { + "frame_type": frame_type, + "win_type": win_type, + "chl": { + "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), + "T": np.asarray(T_L, dtype=np.float64), + "G": G_Ln, + "sfc": sfc_L_stream, + "stream": mdct_L_stream, + "codebook": int(cb_L), + }, + "chr": { + "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64), + "T": np.asarray(T_R, dtype=np.float64), + "G": G_Rn, + "sfc": sfc_R_stream, + "stream": mdct_R_stream, + "codebook": int(cb_R), + }, + } + aac_seq.append(frame_out) + + # Update psycho history (shift register) + prev2_L = prev1_L + prev1_L = frame_L + prev2_R = prev1_R + prev1_R = frame_R + + prev_frame_type = frame_type + + # Optional: store to .mat for the assignment wrapper + if filename_aac_coded is not None: + filename_aac_coded = Path(filename_aac_coded) + savemat( + str(filename_aac_coded), + {"aac_seq_3": np.array(aac_seq, dtype=object)}, + do_compression=True, + ) + + return aac_seq + diff --git a/source/level_1/core/aac_decoder.py b/source/level_1/core/aac_decoder.py index 5b366b8..ac05621 100644 --- a/source/level_1/core/aac_decoder.py +++ b/source/level_1/core/aac_decoder.py @@ -22,9 +22,22 @@ import soundfile as sf from core.aac_filterbank import aac_i_filter_bank from core.aac_tns import aac_i_tns +from core.aac_quantizer import aac_i_quantizer +from core.aac_huffman import aac_decode_huff +from core.aac_utils import get_table, band_limits +from material.huff_utils import load_LUT from core.aac_types import * +# ----------------------------------------------------------------------------- +# Helper for NB +# ----------------------------------------------------------------------------- +def _nbands(frame_type: FrameType) -> int: + table, _ = get_table(frame_type) + wlow, _whigh, _bval, _qthr_db = band_limits(table) + return int(len(wlow)) + + # ----------------------------------------------------------------------------- # Public helpers # ----------------------------------------------------------------------------- @@ -251,4 +264,145 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS y = aac_remove_padding(y_pad, hop=hop) sf.write(str(filename_out), y, 48000) - return y \ No newline at end of file + return y + + + +def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: + """ + Level-3 AAC decoder (inverse of aac_coder_3). + + Steps per frame: + - Huffman decode scalefactors (sfc) using codebook 11 + - Huffman decode MDCT symbols (stream) using stored codebook + - iQuantizer -> MDCT coefficients after TNS + - iTNS using stored predictor coefficients + - IMDCT filterbank -> time domain + - Overlap-add, remove padding, write WAV + + Parameters + ---------- + aac_seq_3 : AACSeq3 + Encoded sequence as produced by aac_coder_3. + filename_out : Union[str, Path] + Output WAV filename. + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. + """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_3) + + if K <= 0: + raise ValueError("aac_seq_3 must contain at least one frame.") + + # Load Huffman LUTs once. + huff_LUT_list = load_LUT() + + n_pad = (K - 1) * hop + win + y_pad = np.zeros((n_pad, 2), dtype=np.float64) + + for i, fr in enumerate(aac_seq_3): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + NB = _nbands(frame_type) + # We store G separately, so Huffman stream contains only (NB-1) DPCM differences. + sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1) + + # ------------------------- + # Left channel + # ------------------------- + tns_L = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64) + G_L = fr["chl"]["G"] + sfc_bits_L = fr["chl"]["sfc"] + mdct_bits_L = fr["chl"]["stream"] + cb_L = int(fr["chl"]["codebook"]) + + sfc_dec_L = aac_decode_huff(sfc_bits_L, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False) + if frame_type == "ESH": + sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 8, order="F") + sfc_L = np.zeros((NB, 8), dtype=np.int64) + Gv = np.asarray(G_L, dtype=np.float64).reshape(1, 8) + sfc_L[0, :] = Gv[0, :].astype(np.int64) + sfc_L[1:, :] = sfc_dpcm_L + else: + sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 1, order="F") + sfc_L = np.zeros((NB, 1), dtype=np.int64) + sfc_L[0, 0] = int(float(G_L)) + sfc_L[1:, :] = sfc_dpcm_L + + # MDCT symbols: codebook 0 means "all-zero section" + if cb_L == 0: + S_dec_L = np.zeros((1024,), dtype=np.int64) + else: + S_tmp_L = aac_decode_huff(mdct_bits_L, cb_L, huff_LUT_list).astype(np.int64, copy=False) + + # Tuple coding may produce extra trailing symbols; caller knows the true length (1024). + # Also guard against short outputs by zero-padding. + if S_tmp_L.size < 1024: + S_dec_L = np.zeros((1024,), dtype=np.int64) + S_dec_L[: S_tmp_L.size] = S_tmp_L + else: + S_dec_L = S_tmp_L[:1024] + + S_L = S_dec_L.reshape(1024, 1) + + Xq_L = aac_i_quantizer(S_L, sfc_L, G_L, frame_type) + X_L = aac_i_tns(Xq_L, frame_type, tns_L) + + # ------------------------- + # Right channel + # ------------------------- + tns_R = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64) + G_R = fr["chr"]["G"] + sfc_bits_R = fr["chr"]["sfc"] + mdct_bits_R = fr["chr"]["stream"] + cb_R = int(fr["chr"]["codebook"]) + + sfc_dec_R = aac_decode_huff(sfc_bits_R, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False) + if frame_type == "ESH": + sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 8, order="F") + sfc_R = np.zeros((NB, 8), dtype=np.int64) + Gv = np.asarray(G_R, dtype=np.float64).reshape(1, 8) + sfc_R[0, :] = Gv[0, :].astype(np.int64) + sfc_R[1:, :] = sfc_dpcm_R + else: + sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 1, order="F") + sfc_R = np.zeros((NB, 1), dtype=np.int64) + sfc_R[0, 0] = int(float(G_R)) + sfc_R[1:, :] = sfc_dpcm_R + + if cb_R == 0: + S_dec_R = np.zeros((1024,), dtype=np.int64) + else: + S_tmp_R = aac_decode_huff(mdct_bits_R, cb_R, huff_LUT_list).astype(np.int64, copy=False) + + if S_tmp_R.size < 1024: + S_dec_R = np.zeros((1024,), dtype=np.int64) + S_dec_R[: S_tmp_R.size] = S_tmp_R + else: + S_dec_R = S_tmp_R[:1024] + + S_R = S_dec_R.reshape(1024, 1) + + Xq_R = aac_i_quantizer(S_R, sfc_R, G_R, frame_type) + X_R = aac_i_tns(Xq_R, frame_type, tns_R) + + # Re-pack to stereo container and inverse filterbank + frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R)) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) + + start = i * hop + y_pad[start : start + win, :] += frame_t_hat + + y = aac_remove_padding(y_pad, hop=hop) + + sf.write(str(filename_out), y, 48000) + return y + diff --git a/source/level_1/core/aac_ssc.py b/source/level_1/core/aac_ssc.py index 8c082b3..323e9e3 100644 --- a/source/level_1/core/aac_ssc.py +++ b/source/level_1/core/aac_ssc.py @@ -176,7 +176,7 @@ def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType: # Public Function prototypes # ----------------------------------------------------------------------------- -def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: +def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: """ Sequence Segmentation Control (SSC). diff --git a/source/level_1/core/aac_types.py b/source/level_1/core/aac_types.py index 6642e18..00508c6 100644 --- a/source/level_1/core/aac_types.py +++ b/source/level_1/core/aac_types.py @@ -212,6 +212,42 @@ BandValueArray: TypeAlias = FloatArray Per-band psychoacoustic values (e.g. Bark position, thresholds). """ + +# Quantizer-related semantic aliases + +QuantizedSymbols: TypeAlias = NDArray[np.generic] +""" +Quantized MDCT symbols S(k). + +Shapes: +- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols). +""" + +ScaleFactors: TypeAlias = NDArray[np.generic] +""" +DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1). + +Shapes: +- Long frames: (NB, 1) +- ESH frames: (NB, 8) +""" + +GlobalGain: TypeAlias = float | NDArray[np.generic] +""" +Global gain G = alpha(0). + +- Long frames: scalar float +- ESH frames: array shape (1, 8) +""" + +# Huffman semantic aliases + +HuffmanBitstream: TypeAlias = str +"""Huffman-coded bitstream stored as a string of '0'/'1'.""" + +HuffmanCodebook: TypeAlias = int +"""Huffman codebook id (e.g., 0..11).""" + # ----------------------------------------------------------------------------- # Level 1 AAC sequence payload types # ----------------------------------------------------------------------------- @@ -299,3 +335,77 @@ Level 2 adds: and stores: - per-channel "frame_F" after applying TNS. """ + +# ----------------------------------------------------------------------------- +# Level 3 AAC sequence payload types (Quantizer + Huffman) +# ----------------------------------------------------------------------------- + +class AACChannelFrameF3(TypedDict): + """ + Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3). + + Keys + ---- + tns_coeffs: + Quantized TNS predictor coefficients for ONE channel. + Shapes: + - ESH: (PRED_ORDER, 8) + - else: (PRED_ORDER, 1) + + T: + Psychoacoustic thresholds per band. + Shapes: + - ESH: (NB, 8) + - else: (NB, 1) + Note: Stored for completeness / debugging; not entropy-coded. + + G: + Quantized global gains. + Shapes: + - ESH: (1, 8) (one per short subframe) + - else: scalar (or compatible np scalar) + + sfc: + Huffman-coded scalefactor differences (DPCM sequence). + + stream: + Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols). + + codebook: + Huffman codebook id used for MDCT symbols (stream). + (Scalefactors typically use fixed codebook 11 and do not need to store it.) + """ + tns_coeffs: TnsCoeffs + T: FloatArray + G: FloatArray | float + sfc: HuffmanBitstream + stream: HuffmanBitstream + codebook: HuffmanCodebook + + +class AACSeq3Frame(TypedDict): + """ + One frame dictionary element of aac_seq_3 (Level 3). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF3 + chr: AACChannelFrameF3 + + +AACSeq3: TypeAlias = List[AACSeq3Frame] +""" +AAC sequence for Level 3: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" + +Level 3 adds (per channel): +- "tns_coeffs" +- "T" thresholds (not entropy-coded) +- "G" global gain(s) +- "sfc" Huffman-coded scalefactor differences +- "stream" Huffman-coded MDCT quantized symbols +- "codebook" Huffman codebook for MDCT symbols +""" diff --git a/source/level_1/material/TableB219.mat b/source/level_1/material/TableB219.mat deleted file mode 100644 index 7b2e403..0000000 Binary files a/source/level_1/material/TableB219.mat and /dev/null differ diff --git a/source/level_1/material/huffCodebooks.mat b/source/level_1/material/huffCodebooks.mat deleted file mode 100644 index a208b3d..0000000 Binary files a/source/level_1/material/huffCodebooks.mat and /dev/null differ diff --git a/source/level_1/material/huff_utils.py b/source/level_1/material/huff_utils.py deleted file mode 100644 index a8c7fe7..0000000 --- a/source/level_1/material/huff_utils.py +++ /dev/null @@ -1,400 +0,0 @@ -import numpy as np -import scipy.io as sio -import os - -# ------------------ LOAD LUT ------------------ - -def load_LUT(mat_filename=None): - """ - Loads the list of Huffman Codebooks (LUTs) - - Returns: - huffLUT : list (index 1..11 used, index 0 unused) - """ - if mat_filename is None: - current_dir = os.path.dirname(os.path.abspath(__file__)) - mat_filename = os.path.join(current_dir, "huffCodebooks.mat") - - mat = sio.loadmat(mat_filename) - - - huffCodebooks_raw = mat['huffCodebooks'].squeeze() - - huffCodebooks = [] - for i in range(11): - huffCodebooks.append(np.array(huffCodebooks_raw[i])) - - # Build inverse VLC tables - invTable = [None] * 11 - - for i in range(11): - h = huffCodebooks[i][:, 2].astype(int) # column 3 - hlength = huffCodebooks[i][:, 1].astype(int) # column 2 - - hbin = [] - for j in range(len(h)): - hbin.append(format(h[j], f'0{hlength[j]}b')) - - invTable[i] = vlc_table(hbin) - - # Build Huffman LUT dicts - huffLUT = [None] * 12 # index 0 unused - params = [ - (4, 1, True), - (4, 1, True), - (4, 2, False), - (4, 2, False), - (2, 4, True), - (2, 4, True), - (2, 7, False), - (2, 7, False), - (2, 12, False), - (2, 12, False), - (2, 16, False), - ] - - for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1): - huffLUT[i] = { - 'LUT': huffCodebooks[i-1], - 'invTable': invTable[i-1], - 'codebook': i, - 'nTupleSize': nTupleSize, - 'maxAbsCodeVal': maxAbs, - 'signedValues': signed - } - - return huffLUT - -def vlc_table(code_array): - """ - codeArray: list of strings, each string is a Huffman codeword (e.g. '0101') - returns: - h : NumPy array of shape (num_nodes, 3) - columns: - [ next_if_0 , next_if_1 , symbol_index ] - """ - h = np.zeros((1, 3), dtype=int) - - for code_index, code in enumerate(code_array, start=1): - word = [int(bit) for bit in code] - h_index = 0 - - for bit in word: - k = bit - next_node = h[h_index, k] - if next_node == 0: - h = np.vstack([h, [0, 0, 0]]) - new_index = h.shape[0] - 1 - h[h_index, k] = new_index - h_index = new_index - else: - h_index = next_node - - h[h_index, 2] = code_index - - return h - -# ------------------ ENCODE ------------------ - -def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None): - """ - Huffman-encode a sequence of quantized coefficients. - - This function selects the appropriate Huffman codebook based on the - maximum absolute value of the input coefficients, encodes the coefficients - into a binary Huffman bitstream, and returns both the bitstream and the - selected codebook index. - - This is the Python equivalent of the MATLAB `encodeHuff.m` function used - in audio/image coding (e.g., scale factor band encoding). The input - coefficient sequence is grouped into fixed-size tuples as defined by - the chosen Huffman LUT. Zero-padding may be applied internally. - - Parameters - ---------- - coeff_sec : array_like of int - 1-D array of quantized integer coefficients to encode. - Typically corresponds to a "section" or scale-factor band. - - huff_LUT_list : list - List of Huffman lookup-table dictionaries as returned by `loadLUT()`. - Index 1..11 correspond to valid Huffman codebooks. - Index 0 is unused. - - Returns - ------- - huffSec : str - Huffman-encoded bitstream represented as a string of '0' and '1' - characters. - - huffCodebook : int - Index (1..11) of the Huffman codebook used for encoding. - A value of 0 indicates a special all-zero section. - """ - if force_codebook is not None: - return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec) - - maxAbsVal = np.max(np.abs(coeff_sec)) - - if maxAbsVal == 0: - huffCodebook = 0 - huffSec = huff_LUT_code_0() - - elif maxAbsVal == 1: - candidates = [1, 2] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal == 2: - candidates = [3, 4] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal in (3, 4): - candidates = [5, 6] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal in (5, 6, 7): - candidates = [7, 8] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal in (8, 9, 10, 11, 12): - candidates = [9, 10] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal in (13, 14, 15): - huffCodebook = 11 - huffSec = huff_LUT_code_1(huff_LUT_list[huffCodebook], coeff_sec) - - else: - huffCodebook = 11 - huffSec = huff_LUT_code_ESC(huff_LUT_list[huffCodebook], coeff_sec) - - return huffSec, huffCodebook - -def huff_LUT_code_1(huff_LUT, coeff_sec): - LUT = huff_LUT['LUT'] - nTupleSize = huff_LUT['nTupleSize'] - maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] - signedValues = huff_LUT['signedValues'] - - numTuples = int(np.ceil(len(coeff_sec) / nTupleSize)) - - if signedValues: - coeff = coeff_sec + maxAbsCodeVal - base = 2 * maxAbsCodeVal + 1 - else: - coeff = coeff_sec - base = maxAbsCodeVal + 1 - - coeffPad = np.zeros(numTuples * nTupleSize, dtype=int) - coeffPad[:len(coeff)] = coeff - - huffSec = [] - - powers = base ** np.arange(nTupleSize - 1, -1, -1) - - for i in range(numTuples): - nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize] - huffIndex = int(np.abs(nTuple) @ powers) - - hexVal = LUT[huffIndex, 2] - huffLen = LUT[huffIndex, 1] - - bits = format(int(hexVal), f'0{int(huffLen)}b') - - if signedValues: - huffSec.append(bits) - else: - signBits = ''.join('1' if v < 0 else '0' for v in nTuple) - huffSec.append(bits + signBits) - - return ''.join(huffSec) - -def huff_LUT_code_0(): - return '' - -def huff_LUT_code_ESC(huff_LUT, coeff_sec): - LUT = huff_LUT['LUT'] - nTupleSize = huff_LUT['nTupleSize'] - maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] - - numTuples = int(np.ceil(len(coeff_sec) / nTupleSize)) - base = maxAbsCodeVal + 1 - - coeffPad = np.zeros(numTuples * nTupleSize, dtype=int) - coeffPad[:len(coeff_sec)] = coeff_sec - - huffSec = [] - powers = base ** np.arange(nTupleSize - 1, -1, -1) - - for i in range(numTuples): - nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize] - - lnTuple = nTuple.astype(float) - lnTuple[lnTuple == 0] = np.finfo(float).eps - - N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int)) - N = np.maximum(0, N4 - 4) - esc = np.abs(nTuple) > 15 - - nTupleESC = nTuple.copy() - nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16 - - huffIndex = int(np.abs(nTupleESC) @ powers) - - hexVal = LUT[huffIndex, 2] - huffLen = LUT[huffIndex, 1] - - bits = format(int(hexVal), f'0{int(huffLen)}b') - - escSeq = '' - for k in range(nTupleSize): - if esc[k]: - escSeq += '1' * N[k] - escSeq += '0' - escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b') - - signBits = ''.join('1' if v < 0 else '0' for v in nTuple) - huffSec.append(bits + signBits + escSeq) - - return ''.join(huffSec) - -# ------------------ DECODE ------------------ - -def decode_huff(huff_sec, huff_LUT): - """ - Decode a Huffman-encoded stream. - - Parameters - ---------- - huff_sec : array-like of int or str - Huffman encoded stream as a sequence of 0 and 1 (string or list/array). - huff_LUT : dict - Huffman lookup table with keys: - - 'invTable': inverse table (numpy array) - - 'codebook': codebook number - - 'nTupleSize': tuple size - - 'maxAbsCodeVal': maximum absolute code value - - 'signedValues': True/False - - Returns - ------- - decCoeffs : list of int - Decoded quantized coefficients. - """ - - h = huff_LUT['invTable'] - huffCodebook = huff_LUT['codebook'] - nTupleSize = huff_LUT['nTupleSize'] - maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] - signedValues = huff_LUT['signedValues'] - - # Convert string to array of ints - if isinstance(huff_sec, str): - huff_sec = np.array([int(b) for b in huff_sec]) - - eos = False - decCoeffs = [] - streamIndex = 0 - - while not eos: - wordbit = 0 - r = 0 # start at root - - # Decode Huffman word using inverse table - while True: - b = huff_sec[streamIndex + wordbit] - wordbit += 1 - rOld = r - r = h[rOld, b] - if h[r, 0] == 0 and h[r, 1] == 0: - symbolIndex = h[r, 2] - 1 # zero-based - streamIndex += wordbit - break - - # Decode n-tuple magnitudes - if signedValues: - base = 2 * maxAbsCodeVal + 1 - nTupleDec = [] - tmp = symbolIndex - for p in reversed(range(nTupleSize)): - val = tmp // (base ** p) - nTupleDec.append(val - maxAbsCodeVal) - tmp = tmp % (base ** p) - nTupleDec = np.array(nTupleDec) - else: - base = maxAbsCodeVal + 1 - nTupleDec = [] - tmp = symbolIndex - for p in reversed(range(nTupleSize)): - val = tmp // (base ** p) - nTupleDec.append(val) - tmp = tmp % (base ** p) - nTupleDec = np.array(nTupleDec) - - # Apply sign bits - nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize] - nTupleSign = -(np.sign(nTupleSignBits - 0.5)) - streamIndex += nTupleSize - nTupleDec = nTupleDec * nTupleSign - - # Handle escape sequences - escIndex = np.where(np.abs(nTupleDec) == 16)[0] - if huffCodebook == 11 and escIndex.size > 0: - for idx in escIndex: - N = 0 - b = huff_sec[streamIndex] - while b: - N += 1 - b = huff_sec[streamIndex + N] - streamIndex += N - N4 = N + 4 - escape_word = huff_sec[streamIndex:streamIndex + N4] - escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2) - nTupleDec[idx] = escape_value - streamIndex += N4 + 1 - # Apply signs again - nTupleDec[escIndex] *= nTupleSign[escIndex] - - decCoeffs.extend(nTupleDec.tolist()) - - if streamIndex >= len(huff_sec): - eos = True - - return decCoeffs - - diff --git a/source/level_2/core/aac_coder.py b/source/level_2/core/aac_coder.py index fc3f8ac..4c6158c 100644 --- a/source/level_2/core/aac_coder.py +++ b/source/level_2/core/aac_coder.py @@ -9,8 +9,16 @@ # cchoutou@ece.auth.gr # # Description: -# - Level 1 AAC encoder orchestration. -# - Level 2 AAC encoder orchestration. +# Level 1 AAC encoder orchestration. +# Keeps the same functional behavior as the original level_1 implementation: +# - Reads WAV via soundfile +# - Validates stereo and 48 kHz +# - Frames into 2048 samples with hop=1024 and zero padding at both ends +# - SSC decision uses next-frame attack detection +# - Filterbank analysis (MDCT) +# - Stores per-channel spectra in AACSeq1 schema: +# * ESH: (128, 8) +# * else: (1024, 1) # ------------------------------------------------------------ from __future__ import annotations @@ -18,14 +26,106 @@ from pathlib import Path from typing import Union import soundfile as sf +from scipy.io import savemat from core.aac_configuration import WIN_TYPE from core.aac_filterbank import aac_filter_bank -from core.aac_ssc import aac_SSC +from core.aac_ssc import aac_ssc from core.aac_tns import aac_tns +from core.aac_psycho import aac_psycho +from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py +from core.aac_huffman import aac_encode_huff +from core.aac_utils import get_table, band_limits +from material.huff_utils import load_LUT + from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Helpers for thresholds (T(b)) +# ----------------------------------------------------------------------------- + +def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]: + """ + Return inclusive (lo, hi) band slices derived from TableB219. + """ + table, _ = get_table(frame_type) + wlow, whigh, _bval, _qthr_db = band_limits(table) + return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)] + + +def _thresholds_from_smr( + frame_F_ch: FrameChannelF, + frame_type: FrameType, + SMR: FloatArray, +) -> FloatArray: + """ + Compute thresholds T(b) = P(b) / SMR(b), where P(b) is band energy. + + Shapes: + - Long: returns (NB, 1) + - ESH: returns (NB, 8) + """ + bands = _band_slices_from_table(frame_type) + NB = len(bands) + + X = np.asarray(frame_F_ch, dtype=np.float64) + SMR = np.asarray(SMR, dtype=np.float64) + + if frame_type == "ESH": + if X.shape != (128, 8): + raise ValueError("For ESH, frame_F_ch must have shape (128, 8).") + if SMR.shape != (NB, 8): + raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).") + + T = np.zeros((NB, 8), dtype=np.float64) + for j in range(8): + Xj = X[:, j] + for b, (lo, hi) in enumerate(bands): + P = float(np.sum(Xj[lo : hi + 1] ** 2)) + smr = float(SMR[b, j]) + T[b, j] = 0.0 if smr <= 1e-12 else (P / smr) + return T + + # Long + if X.shape == (1024,): + Xv = X + elif X.shape == (1024, 1): + Xv = X[:, 0] + else: + raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).") + + if SMR.shape == (NB,): + SMRv = SMR + elif SMR.shape == (NB, 1): + SMRv = SMR[:, 0] + else: + raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).") + + T = np.zeros((NB, 1), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + P = float(np.sum(Xv[lo : hi + 1] ** 2)) + smr = float(SMRv[b]) + T[b, 0] = 0.0 if smr <= 1e-12 else (P / smr) + + return T + +def _normalize_global_gain(G: GlobalGain) -> float | FloatArray: + """ + Normalize GlobalGain to match AACChannelFrameF3["G"] type: + - long: return float + - ESH: return float64 ndarray of shape (1, 8) + """ + if np.isscalar(G): + return float(G) + + G_arr = np.asarray(G) + if G_arr.size == 1: + return float(G_arr.reshape(-1)[0]) + + return np.asarray(G_arr, dtype=np.float64) + # ----------------------------------------------------------------------------- # Public helpers (useful for level_x demo wrappers) # ----------------------------------------------------------------------------- @@ -174,7 +274,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) next_t = np.vstack([next_t, tail]) - frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_f = aac_filter_bank(frame_t, frame_type, win_type) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) @@ -191,10 +291,6 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: return aac_seq -# ----------------------------------------------------------------------------- -# Level 2 encoder -# ----------------------------------------------------------------------------- - def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: """ Level-2 AAC encoder (Level 1 + TNS). @@ -246,7 +342,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) next_t = np.vstack([next_t, tail]) - frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) # Level 1 analysis (packed stereo container) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) @@ -277,4 +373,181 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: prev_frame_type = frame_type - return aac_seq \ No newline at end of file + return aac_seq + + +def aac_coder_3( + filename_in: Union[str, Path], + filename_aac_coded: Union[str, Path] | None = None, +) -> AACSeq3: + """ + Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + filename_aac_coded : Union[str, Path] | None + Optional .mat filename to store aac_seq_3 (assignment convenience). + + Returns + ------- + AACSeq3 + Encoded AAC sequence (Level 3 payload schema). + """ + filename_in = Path(filename_in) + + x, _ = aac_read_wav_stereo_48k(filename_in) + + hop = 1024 + win = 2048 + + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + # Load Huffman LUTs once. + huff_LUT_list = load_LUT() + + aac_seq: AACSeq3 = [] + prev_frame_type: FrameType = "OLS" + + # Pin win_type to the WinType literal for type checkers. + win_type: WinType = WIN_TYPE + + # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. + prev1_L = np.zeros((2048,), dtype=np.float64) + prev2_L = np.zeros((2048,), dtype=np.float64) + prev1_R = np.zeros((2048,), dtype=np.float64) + prev2_R = np.zeros((2048,), dtype=np.float64) + + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start : start + win, :] + if frame_t.shape != (win, 2): + raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop : start + hop + win, :] + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) + + # Analysis filterbank (stereo packed) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) + + # TNS per channel + chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) + chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type) + + # Psychoacoustic model per channel (time-domain) + frame_L = np.asarray(frame_t[:, 0], dtype=np.float64) + frame_R = np.asarray(frame_t[:, 1], dtype=np.float64) + + SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L) + SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R) + + # Thresholds T(b) (stored, not entropy-coded) + T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L) + T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R) + + # Quantizer per channel + S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L) + S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R) + + # Normalize G types for AACSeq3 schema (float | float64 ndarray). + G_Ln = _normalize_global_gain(G_L) + G_Rn = _normalize_global_gain(G_R) + + # Huffman-code ONLY the DPCM differences for b>0. + # sfc[0] corresponds to alpha(0)=G and is stored separately in the frame. + sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] + sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...] + + # Codebook 11: + # maxAbsCodeVal = 16 is RESERVED for ESCAPE. + # We must stay strictly within [-15, +15] to avoid escape decoding. + sf_cb = 11 + sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + + sfc_L_dpcm = np.clip( + sfc_L_dpcm, + -sf_max_abs, + sf_max_abs, + ).astype(np.int64, copy=False) + + sfc_R_dpcm = np.clip( + sfc_R_dpcm, + -sf_max_abs, + sf_max_abs, + ).astype(np.int64, copy=False) + + sfc_L_stream, _ = aac_encode_huff( + sfc_L_dpcm.reshape(-1, order="F"), + huff_LUT_list, + force_codebook=sf_cb, + ) + sfc_R_stream, _ = aac_encode_huff( + sfc_R_dpcm.reshape(-1, order="F"), + huff_LUT_list, + force_codebook=sf_cb, + ) + + mdct_L_stream, cb_L = aac_encode_huff( + np.asarray(S_L, dtype=np.int64).reshape(-1), + huff_LUT_list, + ) + mdct_R_stream, cb_R = aac_encode_huff( + np.asarray(S_R, dtype=np.int64).reshape(-1), + huff_LUT_list, + ) + + # Typed dict construction helps static analyzers validate the schema. + frame_out: AACSeq3Frame = { + "frame_type": frame_type, + "win_type": win_type, + "chl": { + "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), + "T": np.asarray(T_L, dtype=np.float64), + "G": G_Ln, + "sfc": sfc_L_stream, + "stream": mdct_L_stream, + "codebook": int(cb_L), + }, + "chr": { + "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64), + "T": np.asarray(T_R, dtype=np.float64), + "G": G_Rn, + "sfc": sfc_R_stream, + "stream": mdct_R_stream, + "codebook": int(cb_R), + }, + } + aac_seq.append(frame_out) + + # Update psycho history (shift register) + prev2_L = prev1_L + prev1_L = frame_L + prev2_R = prev1_R + prev1_R = frame_R + + prev_frame_type = frame_type + + # Optional: store to .mat for the assignment wrapper + if filename_aac_coded is not None: + filename_aac_coded = Path(filename_aac_coded) + savemat( + str(filename_aac_coded), + {"aac_seq_3": np.array(aac_seq, dtype=object)}, + do_compression=True, + ) + + return aac_seq + diff --git a/source/level_2/core/aac_decoder.py b/source/level_2/core/aac_decoder.py index 5b366b8..ac05621 100644 --- a/source/level_2/core/aac_decoder.py +++ b/source/level_2/core/aac_decoder.py @@ -22,9 +22,22 @@ import soundfile as sf from core.aac_filterbank import aac_i_filter_bank from core.aac_tns import aac_i_tns +from core.aac_quantizer import aac_i_quantizer +from core.aac_huffman import aac_decode_huff +from core.aac_utils import get_table, band_limits +from material.huff_utils import load_LUT from core.aac_types import * +# ----------------------------------------------------------------------------- +# Helper for NB +# ----------------------------------------------------------------------------- +def _nbands(frame_type: FrameType) -> int: + table, _ = get_table(frame_type) + wlow, _whigh, _bval, _qthr_db = band_limits(table) + return int(len(wlow)) + + # ----------------------------------------------------------------------------- # Public helpers # ----------------------------------------------------------------------------- @@ -251,4 +264,145 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS y = aac_remove_padding(y_pad, hop=hop) sf.write(str(filename_out), y, 48000) - return y \ No newline at end of file + return y + + + +def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: + """ + Level-3 AAC decoder (inverse of aac_coder_3). + + Steps per frame: + - Huffman decode scalefactors (sfc) using codebook 11 + - Huffman decode MDCT symbols (stream) using stored codebook + - iQuantizer -> MDCT coefficients after TNS + - iTNS using stored predictor coefficients + - IMDCT filterbank -> time domain + - Overlap-add, remove padding, write WAV + + Parameters + ---------- + aac_seq_3 : AACSeq3 + Encoded sequence as produced by aac_coder_3. + filename_out : Union[str, Path] + Output WAV filename. + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. + """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_3) + + if K <= 0: + raise ValueError("aac_seq_3 must contain at least one frame.") + + # Load Huffman LUTs once. + huff_LUT_list = load_LUT() + + n_pad = (K - 1) * hop + win + y_pad = np.zeros((n_pad, 2), dtype=np.float64) + + for i, fr in enumerate(aac_seq_3): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + NB = _nbands(frame_type) + # We store G separately, so Huffman stream contains only (NB-1) DPCM differences. + sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1) + + # ------------------------- + # Left channel + # ------------------------- + tns_L = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64) + G_L = fr["chl"]["G"] + sfc_bits_L = fr["chl"]["sfc"] + mdct_bits_L = fr["chl"]["stream"] + cb_L = int(fr["chl"]["codebook"]) + + sfc_dec_L = aac_decode_huff(sfc_bits_L, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False) + if frame_type == "ESH": + sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 8, order="F") + sfc_L = np.zeros((NB, 8), dtype=np.int64) + Gv = np.asarray(G_L, dtype=np.float64).reshape(1, 8) + sfc_L[0, :] = Gv[0, :].astype(np.int64) + sfc_L[1:, :] = sfc_dpcm_L + else: + sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 1, order="F") + sfc_L = np.zeros((NB, 1), dtype=np.int64) + sfc_L[0, 0] = int(float(G_L)) + sfc_L[1:, :] = sfc_dpcm_L + + # MDCT symbols: codebook 0 means "all-zero section" + if cb_L == 0: + S_dec_L = np.zeros((1024,), dtype=np.int64) + else: + S_tmp_L = aac_decode_huff(mdct_bits_L, cb_L, huff_LUT_list).astype(np.int64, copy=False) + + # Tuple coding may produce extra trailing symbols; caller knows the true length (1024). + # Also guard against short outputs by zero-padding. + if S_tmp_L.size < 1024: + S_dec_L = np.zeros((1024,), dtype=np.int64) + S_dec_L[: S_tmp_L.size] = S_tmp_L + else: + S_dec_L = S_tmp_L[:1024] + + S_L = S_dec_L.reshape(1024, 1) + + Xq_L = aac_i_quantizer(S_L, sfc_L, G_L, frame_type) + X_L = aac_i_tns(Xq_L, frame_type, tns_L) + + # ------------------------- + # Right channel + # ------------------------- + tns_R = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64) + G_R = fr["chr"]["G"] + sfc_bits_R = fr["chr"]["sfc"] + mdct_bits_R = fr["chr"]["stream"] + cb_R = int(fr["chr"]["codebook"]) + + sfc_dec_R = aac_decode_huff(sfc_bits_R, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False) + if frame_type == "ESH": + sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 8, order="F") + sfc_R = np.zeros((NB, 8), dtype=np.int64) + Gv = np.asarray(G_R, dtype=np.float64).reshape(1, 8) + sfc_R[0, :] = Gv[0, :].astype(np.int64) + sfc_R[1:, :] = sfc_dpcm_R + else: + sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 1, order="F") + sfc_R = np.zeros((NB, 1), dtype=np.int64) + sfc_R[0, 0] = int(float(G_R)) + sfc_R[1:, :] = sfc_dpcm_R + + if cb_R == 0: + S_dec_R = np.zeros((1024,), dtype=np.int64) + else: + S_tmp_R = aac_decode_huff(mdct_bits_R, cb_R, huff_LUT_list).astype(np.int64, copy=False) + + if S_tmp_R.size < 1024: + S_dec_R = np.zeros((1024,), dtype=np.int64) + S_dec_R[: S_tmp_R.size] = S_tmp_R + else: + S_dec_R = S_tmp_R[:1024] + + S_R = S_dec_R.reshape(1024, 1) + + Xq_R = aac_i_quantizer(S_R, sfc_R, G_R, frame_type) + X_R = aac_i_tns(Xq_R, frame_type, tns_R) + + # Re-pack to stereo container and inverse filterbank + frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R)) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) + + start = i * hop + y_pad[start : start + win, :] += frame_t_hat + + y = aac_remove_padding(y_pad, hop=hop) + + sf.write(str(filename_out), y, 48000) + return y + diff --git a/source/level_2/core/aac_ssc.py b/source/level_2/core/aac_ssc.py index 8c082b3..323e9e3 100644 --- a/source/level_2/core/aac_ssc.py +++ b/source/level_2/core/aac_ssc.py @@ -176,7 +176,7 @@ def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType: # Public Function prototypes # ----------------------------------------------------------------------------- -def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: +def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: """ Sequence Segmentation Control (SSC). diff --git a/source/level_2/core/aac_tns.py b/source/level_2/core/aac_tns.py index 88d9da8..06227b1 100644 --- a/source/level_2/core/aac_tns.py +++ b/source/level_2/core/aac_tns.py @@ -30,9 +30,6 @@ from __future__ import annotations from pathlib import Path from typing import Tuple -import numpy as np -from scipy.io import loadmat - from core.aac_utils import load_b219_tables from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX from core.aac_types import * @@ -377,7 +374,9 @@ def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: sw = _compute_sw(x) eps = 1e-12 - xw = np.where(sw > eps, x / sw, 0.0) + xw = np.zeros_like(x, dtype=np.float64) + mask = sw > eps + np.divide(x, sw, out=xw, where=mask) a = _lpc_coeffs(xw, PRED_ORDER) a_q = _quantize_coeffs(a) diff --git a/source/level_2/core/aac_types.py b/source/level_2/core/aac_types.py index 6642e18..00508c6 100644 --- a/source/level_2/core/aac_types.py +++ b/source/level_2/core/aac_types.py @@ -212,6 +212,42 @@ BandValueArray: TypeAlias = FloatArray Per-band psychoacoustic values (e.g. Bark position, thresholds). """ + +# Quantizer-related semantic aliases + +QuantizedSymbols: TypeAlias = NDArray[np.generic] +""" +Quantized MDCT symbols S(k). + +Shapes: +- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols). +""" + +ScaleFactors: TypeAlias = NDArray[np.generic] +""" +DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1). + +Shapes: +- Long frames: (NB, 1) +- ESH frames: (NB, 8) +""" + +GlobalGain: TypeAlias = float | NDArray[np.generic] +""" +Global gain G = alpha(0). + +- Long frames: scalar float +- ESH frames: array shape (1, 8) +""" + +# Huffman semantic aliases + +HuffmanBitstream: TypeAlias = str +"""Huffman-coded bitstream stored as a string of '0'/'1'.""" + +HuffmanCodebook: TypeAlias = int +"""Huffman codebook id (e.g., 0..11).""" + # ----------------------------------------------------------------------------- # Level 1 AAC sequence payload types # ----------------------------------------------------------------------------- @@ -299,3 +335,77 @@ Level 2 adds: and stores: - per-channel "frame_F" after applying TNS. """ + +# ----------------------------------------------------------------------------- +# Level 3 AAC sequence payload types (Quantizer + Huffman) +# ----------------------------------------------------------------------------- + +class AACChannelFrameF3(TypedDict): + """ + Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3). + + Keys + ---- + tns_coeffs: + Quantized TNS predictor coefficients for ONE channel. + Shapes: + - ESH: (PRED_ORDER, 8) + - else: (PRED_ORDER, 1) + + T: + Psychoacoustic thresholds per band. + Shapes: + - ESH: (NB, 8) + - else: (NB, 1) + Note: Stored for completeness / debugging; not entropy-coded. + + G: + Quantized global gains. + Shapes: + - ESH: (1, 8) (one per short subframe) + - else: scalar (or compatible np scalar) + + sfc: + Huffman-coded scalefactor differences (DPCM sequence). + + stream: + Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols). + + codebook: + Huffman codebook id used for MDCT symbols (stream). + (Scalefactors typically use fixed codebook 11 and do not need to store it.) + """ + tns_coeffs: TnsCoeffs + T: FloatArray + G: FloatArray | float + sfc: HuffmanBitstream + stream: HuffmanBitstream + codebook: HuffmanCodebook + + +class AACSeq3Frame(TypedDict): + """ + One frame dictionary element of aac_seq_3 (Level 3). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF3 + chr: AACChannelFrameF3 + + +AACSeq3: TypeAlias = List[AACSeq3Frame] +""" +AAC sequence for Level 3: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" + +Level 3 adds (per channel): +- "tns_coeffs" +- "T" thresholds (not entropy-coded) +- "G" global gain(s) +- "sfc" Huffman-coded scalefactor differences +- "stream" Huffman-coded MDCT quantized symbols +- "codebook" Huffman codebook for MDCT symbols +""" diff --git a/source/level_2/material/huffCodebooks.mat b/source/level_2/material/huffCodebooks.mat deleted file mode 100644 index a208b3d..0000000 Binary files a/source/level_2/material/huffCodebooks.mat and /dev/null differ diff --git a/source/level_2/material/huff_utils.py b/source/level_2/material/huff_utils.py deleted file mode 100644 index a8c7fe7..0000000 --- a/source/level_2/material/huff_utils.py +++ /dev/null @@ -1,400 +0,0 @@ -import numpy as np -import scipy.io as sio -import os - -# ------------------ LOAD LUT ------------------ - -def load_LUT(mat_filename=None): - """ - Loads the list of Huffman Codebooks (LUTs) - - Returns: - huffLUT : list (index 1..11 used, index 0 unused) - """ - if mat_filename is None: - current_dir = os.path.dirname(os.path.abspath(__file__)) - mat_filename = os.path.join(current_dir, "huffCodebooks.mat") - - mat = sio.loadmat(mat_filename) - - - huffCodebooks_raw = mat['huffCodebooks'].squeeze() - - huffCodebooks = [] - for i in range(11): - huffCodebooks.append(np.array(huffCodebooks_raw[i])) - - # Build inverse VLC tables - invTable = [None] * 11 - - for i in range(11): - h = huffCodebooks[i][:, 2].astype(int) # column 3 - hlength = huffCodebooks[i][:, 1].astype(int) # column 2 - - hbin = [] - for j in range(len(h)): - hbin.append(format(h[j], f'0{hlength[j]}b')) - - invTable[i] = vlc_table(hbin) - - # Build Huffman LUT dicts - huffLUT = [None] * 12 # index 0 unused - params = [ - (4, 1, True), - (4, 1, True), - (4, 2, False), - (4, 2, False), - (2, 4, True), - (2, 4, True), - (2, 7, False), - (2, 7, False), - (2, 12, False), - (2, 12, False), - (2, 16, False), - ] - - for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1): - huffLUT[i] = { - 'LUT': huffCodebooks[i-1], - 'invTable': invTable[i-1], - 'codebook': i, - 'nTupleSize': nTupleSize, - 'maxAbsCodeVal': maxAbs, - 'signedValues': signed - } - - return huffLUT - -def vlc_table(code_array): - """ - codeArray: list of strings, each string is a Huffman codeword (e.g. '0101') - returns: - h : NumPy array of shape (num_nodes, 3) - columns: - [ next_if_0 , next_if_1 , symbol_index ] - """ - h = np.zeros((1, 3), dtype=int) - - for code_index, code in enumerate(code_array, start=1): - word = [int(bit) for bit in code] - h_index = 0 - - for bit in word: - k = bit - next_node = h[h_index, k] - if next_node == 0: - h = np.vstack([h, [0, 0, 0]]) - new_index = h.shape[0] - 1 - h[h_index, k] = new_index - h_index = new_index - else: - h_index = next_node - - h[h_index, 2] = code_index - - return h - -# ------------------ ENCODE ------------------ - -def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None): - """ - Huffman-encode a sequence of quantized coefficients. - - This function selects the appropriate Huffman codebook based on the - maximum absolute value of the input coefficients, encodes the coefficients - into a binary Huffman bitstream, and returns both the bitstream and the - selected codebook index. - - This is the Python equivalent of the MATLAB `encodeHuff.m` function used - in audio/image coding (e.g., scale factor band encoding). The input - coefficient sequence is grouped into fixed-size tuples as defined by - the chosen Huffman LUT. Zero-padding may be applied internally. - - Parameters - ---------- - coeff_sec : array_like of int - 1-D array of quantized integer coefficients to encode. - Typically corresponds to a "section" or scale-factor band. - - huff_LUT_list : list - List of Huffman lookup-table dictionaries as returned by `loadLUT()`. - Index 1..11 correspond to valid Huffman codebooks. - Index 0 is unused. - - Returns - ------- - huffSec : str - Huffman-encoded bitstream represented as a string of '0' and '1' - characters. - - huffCodebook : int - Index (1..11) of the Huffman codebook used for encoding. - A value of 0 indicates a special all-zero section. - """ - if force_codebook is not None: - return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec) - - maxAbsVal = np.max(np.abs(coeff_sec)) - - if maxAbsVal == 0: - huffCodebook = 0 - huffSec = huff_LUT_code_0() - - elif maxAbsVal == 1: - candidates = [1, 2] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal == 2: - candidates = [3, 4] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal in (3, 4): - candidates = [5, 6] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal in (5, 6, 7): - candidates = [7, 8] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal in (8, 9, 10, 11, 12): - candidates = [9, 10] - huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) - huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) - if len(huffSec1) <= len(huffSec2): - huffSec = huffSec1 - huffCodebook = candidates[0] - else: - huffSec = huffSec2 - huffCodebook = candidates[1] - - elif maxAbsVal in (13, 14, 15): - huffCodebook = 11 - huffSec = huff_LUT_code_1(huff_LUT_list[huffCodebook], coeff_sec) - - else: - huffCodebook = 11 - huffSec = huff_LUT_code_ESC(huff_LUT_list[huffCodebook], coeff_sec) - - return huffSec, huffCodebook - -def huff_LUT_code_1(huff_LUT, coeff_sec): - LUT = huff_LUT['LUT'] - nTupleSize = huff_LUT['nTupleSize'] - maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] - signedValues = huff_LUT['signedValues'] - - numTuples = int(np.ceil(len(coeff_sec) / nTupleSize)) - - if signedValues: - coeff = coeff_sec + maxAbsCodeVal - base = 2 * maxAbsCodeVal + 1 - else: - coeff = coeff_sec - base = maxAbsCodeVal + 1 - - coeffPad = np.zeros(numTuples * nTupleSize, dtype=int) - coeffPad[:len(coeff)] = coeff - - huffSec = [] - - powers = base ** np.arange(nTupleSize - 1, -1, -1) - - for i in range(numTuples): - nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize] - huffIndex = int(np.abs(nTuple) @ powers) - - hexVal = LUT[huffIndex, 2] - huffLen = LUT[huffIndex, 1] - - bits = format(int(hexVal), f'0{int(huffLen)}b') - - if signedValues: - huffSec.append(bits) - else: - signBits = ''.join('1' if v < 0 else '0' for v in nTuple) - huffSec.append(bits + signBits) - - return ''.join(huffSec) - -def huff_LUT_code_0(): - return '' - -def huff_LUT_code_ESC(huff_LUT, coeff_sec): - LUT = huff_LUT['LUT'] - nTupleSize = huff_LUT['nTupleSize'] - maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] - - numTuples = int(np.ceil(len(coeff_sec) / nTupleSize)) - base = maxAbsCodeVal + 1 - - coeffPad = np.zeros(numTuples * nTupleSize, dtype=int) - coeffPad[:len(coeff_sec)] = coeff_sec - - huffSec = [] - powers = base ** np.arange(nTupleSize - 1, -1, -1) - - for i in range(numTuples): - nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize] - - lnTuple = nTuple.astype(float) - lnTuple[lnTuple == 0] = np.finfo(float).eps - - N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int)) - N = np.maximum(0, N4 - 4) - esc = np.abs(nTuple) > 15 - - nTupleESC = nTuple.copy() - nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16 - - huffIndex = int(np.abs(nTupleESC) @ powers) - - hexVal = LUT[huffIndex, 2] - huffLen = LUT[huffIndex, 1] - - bits = format(int(hexVal), f'0{int(huffLen)}b') - - escSeq = '' - for k in range(nTupleSize): - if esc[k]: - escSeq += '1' * N[k] - escSeq += '0' - escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b') - - signBits = ''.join('1' if v < 0 else '0' for v in nTuple) - huffSec.append(bits + signBits + escSeq) - - return ''.join(huffSec) - -# ------------------ DECODE ------------------ - -def decode_huff(huff_sec, huff_LUT): - """ - Decode a Huffman-encoded stream. - - Parameters - ---------- - huff_sec : array-like of int or str - Huffman encoded stream as a sequence of 0 and 1 (string or list/array). - huff_LUT : dict - Huffman lookup table with keys: - - 'invTable': inverse table (numpy array) - - 'codebook': codebook number - - 'nTupleSize': tuple size - - 'maxAbsCodeVal': maximum absolute code value - - 'signedValues': True/False - - Returns - ------- - decCoeffs : list of int - Decoded quantized coefficients. - """ - - h = huff_LUT['invTable'] - huffCodebook = huff_LUT['codebook'] - nTupleSize = huff_LUT['nTupleSize'] - maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] - signedValues = huff_LUT['signedValues'] - - # Convert string to array of ints - if isinstance(huff_sec, str): - huff_sec = np.array([int(b) for b in huff_sec]) - - eos = False - decCoeffs = [] - streamIndex = 0 - - while not eos: - wordbit = 0 - r = 0 # start at root - - # Decode Huffman word using inverse table - while True: - b = huff_sec[streamIndex + wordbit] - wordbit += 1 - rOld = r - r = h[rOld, b] - if h[r, 0] == 0 and h[r, 1] == 0: - symbolIndex = h[r, 2] - 1 # zero-based - streamIndex += wordbit - break - - # Decode n-tuple magnitudes - if signedValues: - base = 2 * maxAbsCodeVal + 1 - nTupleDec = [] - tmp = symbolIndex - for p in reversed(range(nTupleSize)): - val = tmp // (base ** p) - nTupleDec.append(val - maxAbsCodeVal) - tmp = tmp % (base ** p) - nTupleDec = np.array(nTupleDec) - else: - base = maxAbsCodeVal + 1 - nTupleDec = [] - tmp = symbolIndex - for p in reversed(range(nTupleSize)): - val = tmp // (base ** p) - nTupleDec.append(val) - tmp = tmp % (base ** p) - nTupleDec = np.array(nTupleDec) - - # Apply sign bits - nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize] - nTupleSign = -(np.sign(nTupleSignBits - 0.5)) - streamIndex += nTupleSize - nTupleDec = nTupleDec * nTupleSign - - # Handle escape sequences - escIndex = np.where(np.abs(nTupleDec) == 16)[0] - if huffCodebook == 11 and escIndex.size > 0: - for idx in escIndex: - N = 0 - b = huff_sec[streamIndex] - while b: - N += 1 - b = huff_sec[streamIndex + N] - streamIndex += N - N4 = N + 4 - escape_word = huff_sec[streamIndex:streamIndex + N4] - escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2) - nTupleDec[idx] = escape_value - streamIndex += N4 + 1 - # Apply signs again - nTupleDec[escIndex] *= nTupleSign[escIndex] - - decCoeffs.extend(nTupleDec.tolist()) - - if streamIndex >= len(huff_sec): - eos = True - - return decCoeffs - - diff --git a/source/pytest.ini b/source/pytest.ini index cdcbe3c..b6e2820 100644 --- a/source/pytest.ini +++ b/source/pytest.ini @@ -1,4 +1,7 @@ [pytest] pythonpath = . testpaths = - core/tests \ No newline at end of file + core/tests + +filterwarnings = + error::RuntimeWarning \ No newline at end of file