diff --git a/source/core/aac_coder.py b/source/core/aac_coder.py index 837ac34..0b6b939 100644 --- a/source/core/aac_coder.py +++ b/source/core/aac_coder.py @@ -30,6 +30,7 @@ import soundfile as sf from core.aac_configuration import WIN_TYPE from core.aac_filterbank import aac_filter_bank from core.aac_ssc import aac_SSC +from core.aac_tns import aac_tns from core.aac_types import * @@ -144,8 +145,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: AACSeq1 List of encoded frames (Level 1 schema). """ - x, fs = aac_read_wav_stereo_48k(filename_in) - _ = fs # kept for clarity; The assignment assumes 48 kHz + x, _ = aac_read_wav_stereo_48k(filename_in) + # The assignment assumes 48 kHz hop = 1024 win = 2048 @@ -196,3 +197,88 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: prev_frame_type = frame_type return aac_seq + + +def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: + """ + Level-2 AAC encoder (Level 1 + TNS). + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + + Returns + ------- + AACSeq2 + Encoded AAC sequence (Level 2 payload schema). 
+ For each frame i: + - "frame_type": FrameType + - "win_type": WinType + - "chl"/"chr": + - "frame_F": FrameChannelF (after TNS) + - "tns_coeffs": TnsCoeffs + """ + filename_in = Path(filename_in) + + x, _ = aac_read_wav_stereo_48k(filename_in) + # The assignment assumes 48 kHz + + hop = 1024 + win = 2048 + + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + aac_seq: AACSeq2 = [] + prev_frame_type: FrameType = "OLS" + + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start : start + win, :] + if frame_t.shape != (win, 2): + raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop : start + hop + win, :] + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + + # Level 1 analysis (packed stereo container) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) + + # Unpack to per-channel (as you already do in Level 1) + if frame_type == "ESH": + chl_f = np.empty((128, 8), dtype=np.float64) + chr_f = np.empty((128, 8), dtype=np.float64) + for j in range(8): + chl_f[:, j] = frame_f_stereo[:, 2 * j + 0] + chr_f[:, j] = frame_f_stereo[:, 2 * j + 1] + else: + chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False) + chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False) + + # Level 2: apply TNS per channel + chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) + chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type) + + aac_seq.append( + { + "frame_type": frame_type, + "win_type": WIN_TYPE, + "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs}, + "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, + } + ) + + prev_frame_type = 
frame_type + + return aac_seq \ No newline at end of file diff --git a/source/core/aac_configuration.py b/source/core/aac_configuration.py index 262e884..bb75d2e 100644 --- a/source/core/aac_configuration.py +++ b/source/core/aac_configuration.py @@ -17,6 +17,15 @@ from __future__ import annotations # Imports from core.aac_types import WinType +# Filterbank +# ------------------------------------------------------------ # Window type # Options: "SIN", "KBD" -WIN_TYPE: WinType = "SIN" \ No newline at end of file +WIN_TYPE: WinType = "SIN" + + +# TNS +# ------------------------------------------------------------ +PRED_ORDER = 4 +QUANT_STEP = 0.1 +QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7] \ No newline at end of file diff --git a/source/core/aac_decoder.py b/source/core/aac_decoder.py index eb30011..606ce78 100644 --- a/source/core/aac_decoder.py +++ b/source/core/aac_decoder.py @@ -28,6 +28,7 @@ from typing import Union import soundfile as sf from core.aac_filterbank import aac_i_filter_bank +from core.aac_tns import aac_i_tns from core.aac_types import * @@ -164,3 +165,93 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS sf.write(str(filename_out), y, 48000) return y + + +def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: + """ + Level-2 AAC decoder (inverse of aac_coder_2). + + Behavior matches Level 1 decoder pipeline, with additional iTNS stage: + - Per frame/channel: inverse TNS using stored coefficients + - Re-pack to stereo frame_F + - IMDCT + windowing + - Overlap-add over frames + - Remove Level-1 padding (hop samples start/end) + - Write output WAV (48 kHz) + + Parameters + ---------- + aac_seq_2 : AACSeq2 + Encoded sequence as produced by aac_coder_2(). + filename_out : Union[str, Path] + Output WAV filename. + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. 
+ """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_2) + + if K <= 0: + raise ValueError("aac_seq_2 must contain at least one frame.") + + n_pad = (K - 1) * hop + win + y_pad = np.zeros((n_pad, 2), dtype=np.float64) + + for i, fr in enumerate(aac_seq_2): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) + chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) + + chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64) + chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64) + + # Inverse TNS per channel + chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs) + chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs) + + # Re-pack to the stereo container expected by aac_i_filter_bank + if frame_type == "ESH": + if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): + raise ValueError("ESH channel frame_F must have shape (128, 8).") + + frame_f: FrameF = np.empty((128, 16), dtype=np.float64) + for j in range(8): + frame_f[:, 2 * j + 0] = chl_f[:, j] + frame_f[:, 2 * j + 1] = chr_f[:, j] + else: + # Accept either (1024,1) or (1024,) from your internal convention. 
+ if chl_f.shape == (1024,): + chl_col = chl_f.reshape(1024, 1) + elif chl_f.shape == (1024, 1): + chl_col = chl_f + else: + raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).") + + if chr_f.shape == (1024,): + chr_col = chr_f.reshape(1024, 1) + elif chr_f.shape == (1024, 1): + chr_col = chr_f + else: + raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).") + + frame_f = np.empty((1024, 2), dtype=np.float64) + frame_f[:, 0] = chl_col[:, 0] + frame_f[:, 1] = chr_col[:, 0] + + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) + + start = i * hop + y_pad[start : start + win, :] += frame_t_hat + + y = aac_remove_padding(y_pad, hop=hop) + + sf.write(str(filename_out), y, 48000) + return y \ No newline at end of file diff --git a/source/core/aac_snr_db.py b/source/core/aac_snr_db.py new file mode 100644 index 0000000..21e5184 --- /dev/null +++ b/source/core/aac_snr_db.py @@ -0,0 +1,60 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - SNR dB calculator +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module implements SNR calculation in dB +# ------------------------------------------------------------ +from __future__ import annotations + +from core.aac_types import StereoSignal +import numpy as np + +def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Compute overall SNR (dB) over all samples and channels after aligning lengths. + + Parameters + ---------- + x_ref : StereoSignal + Reference stereo stream. + x_hat : StereoSignal + Reconstructed stereo stream. + + Returns + ------- + float + SNR in dB. + - Returns +inf if noise power is zero. + - Returns -inf if signal power is zero. 
+ """ + x_ref = np.asarray(x_ref, dtype=np.float64) + x_hat = np.asarray(x_hat, dtype=np.float64) + + if x_ref.ndim == 1: + x_ref = x_ref.reshape(-1, 1) + if x_hat.ndim == 1: + x_hat = x_hat.reshape(-1, 1) + + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) + + x_ref = x_ref[:n, :c] + x_hat = x_hat[:n, :c] + + err = x_ref - x_hat + ps = float(np.sum(x_ref * x_ref)) + pn = float(np.sum(err * err)) + + if pn <= 0.0: + return float("inf") + if ps <= 0.0: + return float("-inf") + + return float(10.0 * np.log10(ps / pn)) \ No newline at end of file diff --git a/source/core/aac_tns.py b/source/core/aac_tns.py new file mode 100644 index 0000000..29e9921 --- /dev/null +++ b/source/core/aac_tns.py @@ -0,0 +1,549 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Temporal Noise Shaping (TNS) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Temporal Noise Shaping (TNS) module (Level 2). +# +# Public API: +# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type) +# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs) +# +# Notes (per assignment): +# - TNS is applied per channel (not stereo). +# - For ESH, TNS is applied independently to each of the 8 short subframes. +# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short) +# provided in TableB219.mat. +# - Predictor order is fixed to p = 4. +# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1. +# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p +# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients. 
+# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Tuple + +import numpy as np +from scipy.io import loadmat + +from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Private helpers +# ----------------------------------------------------------------------------- + +_B219_CACHE: dict[str, FloatArray] | None = None + + +def _load_b219_tables() -> dict[str, FloatArray]: + """ + Load TableB219.mat and cache the contents. + + The project layout guarantees that a 'material' directory is discoverable + from the current working directory (tests and level_123 entrypoints). + + Returns + ------- + dict[str, FloatArray] + Keys: + - "B219a": long bands table (for K=1024 MDCT lines) + - "B219b": short bands table (for K=128 MDCT lines) + """ + global _B219_CACHE + if _B219_CACHE is not None: + return _B219_CACHE + + mat_path = Path("material") / "TableB219.mat" + if not mat_path.exists(): + raise FileNotFoundError("Could not locate material/TableB219.mat in the current working directory.") + + d = loadmat(str(mat_path)) + if "B219a" not in d or "B219b" not in d: + raise ValueError("TableB219.mat missing required variables B219a and/or B219b.") + + _B219_CACHE = { + "B219a": np.asarray(d["B219a"], dtype=np.float64), + "B219b": np.asarray(d["B219b"], dtype=np.float64), + } + return _B219_CACHE + + +def _band_ranges_for_kcount(k_count: int) -> BandRanges: + """ + Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count. + + Parameters + ---------- + k_count : int + Number of MDCT lines: + - 1024 for long frames + - 128 for short subframes (ESH) + + Returns + ------- + BandRanges (list[tuple[int, int]]) + Each tuple is (start_k, end_k) inclusive. 
+ """ + tables = _load_b219_tables() + if k_count == 1024: + tbl = tables["B219a"] + elif k_count == 128: + tbl = tables["B219b"] + else: + raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).") + + start = tbl[:, 1].astype(int) + end = tbl[:, 2].astype(int) + + ranges: list[tuple[int, int]] = [(int(s), int(e)) for s, e in zip(start, end)] + + for s, e in ranges: + if s < 0 or e < s or e >= k_count: + raise ValueError("Invalid band table ranges for given k_count.") + return ranges + + +# ----------------------------------------------------------------------------- +# Core DSP helpers +# ----------------------------------------------------------------------------- + +def _smooth_sw_inplace(sw: MdctCoeffs) -> None: + """ + Smooth Sw(k) to reduce discontinuities between adjacent Bark bands. + + The assignment applies two passes: + - Backward: Sw(k) = (Sw(k) + Sw(k+1))/2 + - Forward: Sw(k) = (Sw(k) + Sw(k-1))/2 + + Parameters + ---------- + sw : MdctCoeffs + 1-D array of length K (float64). Modified in-place. + """ + k_count = int(sw.shape[0]) + + for k in range(k_count - 2, -1, -1): + sw[k] = 0.5 * (sw[k] + sw[k + 1]) + + for k in range(1, k_count): + sw[k] = 0.5 * (sw[k] + sw[k - 1]) + + +def _compute_sw(x: MdctCoeffs) -> MdctCoeffs: + """ + Compute Sw(k) from band energies P(j) and apply boundary smoothing. + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT line array, length K. + + Returns + ------- + MdctCoeffs + Sw(k), 1-D array of length K, float64. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + k_count = int(x.shape[0]) + + bands = _band_ranges_for_kcount(k_count) + sw = np.zeros(k_count, dtype=np.float64) + + for s, e in bands: + seg = x[s : e + 1] + p_j = float(np.sum(seg * seg)) + sw_val = float(np.sqrt(p_j)) + sw[s : e + 1] = sw_val + + _smooth_sw_inplace(sw) + return sw + + +def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs: + """ + Autocorrelation r(m) for m=0..p. 
+ + Parameters + ---------- + x : MdctCoeffs + 1-D signal. + p : int + Maximum lag. + + Returns + ------- + MdctCoeffs + r, shape (p+1,), float64. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + n = int(x.shape[0]) + + r = np.zeros(p + 1, dtype=np.float64) + for m in range(p + 1): + r[m] = float(np.dot(x[m:], x[: n - m])) + return r + + +def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs: + """ + Solve Yule-Walker normal equations for LPC coefficients of order p. + + Parameters + ---------- + xw : MdctCoeffs + 1-D normalized sequence Xw(k). + p : int + Predictor order. + + Returns + ------- + MdctCoeffs + LPC coefficients a[0..p-1], shape (p,), float64. + """ + r = _autocorr(xw, p) + + R = np.empty((p, p), dtype=np.float64) + for i in range(p): + for j in range(p): + R[i, j] = r[abs(i - j)] + + rhs = r[1 : p + 1].reshape(p) + + reg = 1e-12 + R_reg = R + reg * np.eye(p, dtype=np.float64) + + a = np.linalg.solve(R_reg, rhs) + return a + + +def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs: + """ + Quantize LPC coefficients with uniform symmetric quantizer and clamp. + + Parameters + ---------- + a : MdctCoeffs + LPC coefficient array, shape (p,). + + Returns + ------- + MdctCoeffs + Quantized coefficients, shape (p,), float64. + """ + a = np.asarray(a, dtype=np.float64).reshape(-1) + q = np.round(a / QUANT_STEP) * QUANT_STEP + q = np.clip(q, -QUANT_MAX, QUANT_MAX) + return q.astype(np.float64, copy=False) + + +def _is_inverse_stable(a_q: MdctCoeffs) -> bool: + """ + Check stability of the inverse TNS filter H_TNS^{-1}. + + Forward filter: + H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p + + Inverse filter poles are roots of: + A(z) = 1 - a1 z^-1 - ... - ap z^-p + Multiply by z^p: + z^p - a1 z^{p-1} - ... - ap = 0 + + Stability condition: + all roots satisfy |z| < 1. + + Parameters + ---------- + a_q : MdctCoeffs + Quantized predictor coefficients, shape (p,). + + Returns + ------- + bool + True if stable, else False. 
+ """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + + # Polynomial in z: z^p - a1 z^{p-1} - ... - ap + poly = np.empty(p + 1, dtype=np.float64) + poly[0] = 1.0 + poly[1:] = -a_q + + roots = np.roots(poly) + + # Strictly inside unit circle for stability. Add tiny margin for numeric safety. + margin = 1e-12 + return bool(np.all(np.abs(roots) < (1.0 - margin))) + + +def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs: + """ + Make quantized predictor coefficients stable for inverse filtering. + + Policy: + - If already stable: return as-is. + - Else: iteratively shrink coefficients by gamma and re-quantize to the 0.1 grid. + - If still unstable after attempts: fall back to all-zero coefficients (disable TNS). + + Parameters + ---------- + a_q : MdctCoeffs + Quantized predictor coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Stable quantized coefficients, shape (p,). + """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + + if _is_inverse_stable(a_q): + return a_q + + # Try a few shrinking factors. Re-quantize after shrinking to keep coefficients on-grid. + gammas = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1) + + for g in gammas: + cand = _quantize_coeffs(g * a_q) + if _is_inverse_stable(cand): + return cand + + # Last resort: disable TNS for this vector + return np.zeros_like(a_q, dtype=np.float64) + + +def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: + """ + Apply forward TNS FIR filter: + y[k] = x[k] - sum_{l=1..p} a_l * x[k-l] + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT lines, length K. + a_q : MdctCoeffs + Quantized LPC coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Filtered MDCT lines y, length K. 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + k_count = int(x.shape[0]) + + y = np.zeros(k_count, dtype=np.float64) + for k in range(k_count): + acc = x[k] + for l in range(1, p + 1): + if k - l >= 0: + acc -= a_q[l - 1] * x[k - l] + y[k] = acc + return y + + +def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: + """ + Apply inverse TNS IIR filter: + x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l] + + Parameters + ---------- + y : MdctCoeffs + 1-D MDCT lines after TNS, length K. + a_q : MdctCoeffs + Quantized LPC coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Reconstructed MDCT lines x_hat, length K. + """ + y = np.asarray(y, dtype=np.float64).reshape(-1) + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + k_count = int(y.shape[0]) + + x_hat = np.zeros(k_count, dtype=np.float64) + for k in range(k_count): + acc = y[k] + for l in range(1, p + 1): + if k - l >= 0: + acc += a_q[l - 1] * x_hat[k - l] + x_hat[k] = acc + return x_hat + + +def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: + """ + TNS for a single MDCT vector (one long frame or one short subframe). + + Steps: + 1) Compute Sw(k) from Bark band energies and smooth it. + 2) Normalize: Xw(k) = X(k) / Sw(k) (safe when Sw=0). + 3) Compute LPC coefficients (order p=PRED_ORDER) on Xw. + 4) Quantize coefficients (4-bit symmetric, step QUANT_STEP). + 5) Apply FIR filter on original X(k) using quantized coefficients. + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT vector. + + Returns + ------- + y : MdctCoeffs + TNS-processed MDCT vector (same length). + a_q : MdctCoeffs + Quantized LPC coefficients, shape (PRED_ORDER,). 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + sw = _compute_sw(x) + + eps = 1e-12 + xw = np.where(sw > eps, x / sw, 0.0) + + a = _lpc_coeffs(xw, PRED_ORDER) + a_q = _quantize_coeffs(a) + + # Ensure inverse stability (assignment requirement) + a_q = _stabilize_quantized_coeffs(a_q) + + y = _apply_tns_fir(x, a_q) + return y, a_q + + + +# ----------------------------------------------------------------------------- +# Public Functions (Level 2) +# ----------------------------------------------------------------------------- + +def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]: + """ + Temporal Noise Shaping (TNS) for ONE channel. + + Parameters + ---------- + frame_F_in : FrameChannelF + Per-channel MDCT coefficients. + Expected (typical) shapes: + - If frame_type == "ESH": (128, 8) + - Else: (1024, 1) or (1024,) + + frame_type : FrameType + Frame type code ("OLS", "LSS", "ESH", "LPS"). + + Returns + ------- + frame_F_out : FrameChannelF + Per-channel MDCT coefficients after applying TNS. + Same shape convention as input. + + tns_coeffs : TnsCoeffs + Quantized TNS predictor coefficients. 
+ Expected shapes: + - If frame_type == "ESH": (PRED_ORDER, 8) + - Else: (PRED_ORDER, 1) + """ + x = np.asarray(frame_F_in, dtype=np.float64) + + if frame_type == "ESH": + if x.shape != (128, 8): + raise ValueError("For ESH, frame_F_in must have shape (128, 8).") + + y = np.empty_like(x, dtype=np.float64) + a_out = np.empty((PRED_ORDER, 8), dtype=np.float64) + + for j in range(8): + y[:, j], a_out[:, j] = _tns_one_vector(x[:, j]) + + return y, a_out + + if x.shape == (1024,): + x_vec = x + out_shape = (1024,) + elif x.shape == (1024, 1): + x_vec = x[:, 0] + out_shape = (1024, 1) + else: + raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') + + y_vec, a_q = _tns_one_vector(x_vec) + + if out_shape == (1024,): + y_out = y_vec + else: + y_out = y_vec.reshape(1024, 1) + + a_out = a_q.reshape(PRED_ORDER, 1) + return y_out, a_out + + +def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF: + """ + Inverse Temporal Noise Shaping (iTNS) for ONE channel. + + Parameters + ---------- + frame_F_in : FrameChannelF + Per-channel MDCT coefficients after TNS. + Expected (typical) shapes: + - If frame_type == "ESH": (128, 8) + - Else: (1024, 1) or (1024,) + + frame_type : FrameType + Frame type code ("OLS", "LSS", "ESH", "LPS"). + + tns_coeffs : TnsCoeffs + Quantized TNS predictor coefficients. + Expected shapes: + - If frame_type == "ESH": (PRED_ORDER, 8) + - Else: (PRED_ORDER, 1) + + Returns + ------- + FrameChannelF + Per-channel MDCT coefficients after inverse TNS. + Same shape convention as input frame_F_in. 
+ """ + x = np.asarray(frame_F_in, dtype=np.float64) + a = np.asarray(tns_coeffs, dtype=np.float64) + + if frame_type == "ESH": + if x.shape != (128, 8): + raise ValueError("For ESH, frame_F_in must have shape (128, 8).") + if a.shape != (PRED_ORDER, 8): + raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).") + + y = np.empty_like(x, dtype=np.float64) + for j in range(8): + y[:, j] = _apply_itns_iir(x[:, j], a[:, j]) + return y + + if a.shape != (PRED_ORDER, 1): + raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).") + + if x.shape == (1024,): + x_vec = x + out_shape = (1024,) + elif x.shape == (1024, 1): + x_vec = x[:, 0] + out_shape = (1024, 1) + else: + raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') + + y_vec = _apply_itns_iir(x_vec, a[:, 0]) + + if out_shape == (1024,): + return y_vec + return y_vec.reshape(1024, 1) diff --git a/source/core/aac_types.py b/source/core/aac_types.py index 8094163..b52891a 100644 --- a/source/core/aac_types.py +++ b/source/core/aac_types.py @@ -10,7 +10,6 @@ # # Description: # This module implements Public Type aliases -# # ------------------------------------------------------------ from __future__ import annotations @@ -39,7 +38,7 @@ Window type codes (AAC): """ ChannelKey: TypeAlias = Literal["chl", "chr"] -"""Channel dictionary keys used in Level 1 payloads.""" +"""Channel dictionary keys used in Level payloads.""" # ----------------------------------------------------------------------------- @@ -105,6 +104,40 @@ Examples: dtype: float64 """ +MdctFrameChannel: TypeAlias = FloatArray +""" +Per-channel MDCT container used in Level-1/2 sequences. + +Typical shapes: +- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,) +- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) + +dtype: float64 + +Notes +----- +Some parts of the assignment store long-frame coefficients as a column vector +(1024, 1) to match MATLAB conventions. 
Internally you may also use (1024,) +when convenient, but the semantic meaning is identical. +""" + +TnsCoeffs: TypeAlias = FloatArray +""" +Quantized TNS predictor coefficients (one channel). + +Typical shapes (Level 2): +- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes) +- Else: (4, 1) (order p=4 for the long frame) + +dtype: float64 + +Notes +----- +The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1. +We store the quantized coefficient values as float64 (typically multiples of 0.1) +to keep the pipeline simple and readable. +""" + FrameT: TypeAlias = FloatArray """ @@ -142,17 +175,23 @@ Rationale for ESH (128, 16): dtype: float64 """ -FrameChannelF: TypeAlias = FloatArray +FrameChannelF: TypeAlias = MdctFrameChannel """ -Frequency-domain single-channel frame (MDCT coefficients). +Frequency-domain single-channel MDCT coefficients. -Typical shapes (Level 1): -- If frame_type in {"OLS","LSS","LPS"}: (1024,) -- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) +Typical shapes (Level 1/2): +- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,) +- If frame_type == "ESH": (128, 8) dtype: float64 """ +BandRanges: TypeAlias = list[tuple[int, int]] +""" +Bark-band index ranges [start, end] (inclusive) for MDCT lines. + +Used by TNS to map MDCT indices k to Bark bands. +""" # ----------------------------------------------------------------------------- # Level 1 AAC sequence payload types @@ -168,7 +207,7 @@ class AACChannelFrameF(TypedDict): The MDCT coefficients for ONE channel. Typical shapes: - ESH: (128, 8) (8 short subframes) - - else: (1024, ) + - else: (1024, 1) or (1024,) """ frame_F: FrameChannelF @@ -191,3 +230,53 @@ List of length K (K = number of frames). 
Each element is a dict with keys: - "frame_type", "win_type", "chl", "chr" """ + + +# ----------------------------------------------------------------------------- +# Level 2 AAC sequence payload types (TNS) +# ----------------------------------------------------------------------------- + +class AACChannelFrameF2(TypedDict): + """ + Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2). + + Keys + ---- + frame_F: + The TNS-processed MDCT coefficients for ONE channel. + Typical shapes: + - ESH: (128, 8) + - else: (1024, 1) or (1024,) + tns_coeffs: + Quantized TNS predictor coefficients for ONE channel. + Typical shapes: + - ESH: (PRED_ORDER, 8) + - else: (PRED_ORDER, 1) + """ + frame_F: FrameChannelF + tns_coeffs: TnsCoeffs + + +class AACSeq2Frame(TypedDict): + """ + One frame dictionary element of aac_seq_2 (Level 2). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF2 + chr: AACChannelFrameF2 + + +AACSeq2: TypeAlias = List[AACSeq2Frame] +""" +AAC sequence for Level 2: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" + +Level 2 adds: +- per-channel "tns_coeffs" +and stores: +- per-channel "frame_F" after applying TNS. 
+""" diff --git a/source/core/tests/test_aac_coder_decoder.py b/source/core/tests/test_aac_coder_decoder.py index e8bb669..696c94f 100644 --- a/source/core/tests/test_aac_coder_decoder.py +++ b/source/core/tests/test_aac_coder_decoder.py @@ -19,59 +19,15 @@ import numpy as np import pytest import soundfile as sf -from core.aac_coder import aac_coder_1 -from core.aac_decoder import aac_decoder_1 +from core.aac_coder import aac_coder_1, aac_coder_2, aac_read_wav_stereo_48k +from core.aac_decoder import aac_decoder_1, aac_decoder_2, aac_remove_padding from core.aac_types import * +from core.aac_snr_db import snr_db # Helper "fixtures" for aac_coder_1 / i_aac_coder_1 # ----------------------------------------------------------------------------- -def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: - """ - Compute overall SNR (dB) over all samples and channels after aligning lengths. - - Parameters - ---------- - x_ref : StereoSignal - Reference signal, shape (N, 2) typical. - x_hat : StereoSignal - Reconstructed signal, shape (M, 2) typical. - - Returns - ------- - float - SNR in dB. - - Returns +inf if noise power is zero. - - Returns -inf if signal power is zero. - """ - x_ref = np.asarray(x_ref, dtype=np.float64) - x_hat = np.asarray(x_hat, dtype=np.float64) - - # Be conservative: align lengths and common channels. 
- if x_ref.ndim == 1: - x_ref = x_ref.reshape(-1, 1) - if x_hat.ndim == 1: - x_hat = x_hat.reshape(-1, 1) - - n = min(x_ref.shape[0], x_hat.shape[0]) - c = min(x_ref.shape[1], x_hat.shape[1]) - - x_ref = x_ref[:n, :c] - x_hat = x_hat[:n, :c] - - err = x_ref - x_hat - ps = float(np.sum(x_ref * x_ref)) - pn = float(np.sum(err * err)) - - if pn <= 0.0: - return float("inf") - if ps <= 0.0: - return float("-inf") - - return float(10.0 * np.log10(ps / pn)) - - @pytest.fixture() def tmp_stereo_wav(tmp_path: Path) -> Path: """ @@ -89,6 +45,56 @@ def tmp_stereo_wav(tmp_path: Path) -> Path: return wav_path +# ----------------------------------------------------------------------------- +# Helper-function tests +# ----------------------------------------------------------------------------- + +def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None: + """ + Contract test for aac_read_wav_stereo_48k(): + - Reads stereo WAV + - Returns float64 array with shape (N,2) + - Returns fs = 48000 + """ + x, fs = aac_read_wav_stereo_48k(tmp_stereo_wav) + + assert int(fs) == 48000 + assert isinstance(x, np.ndarray) + assert x.dtype == np.float64 + assert x.ndim == 2 + assert x.shape[1] == 2 + assert x.shape[0] > 0 + + +def test_aac_remove_padding_removes_hop_from_both_ends() -> None: + """ + Contract test for aac_remove_padding(): + - Removes 'hop' samples from start and end. + """ + hop = 1024 + n = 10000 + + y_pad: StereoSignal = np.zeros((n, 2), dtype=np.float64) + y: StereoSignal = aac_remove_padding(y_pad, hop=hop) + + assert y.shape == (n - 2 * hop, 2) + assert y.dtype == np.float64 + + +def test_aac_remove_padding_errors_on_too_short_input() -> None: + """ + aac_remove_padding must raise if y_pad is shorter than 2*hop. 
+ """ + hop = 1024 + y_pad: StereoSignal = np.zeros((2 * hop - 1, 2), dtype=np.float64) + + with pytest.raises(ValueError): + _ = aac_remove_padding(y_pad, hop=hop) + + +# ----------------------------------------------------------------------------- +# Level 1 tests +# ----------------------------------------------------------------------------- def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: """ Module-level contract test: @@ -152,5 +158,68 @@ def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: P assert int(fs_hat) == 48000 # SNR against returned array (file should match closely, but we do not require it here). - snr = _snr_db(x_ref, x_hat) + snr = snr_db(x_ref, x_hat) assert snr > 80.0 + +# ----------------------------------------------------------------------------- +# Level 2 tests (new) +# ----------------------------------------------------------------------------- + +def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: + """ + Module-level contract test (Level 2): + Ensure aac_seq_2 follows the expected schema and per-frame shapes, including tns_coeffs. 
+ """ + aac_seq: AACSeq2 = aac_coder_2(tmp_stereo_wav) + + assert isinstance(aac_seq, list) + assert len(aac_seq) > 0 + + for fr in aac_seq: + assert "frame_type" in fr + assert "win_type" in fr + assert "chl" in fr + assert "chr" in fr + + frame_type: FrameType = fr["frame_type"] + assert frame_type in ("OLS", "LSS", "ESH", "LPS") + + for ch_key in ("chl", "chr"): + ch = fr[ch_key] + assert "frame_F" in ch + assert "tns_coeffs" in ch + + frame_f = np.asarray(ch["frame_F"], dtype=np.float64) + coeffs = np.asarray(ch["tns_coeffs"], dtype=np.float64) + + if frame_type == "ESH": + assert frame_f.shape == (128, 8) + assert coeffs.shape[0] == 4 + assert coeffs.shape[1] == 8 + else: + assert frame_f.shape == (1024, 1) + assert coeffs.shape == (4, 1) + + +def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None: + """ + End-to-end test (Level 2): + Encode + decode and check SNR remains very high. + + Level 2 is still floating-point (TNS is reversible), so reconstruction + should remain numerical-noise only. 
+ """ + x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True) + x_ref = np.asarray(x_ref, dtype=np.float64) + assert int(fs) == 48000 + + out_wav = tmp_path / "out_l2.wav" + aac_seq = aac_coder_2(tmp_stereo_wav) + x_hat: StereoSignal = aac_decoder_2(aac_seq, out_wav) + + assert out_wav.exists() + _, fs_hat = sf.read(str(out_wav), always_2d=True) + assert int(fs_hat) == 48000 + + snr = snr_db(x_ref, x_hat) + assert snr > 75.0 \ No newline at end of file diff --git a/source/core/tests/test_filterbank.py b/source/core/tests/test_filterbank.py index ad2bd45..e0309ed 100644 --- a/source/core/tests/test_filterbank.py +++ b/source/core/tests/test_filterbank.py @@ -17,6 +17,7 @@ from typing import Sequence import pytest from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank +from core.aac_snr_db import snr_db from core.aac_types import * # Helper fixtures for filterbank @@ -56,20 +57,6 @@ def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type return y -def _snr_db(x: StereoSignal, y: StereoSignal) -> float: - """ - Compute SNR in dB over all samples/channels. 
- """ - err = x - y - ps = float(np.sum(x * x)) - pn = float(np.sum(err * err)) - if pn <= 0.0: - return float("inf") - if ps <= 0.0: - return float("-inf") - return 10.0 * float(np.log10(ps / pn)) - - # ----------------------------------------------------------------------------- # Forward filterbank tests # ----------------------------------------------------------------------------- @@ -223,7 +210,7 @@ def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None: a = 1024 b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) + snr = snr_db(x[a:b, :], y[a:b, :]) assert snr > 50.0 @@ -244,7 +231,7 @@ def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None: a = 1024 b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) + snr = snr_db(x[a:b, :], y[a:b, :]) assert snr > 45.0 @@ -265,5 +252,5 @@ def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None: a = 1024 b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) + snr = snr_db(x[a:b, :], y[a:b, :]) assert snr > 40.0 diff --git a/source/core/tests/test_snr_db.py b/source/core/tests/test_snr_db.py new file mode 100644 index 0000000..639fdd1 --- /dev/null +++ b/source/core/tests/test_snr_db.py @@ -0,0 +1,98 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - SNR dB Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Basic tests for SNR calculation utility. +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np +import pytest + +from core.aac_snr_db import snr_db +from core.aac_types import StereoSignal + + +def test_snr_perfect_reconstruction_returns_inf() -> None: + """ + If x_hat == x_ref exactly, noise power is zero and SNR must be +inf. 
+ """ + rng = np.random.default_rng(0) + x: StereoSignal = rng.normal(size=(1024, 2)).astype(np.float64) + + snr = snr_db(x, x) + assert snr == float("inf") + + +def test_snr_zero_reference_returns_minus_inf() -> None: + """ + If reference signal is identically zero, signal power is zero + and SNR must be -inf (unless noise is also zero, which is degenerate). + """ + x_ref: StereoSignal = np.zeros((1024, 2), dtype=np.float64) + x_hat: StereoSignal = np.ones((1024, 2), dtype=np.float64) + + snr = snr_db(x_ref, x_hat) + assert snr == float("-inf") + + +def test_snr_known_noise_level_matches_expected_value() -> None: + """ + Deterministic test with known signal and noise power. + + Let: + x_ref = ones + x_hat = ones + noise + + With noise variance sigma^2, expected SNR: + 10 * log10(Ps / Pn) + """ + n = 1000 + sigma = 0.1 + + x_ref: StereoSignal = np.ones((n, 2), dtype=np.float64) + noise = sigma * np.ones((n, 2), dtype=np.float64) + x_hat: StereoSignal = x_ref + noise + + ps = float(np.sum(x_ref * x_ref)) + pn = float(np.sum(noise * noise)) + expected = 10.0 * np.log10(ps / pn) + + snr = snr_db(x_ref, x_hat) + assert np.isclose(snr, expected, rtol=1e-12, atol=1e-12) + + +def test_snr_aligns_different_lengths_and_channels() -> None: + """ + The function must: + - align to minimum length + - align to minimum channel count + without crashing. + """ + rng = np.random.default_rng(1) + + x_ref: StereoSignal = rng.normal(size=(1000, 2)).astype(np.float64) + x_hat: StereoSignal = rng.normal(size=(800, 1)).astype(np.float64) + + snr = snr_db(x_ref, x_hat) + assert np.isfinite(snr) + + +def test_snr_accepts_1d_inputs() -> None: + """ + 1-D inputs must be accepted and treated as single-channel signals. 
+ """ + rng = np.random.default_rng(2) + + x_ref = rng.normal(size=1024).astype(np.float64) + x_hat = x_ref + 0.01 * rng.normal(size=1024).astype(np.float64) + + snr = snr_db(x_ref, x_hat) + assert np.isfinite(snr) diff --git a/source/core/tests/test_tns.py b/source/core/tests/test_tns.py new file mode 100644 index 0000000..410b4dd --- /dev/null +++ b/source/core/tests/test_tns.py @@ -0,0 +1,196 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - TNS Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for Temporal Noise Shaping (TNS) module (Level 2). +# +# Validates: +# - I/O shapes for long and ESH modes +# - Quantization grid and clamping of predictor coefficients +# - Inverse-filter stability (all poles inside unit circle) +# - Functional correctness: iTNS(TNS(X)) ≈ X +# ------------------------------------------------------------ + +from __future__ import annotations + +import pytest + +from core.aac_configuration import PRED_ORDER, QUANT_MAX, QUANT_STEP +from core.aac_tns import aac_tns, aac_i_tns +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Helper utilities +# ----------------------------------------------------------------------------- + +def _is_inverse_stable_from_coeffs(a_q: MdctCoeffs) -> bool: + """ + Check stability of the inverse TNS filter H_TNS^{-1}. + + Poles are roots of: + z^p - a1 z^{p-1} - ... - ap = 0 + + Stability condition: + |root| < 1 for all roots. 
+ """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + + poly = np.empty(p + 1, dtype=np.float64) + poly[0] = 1.0 + poly[1:] = -a_q + + roots = np.roots(poly) + margin = 1e-12 + return bool(np.all(np.abs(roots) < (1.0 - margin))) + + +def _assert_quantized_and_clamped(a_q: MdctCoeffs) -> None: + """ + Assert that coefficients: + - lie on the QUANT_STEP grid + - are clamped to [-QUANT_MAX, +QUANT_MAX] + """ + a_q = np.asarray(a_q, dtype=np.float64) + + assert np.max(np.abs(a_q)) <= (QUANT_MAX + 1e-12) + + grid = a_q / float(QUANT_STEP) + assert np.max(np.abs(grid - np.round(grid))) < 1e-12 + + +# ----------------------------------------------------------------------------- +# Shape / contract tests +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_tns_shapes_long_sequences(frame_type: FrameType) -> None: + """ + Contract test (long frames): + - Input shape: (1024, 1) + - Output shape preserved + - Predictor coeffs shape: (PRED_ORDER, 1) + """ + rng = np.random.default_rng(0) + frame_F_in: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64) + + frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type) + + assert frame_F_out.shape == frame_F_in.shape + assert tns_coeffs.shape == (PRED_ORDER, 1) + + +def test_tns_shapes_esh() -> None: + """ + Contract test (ESH): + - Input shape: (128, 8) + - Output shape preserved + - Predictor coeffs shape: (PRED_ORDER, 8) + """ + rng = np.random.default_rng(1) + frame_F_in: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64) + + frame_F_out, tns_coeffs = aac_tns(frame_F_in, "ESH") + + assert frame_F_out.shape == (128, 8) + assert tns_coeffs.shape == (PRED_ORDER, 8) + + +# ----------------------------------------------------------------------------- +# Coefficient properties +# ----------------------------------------------------------------------------- + 
+@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_tns_coeffs_quantized_clamped_and_stable_long(frame_type: FrameType) -> None: + """ + Long-frame predictor coefficients must be: + - quantized on QUANT_STEP grid + - clamped to [-QUANT_MAX, +QUANT_MAX] + - stable for inverse filtering + """ + rng = np.random.default_rng(2) + frame_F_in: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64) + + _, tns_coeffs = aac_tns(frame_F_in, frame_type) + + a_q: MdctCoeffs = tns_coeffs[:, 0] + _assert_quantized_and_clamped(a_q) + assert _is_inverse_stable_from_coeffs(a_q) + + +def test_tns_coeffs_quantized_clamped_and_stable_esh() -> None: + """ + ESH predictor coefficients must satisfy quantization and stability + independently for each of the 8 short subframes. + """ + rng = np.random.default_rng(3) + frame_F_in: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64) + + _, tns_coeffs = aac_tns(frame_F_in, "ESH") + + for j in range(8): + a_q: MdctCoeffs = tns_coeffs[:, j] + _assert_quantized_and_clamped(a_q) + assert _is_inverse_stable_from_coeffs(a_q) + + +# ----------------------------------------------------------------------------- +# Functional correctness (round-trip) +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_tns_roundtrip_long_is_close(frame_type: FrameType) -> None: + """ + Functional test: + iTNS(TNS(X)) ≈ X for long frames. + """ + rng = np.random.default_rng(4) + frame_F_in: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64) + + frame_F_tns, tns_coeffs = aac_tns(frame_F_in, frame_type) + frame_F_hat = aac_i_tns(frame_F_tns, frame_type, tns_coeffs) + + np.testing.assert_allclose(frame_F_hat, frame_F_in, rtol=1e-9, atol=1e-9) + + +def test_tns_roundtrip_esh_is_close() -> None: + """ + Functional test: + iTNS(TNS(X)) ≈ X for ESH frames (8 independent subframes). 
+ """ + rng = np.random.default_rng(5) + frame_F_in: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64) + + frame_F_tns, tns_coeffs = aac_tns(frame_F_in, "ESH") + frame_F_hat = aac_i_tns(frame_F_tns, "ESH", tns_coeffs) + + np.testing.assert_allclose(frame_F_hat, frame_F_in, rtol=1e-9, atol=1e-9) + + +# ----------------------------------------------------------------------------- +# Sanity +# ----------------------------------------------------------------------------- + +def test_tns_outputs_are_finite() -> None: + """ + Sanity test: no NaN or inf in outputs. + """ + rng = np.random.default_rng(6) + + frame_F_long: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64) + out_long, coeffs_long = aac_tns(frame_F_long, "OLS") + assert np.isfinite(out_long).all() + assert np.isfinite(coeffs_long).all() + + frame_F_esh: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64) + out_esh, coeffs_esh = aac_tns(frame_F_esh, "ESH") + assert np.isfinite(out_esh).all() + assert np.isfinite(coeffs_esh).all() diff --git a/source/level_1/core/aac_coder.py b/source/level_1/core/aac_coder.py index 837ac34..0b6b939 100644 --- a/source/level_1/core/aac_coder.py +++ b/source/level_1/core/aac_coder.py @@ -30,6 +30,7 @@ import soundfile as sf from core.aac_configuration import WIN_TYPE from core.aac_filterbank import aac_filter_bank from core.aac_ssc import aac_SSC +from core.aac_tns import aac_tns from core.aac_types import * @@ -144,8 +145,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: AACSeq1 List of encoded frames (Level 1 schema). 
""" - x, fs = aac_read_wav_stereo_48k(filename_in) - _ = fs # kept for clarity; The assignment assumes 48 kHz + x, _ = aac_read_wav_stereo_48k(filename_in) + # The assignment assumes 48 kHz hop = 1024 win = 2048 @@ -196,3 +197,88 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: prev_frame_type = frame_type return aac_seq + + +def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: + """ + Level-2 AAC encoder (Level 1 + TNS). + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + + Returns + ------- + AACSeq2 + Encoded AAC sequence (Level 2 payload schema). + For each frame i: + - "frame_type": FrameType + - "win_type": WinType + - "chl"/"chr": + - "frame_F": FrameChannelF (after TNS) + - "tns_coeffs": TnsCoeffs + """ + filename_in = Path(filename_in) + + x, _ = aac_read_wav_stereo_48k(filename_in) + # The assignment assumes 48 kHz + + hop = 1024 + win = 2048 + + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + aac_seq: AACSeq2 = [] + prev_frame_type: FrameType = "OLS" + + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start : start + win, :] + if frame_t.shape != (win, 2): + raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop : start + hop + win, :] + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + + # Level 1 analysis (packed stereo container) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) + + # Unpack to per-channel (as you already do in Level 1) + if frame_type == "ESH": + chl_f = np.empty((128, 8), dtype=np.float64) + chr_f = np.empty((128, 8), dtype=np.float64) + for j in 
range(8): + chl_f[:, j] = frame_f_stereo[:, 2 * j + 0] + chr_f[:, j] = frame_f_stereo[:, 2 * j + 1] + else: + chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False) + chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False) + + # Level 2: apply TNS per channel + chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) + chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type) + + aac_seq.append( + { + "frame_type": frame_type, + "win_type": WIN_TYPE, + "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs}, + "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, + } + ) + + prev_frame_type = frame_type + + return aac_seq \ No newline at end of file diff --git a/source/level_1/core/aac_configuration.py b/source/level_1/core/aac_configuration.py index 262e884..bb75d2e 100644 --- a/source/level_1/core/aac_configuration.py +++ b/source/level_1/core/aac_configuration.py @@ -17,6 +17,15 @@ from __future__ import annotations # Imports from core.aac_types import WinType +# Filterbank +# ------------------------------------------------------------ # Window type # Options: "SIN", "KBD" -WIN_TYPE: WinType = "SIN" \ No newline at end of file +WIN_TYPE: WinType = "SIN" + + +# TNS +# ------------------------------------------------------------ +PRED_ORDER = 4 +QUANT_STEP = 0.1 +QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7] \ No newline at end of file diff --git a/source/level_1/core/aac_decoder.py b/source/level_1/core/aac_decoder.py index eb30011..606ce78 100644 --- a/source/level_1/core/aac_decoder.py +++ b/source/level_1/core/aac_decoder.py @@ -28,6 +28,7 @@ from typing import Union import soundfile as sf from core.aac_filterbank import aac_i_filter_bank +from core.aac_tns import aac_i_tns from core.aac_types import * @@ -164,3 +165,93 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS sf.write(str(filename_out), y, 48000) return y + + +def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: 
Union[str, Path]) -> StereoSignal: + """ + Level-2 AAC decoder (inverse of aac_coder_2). + + Behavior matches Level 1 decoder pipeline, with additional iTNS stage: + - Per frame/channel: inverse TNS using stored coefficients + - Re-pack to stereo frame_F + - IMDCT + windowing + - Overlap-add over frames + - Remove Level-1 padding (hop samples start/end) + - Write output WAV (48 kHz) + + Parameters + ---------- + aac_seq_2 : AACSeq2 + Encoded sequence as produced by aac_coder_2(). + filename_out : Union[str, Path] + Output WAV filename. + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. + """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_2) + + if K <= 0: + raise ValueError("aac_seq_2 must contain at least one frame.") + + n_pad = (K - 1) * hop + win + y_pad = np.zeros((n_pad, 2), dtype=np.float64) + + for i, fr in enumerate(aac_seq_2): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) + chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) + + chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64) + chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64) + + # Inverse TNS per channel + chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs) + chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs) + + # Re-pack to the stereo container expected by aac_i_filter_bank + if frame_type == "ESH": + if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): + raise ValueError("ESH channel frame_F must have shape (128, 8).") + + frame_f: FrameF = np.empty((128, 16), dtype=np.float64) + for j in range(8): + frame_f[:, 2 * j + 0] = chl_f[:, j] + frame_f[:, 2 * j + 1] = chr_f[:, j] + else: + # Accept either (1024,1) or (1024,) from your internal convention. 
+ if chl_f.shape == (1024,): + chl_col = chl_f.reshape(1024, 1) + elif chl_f.shape == (1024, 1): + chl_col = chl_f + else: + raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).") + + if chr_f.shape == (1024,): + chr_col = chr_f.reshape(1024, 1) + elif chr_f.shape == (1024, 1): + chr_col = chr_f + else: + raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).") + + frame_f = np.empty((1024, 2), dtype=np.float64) + frame_f[:, 0] = chl_col[:, 0] + frame_f[:, 1] = chr_col[:, 0] + + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) + + start = i * hop + y_pad[start : start + win, :] += frame_t_hat + + y = aac_remove_padding(y_pad, hop=hop) + + sf.write(str(filename_out), y, 48000) + return y \ No newline at end of file diff --git a/source/level_1/core/aac_snr_db.py b/source/level_1/core/aac_snr_db.py new file mode 100644 index 0000000..21e5184 --- /dev/null +++ b/source/level_1/core/aac_snr_db.py @@ -0,0 +1,60 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - SNR dB calculator +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module implements SNR calculation in dB +# ------------------------------------------------------------ +from __future__ import annotations + +from core.aac_types import StereoSignal +import numpy as np + +def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Compute overall SNR (dB) over all samples and channels after aligning lengths. + + Parameters + ---------- + x_ref : StereoSignal + Reference stereo stream. + x_hat : StereoSignal + Reconstructed stereo stream. + + Returns + ------- + float + SNR in dB. + - Returns +inf if noise power is zero. + - Returns -inf if signal power is zero. 
+ """ + x_ref = np.asarray(x_ref, dtype=np.float64) + x_hat = np.asarray(x_hat, dtype=np.float64) + + if x_ref.ndim == 1: + x_ref = x_ref.reshape(-1, 1) + if x_hat.ndim == 1: + x_hat = x_hat.reshape(-1, 1) + + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) + + x_ref = x_ref[:n, :c] + x_hat = x_hat[:n, :c] + + err = x_ref - x_hat + ps = float(np.sum(x_ref * x_ref)) + pn = float(np.sum(err * err)) + + if pn <= 0.0: + return float("inf") + if ps <= 0.0: + return float("-inf") + + return float(10.0 * np.log10(ps / pn)) \ No newline at end of file diff --git a/source/level_1/core/aac_types.py b/source/level_1/core/aac_types.py index 8094163..b52891a 100644 --- a/source/level_1/core/aac_types.py +++ b/source/level_1/core/aac_types.py @@ -10,7 +10,6 @@ # # Description: # This module implements Public Type aliases -# # ------------------------------------------------------------ from __future__ import annotations @@ -39,7 +38,7 @@ Window type codes (AAC): """ ChannelKey: TypeAlias = Literal["chl", "chr"] -"""Channel dictionary keys used in Level 1 payloads.""" +"""Channel dictionary keys used in Level payloads.""" # ----------------------------------------------------------------------------- @@ -105,6 +104,40 @@ Examples: dtype: float64 """ +MdctFrameChannel: TypeAlias = FloatArray +""" +Per-channel MDCT container used in Level-1/2 sequences. + +Typical shapes: +- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,) +- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) + +dtype: float64 + +Notes +----- +Some parts of the assignment store long-frame coefficients as a column vector +(1024, 1) to match MATLAB conventions. Internally you may also use (1024,) +when convenient, but the semantic meaning is identical. +""" + +TnsCoeffs: TypeAlias = FloatArray +""" +Quantized TNS predictor coefficients (one channel). 
+ +Typical shapes (Level 2): +- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes) +- Else: (4, 1) (order p=4 for the long frame) + +dtype: float64 + +Notes +----- +The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1. +We store the quantized coefficient values as float64 (typically multiples of 0.1) +to keep the pipeline simple and readable. +""" + FrameT: TypeAlias = FloatArray """ @@ -142,17 +175,23 @@ Rationale for ESH (128, 16): dtype: float64 """ -FrameChannelF: TypeAlias = FloatArray +FrameChannelF: TypeAlias = MdctFrameChannel """ -Frequency-domain single-channel frame (MDCT coefficients). +Frequency-domain single-channel MDCT coefficients. -Typical shapes (Level 1): -- If frame_type in {"OLS","LSS","LPS"}: (1024,) -- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) +Typical shapes (Level 1/2): +- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,) +- If frame_type == "ESH": (128, 8) dtype: float64 """ +BandRanges: TypeAlias = list[tuple[int, int]] +""" +Bark-band index ranges [start, end] (inclusive) for MDCT lines. + +Used by TNS to map MDCT indices k to Bark bands. +""" # ----------------------------------------------------------------------------- # Level 1 AAC sequence payload types @@ -168,7 +207,7 @@ class AACChannelFrameF(TypedDict): The MDCT coefficients for ONE channel. Typical shapes: - ESH: (128, 8) (8 short subframes) - - else: (1024, ) + - else: (1024, 1) or (1024,) """ frame_F: FrameChannelF @@ -191,3 +230,53 @@ List of length K (K = number of frames). Each element is a dict with keys: - "frame_type", "win_type", "chl", "chr" """ + + +# ----------------------------------------------------------------------------- +# Level 2 AAC sequence payload types (TNS) +# ----------------------------------------------------------------------------- + +class AACChannelFrameF2(TypedDict): + """ + Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2). 
+ + Keys + ---- + frame_F: + The TNS-processed MDCT coefficients for ONE channel. + Typical shapes: + - ESH: (128, 8) + - else: (1024, 1) or (1024,) + tns_coeffs: + Quantized TNS predictor coefficients for ONE channel. + Typical shapes: + - ESH: (PRED_ORDER, 8) + - else: (PRED_ORDER, 1) + """ + frame_F: FrameChannelF + tns_coeffs: TnsCoeffs + + +class AACSeq2Frame(TypedDict): + """ + One frame dictionary element of aac_seq_2 (Level 2). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF2 + chr: AACChannelFrameF2 + + +AACSeq2: TypeAlias = List[AACSeq2Frame] +""" +AAC sequence for Level 2: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" + +Level 2 adds: +- per-channel "tns_coeffs" +and stores: +- per-channel "frame_F" after applying TNS. +""" diff --git a/source/level_1/core/tests/test_SSC.py b/source/level_1/core/tests/test_SSC.py deleted file mode 100644 index 91bcf21..0000000 --- a/source/level_1/core/tests/test_SSC.py +++ /dev/null @@ -1,234 +0,0 @@ -# ------------------------------------------------------------ -# AAC Coder/Decoder - Sequence Segmentation Control Tests -# -# Multimedia course at Aristotle University of -# Thessaloniki (AUTh) -# -# Author: -# Christos Choutouridis (ΑΕΜ 8997) -# cchoutou@ece.auth.gr -# -# Description: -# Tests for Sequence Segmentation Control module (SSC). -# ------------------------------------------------------------ - -from __future__ import annotations - -import numpy as np - -from core.aac_ssc import aac_SSC -from core.aac_types import FrameT - -# ----------------------------------------------------------------------------- -# Helper fixtures for SSC -# ----------------------------------------------------------------------------- - -def _next_frame_no_attack() -> FrameT: - """ - Build a next_frame_T that must NOT trigger ESH detection. 
- - Uses exact zeros so all segment energies are zero and the condition - s[l] > 1e-3 cannot hold for any l. - """ - return np.zeros((2048, 2), dtype=np.float64) - - -def _next_frame_strong_attack( - *, - attack_left: bool, - attack_right: bool, - segment_l: int = 4, - baseline: float = 1e-6, - burst_amp: float = 1.0, -) -> FrameT: - """ - Build a next_frame_T (2048x2) that should trigger ESH detection on selected channels. - - Attack criterion (spec): - Attack exists if there exists l in {1..7} such that: - s[l] > 1e-3 and ds[l] > 10, - where s[l] is the energy of segment l (length 128) after high-pass filtering, - and ds[l] = s[l] / s[l-1]. - - Construction: - - A small baseline is added everywhere to avoid relying on the epsilon guard in ds, - keeping ds behavior stable/reproducible. - - A strong burst is added inside a chosen segment l in 1..7. - """ - if not (1 <= segment_l <= 7): - raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.") - - x = np.full((2048, 2), baseline, dtype=np.float64) - - a = segment_l * 128 - b = (segment_l + 1) * 128 - - if attack_left: - x[a:b, 0] += burst_amp - if attack_right: - x[a:b, 1] += burst_amp - - return x - - -def _next_frame_below_s_threshold( - *, - left: bool, - right: bool, - segment_l: int = 4, - impulse_amp: float = 0.01, -) -> FrameT: - """ - Construct a next_frame_T where s[l] is below 1e-3, so ESH must NOT be triggered, - even if the ratio ds[l] could be large. - - We place a single impulse of amplitude 'impulse_amp' inside one segment. - Approx. segment energy: s[l] ~= impulse_amp^2. 
- - Example: - impulse_amp = 0.01 => s[l] ~= 1e-4 < 1e-3 - """ - if not (1 <= segment_l <= 7): - raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.") - - x = np.zeros((2048, 2), dtype=np.float64) - - idx = segment_l * 128 + 10 # inside segment l - if left: - x[idx, 0] = impulse_amp - if right: - x[idx, 1] = impulse_amp - - return x - - -# ----------------------------------------------------------------------------- -# 1) Fixed/mandatory cases (prev frame type forces current type) -# ----------------------------------------------------------------------------- - -def test_ssc_fixed_cases_prev_lss_and_lps() -> None: - """ - Spec: - - If prev was LSS => current MUST be ESH - - If prev was LPS => current MUST be OLS - independent of attack detection on (i+1). - """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - - next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True) - - out1 = aac_SSC(frame_t, next_attack, "LSS") - assert out1 == "ESH" - - out2 = aac_SSC(frame_t, next_attack, "LPS") - assert out2 == "OLS" - - -# ----------------------------------------------------------------------------- -# 2) Cases requiring next-frame ESH prediction (attack computation) -# ----------------------------------------------------------------------------- - -def test_prev_ols_next_not_esh_returns_ols() -> None: - """ - If prev=OLS, current is: - - LSS iff (i+1) is predicted ESH - - else OLS - Here: no attack => expect OLS. 
- """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_no_attack() - - out = aac_SSC(frame_t, next_t, "OLS") - assert out == "OLS" - - -def test_prev_ols_next_esh_both_channels_returns_lss() -> None: - """ - prev=OLS and next predicted ESH for both channels: - per-channel: LSS, LSS - merged: LSS - """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) - - out = aac_SSC(frame_t, next_t, "OLS") - assert out == "LSS" - - -def test_prev_ols_next_esh_one_channel_returns_lss() -> None: - """ - prev=OLS: - - one channel predicts ESH => LSS - - other channel predicts not ESH => OLS - Merge table: OLS + LSS => LSS (either side). - """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - - next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) - out1 = aac_SSC(frame_t, next1_t, "OLS") - assert out1 == "LSS" - - next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) - out2 = aac_SSC(frame_t, next2_t, "OLS") - assert out2 == "LSS" - - -def test_prev_esh_next_esh_both_channels_returns_esh() -> None: - """ - prev=ESH and next predicted ESH for both channels: - per-channel: ESH, ESH - merged: ESH - """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) - - out = aac_SSC(frame_t, next_t, "ESH") - assert out == "ESH" - - -def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None: - """ - prev=ESH and next not predicted ESH for both channels: - per-channel: LPS, LPS - merged: LPS - """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_no_attack() - - out = aac_SSC(frame_t, next_t, "ESH") - assert out == "LPS" - - -def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None: - """ - prev=ESH: - - one channel predicts ESH => ESH - - other channel predicts not ESH => LPS - Merge table: ESH + LPS => ESH 
(either side). - """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - - next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) - out1 = aac_SSC(frame_t, next1_t, "ESH") - assert out1 == "ESH" - - next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) - out2 = aac_SSC(frame_t, next2_t, "ESH") - assert out2 == "ESH" - - -def test_threshold_s_must_exceed_1e_3() -> None: - """ - Spec: next frame is predicted ESH only if: - s[l] > 1e-3 AND ds[l] > 10 - for some l in 1..7. - - This test checks the necessity of the s[l] threshold: - - Create a frame with s[l] ~= 1e-4 < 1e-3 (single impulse with amp 0.01). - - Expect: not classified as ESH -> for prev=OLS return OLS. - """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01) - - out = aac_SSC(frame_t, next_t, "OLS") - assert out == "OLS" diff --git a/source/level_1/core/tests/test_aac_coder_decoder.py b/source/level_1/core/tests/test_aac_coder_decoder.py deleted file mode 100644 index e8bb669..0000000 --- a/source/level_1/core/tests/test_aac_coder_decoder.py +++ /dev/null @@ -1,156 +0,0 @@ -# ------------------------------------------------------------ -# AAC Coder/Decoder - AAC Coder/DecoderTests -# -# Multimedia course at Aristotle University of -# Thessaloniki (AUTh) -# -# Author: -# Christos Choutouridis (ΑΕΜ 8997) -# cchoutou@ece.auth.gr -# -# Description: -# Tests for AAC Coder/Decoder module. 
-# ------------------------------------------------------------ -from __future__ import annotations - -from pathlib import Path - -import numpy as np -import pytest -import soundfile as sf - -from core.aac_coder import aac_coder_1 -from core.aac_decoder import aac_decoder_1 -from core.aac_types import * - - -# Helper "fixtures" for aac_coder_1 / i_aac_coder_1 -# ----------------------------------------------------------------------------- - -def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: - """ - Compute overall SNR (dB) over all samples and channels after aligning lengths. - - Parameters - ---------- - x_ref : StereoSignal - Reference signal, shape (N, 2) typical. - x_hat : StereoSignal - Reconstructed signal, shape (M, 2) typical. - - Returns - ------- - float - SNR in dB. - - Returns +inf if noise power is zero. - - Returns -inf if signal power is zero. - """ - x_ref = np.asarray(x_ref, dtype=np.float64) - x_hat = np.asarray(x_hat, dtype=np.float64) - - # Be conservative: align lengths and common channels. - if x_ref.ndim == 1: - x_ref = x_ref.reshape(-1, 1) - if x_hat.ndim == 1: - x_hat = x_hat.reshape(-1, 1) - - n = min(x_ref.shape[0], x_hat.shape[0]) - c = min(x_ref.shape[1], x_hat.shape[1]) - - x_ref = x_ref[:n, :c] - x_hat = x_hat[:n, :c] - - err = x_ref - x_hat - ps = float(np.sum(x_ref * x_ref)) - pn = float(np.sum(err * err)) - - if pn <= 0.0: - return float("inf") - if ps <= 0.0: - return float("-inf") - - return float(10.0 * np.log10(ps / pn)) - - -@pytest.fixture() -def tmp_stereo_wav(tmp_path: Path) -> Path: - """ - Create a temporary 48 kHz stereo WAV with random samples. - """ - rng = np.random.default_rng(123) - fs = 48000 - - # ~1 second of audio (kept small for test speed). 
- n = fs - x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64) - - wav_path = tmp_path / "in.wav" - sf.write(str(wav_path), x, fs) - return wav_path - - -def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: - """ - Module-level contract test: - Ensure aac_seq_1 follows the expected schema and per-frame shapes. - """ - aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav) - - assert isinstance(aac_seq, list) - assert len(aac_seq) > 0 - - for fr in aac_seq: - assert isinstance(fr, dict) - - # Required keys - assert "frame_type" in fr - assert "win_type" in fr - assert "chl" in fr - assert "chr" in fr - - frame_type = fr["frame_type"] - win_type = fr["win_type"] - - assert frame_type in ("OLS", "LSS", "ESH", "LPS") - assert win_type in ("SIN", "KBD") - - assert isinstance(fr["chl"], dict) - assert isinstance(fr["chr"], dict) - assert "frame_F" in fr["chl"] - assert "frame_F" in fr["chr"] - - chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) - chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) - - if frame_type == "ESH": - assert chl_f.shape == (128, 8) - assert chr_f.shape == (128, 8) - else: - assert chl_f.shape == (1024, 1) - assert chr_f.shape == (1024, 1) - - -def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None: - """ - End-to-end test: - Encode + decode and check SNR is very high (numerical-noise only). - - The threshold is intentionally loose to avoid fragility across platforms/BLAS. 
- """ - x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True) - x_ref = np.asarray(x_ref, dtype=np.float64) - assert int(fs) == 48000 - - out_wav = tmp_path / "out.wav" - - aac_seq = aac_coder_1(tmp_stereo_wav) - x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav) - - # Basic sanity: output file exists and is readable - assert out_wav.exists() - x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True) - assert int(fs_hat) == 48000 - - # SNR against returned array (file should match closely, but we do not require it here). - snr = _snr_db(x_ref, x_hat) - assert snr > 80.0 diff --git a/source/level_1/core/tests/test_filterbank.py b/source/level_1/core/tests/test_filterbank.py deleted file mode 100644 index ad2bd45..0000000 --- a/source/level_1/core/tests/test_filterbank.py +++ /dev/null @@ -1,269 +0,0 @@ -# ------------------------------------------------------------ -# AAC Coder/Decoder - Filterbank Tests -# -# Multimedia course at Aristotle University of -# Thessaloniki (AUTh) -# -# Author: -# Christos Choutouridis (ΑΕΜ 8997) -# cchoutou@ece.auth.gr -# -# Description: -# Tests for Filterbank module. -# ------------------------------------------------------------ -from __future__ import annotations - -from typing import Sequence -import pytest - -from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank -from core.aac_types import * - -# Helper fixtures for filterbank -# ----------------------------------------------------------------------------- - -def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal: - """ - Analyze-synthesize each frame and overlap-add with hop=1024. - - Parameters - ---------- - x : StereoSignal - Input stereo stream, expected shape (N, 2). - frame_types : Sequence[FrameType] - Length K sequence of frame types for frames starting at i*1024. - win_type : WinType - Window type ("SIN" or "KBD"). 
- - Returns - ------- - StereoSignal - Reconstructed stereo stream, same shape as x (N, 2). - """ - hop = 1024 - win = 2048 - K = len(frame_types) - - y: StereoSignal = np.zeros_like(x, dtype=np.float64) - - for i in range(K): - start = i * hop - frame_t: FrameT = x[start:start + win, :] - frame_f: FrameF = aac_filter_bank(frame_t, frame_types[i], win_type) - frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_types[i], win_type) - y[start:start + win, :] += frame_t_hat - - return y - - -def _snr_db(x: StereoSignal, y: StereoSignal) -> float: - """ - Compute SNR in dB over all samples/channels. - """ - err = x - y - ps = float(np.sum(x * x)) - pn = float(np.sum(err * err)) - if pn <= 0.0: - return float("inf") - if ps <= 0.0: - return float("-inf") - return 10.0 * float(np.log10(ps / pn)) - - -# ----------------------------------------------------------------------------- -# Forward filterbank tests -# ----------------------------------------------------------------------------- - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) -def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None: - """ - Contract test: for OLS/LSS/LPS, aac_filter_bank returns shape (1024, 2). - """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - frame_f = aac_filter_bank(frame_t, frame_type, win_type) - assert frame_f.shape == (1024, 2) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_shapes_esh(win_type: WinType) -> None: - """ - Contract test: for ESH, aac_filter_bank returns shape (128, 16). 
- """ - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - frame_f = aac_filter_bank(frame_t, "ESH", win_type) - assert frame_f.shape == (128, 16) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None: - """ - Behavior test: for OLS (representative long-sequence), channels are independent. - If right channel is zero and left is random, right spectrum should be near zero. - """ - rng = np.random.default_rng(0) - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - frame_t[:, 0] = rng.normal(size=2048) - - frame_f = aac_filter_bank(frame_t, "OLS", win_type) - - assert np.max(np.abs(frame_f[:, 1])) < 1e-9 - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_channel_isolation_esh(win_type: WinType) -> None: - """ - Behavior test: for ESH, channels are independent. - If right channel is zero and left is random, all odd columns (right) should be near zero. - """ - rng = np.random.default_rng(1) - frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) - frame_t[:, 0] = rng.normal(size=2048) - - frame_f = aac_filter_bank(frame_t, "ESH", win_type) - - right_cols = frame_f[:, 1::2] # columns 1,3,5,...,15 - assert np.max(np.abs(right_cols)) < 1e-9 - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None: - """ - Spec-driven behavior test: - ESH uses only the central region [448, 1600), split into 8 overlapping - windows of length 256 with 50% overlap. - - Therefore, changing samples outside [448, 1600) must not affect the output. 
- """ - rng = np.random.default_rng(2) - - frame_a: FrameT = np.zeros((2048, 2), dtype=np.float64) - frame_b: FrameT = np.zeros((2048, 2), dtype=np.float64) - - center = rng.normal(size=(1152, 2)) - frame_a[448:1600, :] = center - frame_b[448:1600, :] = center - - frame_b[0:448, :] = rng.normal(size=(448, 2)) - frame_b[1600:2048, :] = rng.normal(size=(448, 2)) - - fa = aac_filter_bank(frame_a, "ESH", win_type) - fb = aac_filter_bank(frame_b, "ESH", win_type) - - # Use a tiny tolerance to avoid flaky failures due to floating-point minutiae. - np.testing.assert_allclose(fa, fb, rtol=0.0, atol=1e-12) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_output_is_finite(win_type: WinType) -> None: - """ - Sanity test: output must not contain NaN or inf for representative cases. - """ - rng = np.random.default_rng(3) - frame_t: FrameT = rng.normal(size=(2048, 2)).astype(np.float64) - - for frame_type in ("OLS", "LSS", "ESH", "LPS"): - frame_f = aac_filter_bank(frame_t, frame_type, win_type) - assert np.isfinite(frame_f).all() - - -# ----------------------------------------------------------------------------- -# Reverse i_filterbank tests -# ----------------------------------------------------------------------------- - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None: - """ - Contract test: for OLS/LSS/LPS, aac_i_filter_bank returns shape (2048, 2). - """ - frame_f: FrameF = np.zeros((1024, 2), dtype=np.float64) - for frame_type in ("OLS", "LSS", "LPS"): - frame_t = aac_i_filter_bank(frame_f, frame_type, win_type) - assert frame_t.shape == (2048, 2) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ifilterbank_shapes_esh(win_type: WinType) -> None: - """ - Contract test: for ESH, aac_i_filter_bank returns shape (2048, 2). 
- """ - frame_f: FrameF = np.zeros((128, 16), dtype=np.float64) - frame_t = aac_i_filter_bank(frame_f, "ESH", win_type) - assert frame_t.shape == (2048, 2) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None: - """ - Sanity test: per-frame analysis+synthesis must produce finite outputs. - """ - rng = np.random.default_rng(0) - frame_t: FrameT = rng.normal(size=(2048, 2)).astype(np.float64) - - for frame_type in ("OLS", "LSS", "ESH", "LPS"): - frame_f = aac_filter_bank(frame_t, frame_type, win_type) - frame_t_hat = aac_i_filter_bank(frame_f, frame_type, win_type) - assert np.isfinite(frame_t_hat).all() - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None: - """ - Module-level test: - OLS analysis+synthesis with hop=1024 must reconstruct with high SNR - in the steady-state region. - """ - rng = np.random.default_rng(1) - - K = 6 - N = 1024 * (K + 1) - x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) - - y = _ola_reconstruct(x, ["OLS"] * K, win_type) - - a = 1024 - b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) - assert snr > 50.0 - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None: - """ - Module-level test: - ESH analysis+synthesis with hop=1024 must reconstruct with high SNR - in the steady-state region. 
- """ - rng = np.random.default_rng(2) - - K = 6 - N = 1024 * (K + 1) - x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) - - y = _ola_reconstruct(x, ["ESH"] * K, win_type) - - a = 1024 - b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) - assert snr > 45.0 - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None: - """ - Transition sequence test matching the windowing logic: - OLS -> LSS -> ESH -> LPS -> OLS -> OLS - """ - rng = np.random.default_rng(3) - - frame_types: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"] - K = len(frame_types) - N = 1024 * (K + 1) - x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) - - y = _ola_reconstruct(x, frame_types, win_type) - - a = 1024 - b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) - assert snr > 40.0 diff --git a/source/level_1/core/tests/test_filterbank_internal.py b/source/level_1/core/tests/test_filterbank_internal.py deleted file mode 100644 index e092ad1..0000000 --- a/source/level_1/core/tests/test_filterbank_internal.py +++ /dev/null @@ -1,117 +0,0 @@ -# ------------------------------------------------------------ -# AAC Coder/Decoder - Filterbank internal (mdct) Tests -# -# Multimedia course at Aristotle University of -# Thessaloniki (AUTh) -# -# Author: -# Christos Choutouridis (ΑΕΜ 8997) -# cchoutou@ece.auth.gr -# -# Description: -# Tests for Filterbank internal MDCT/IMDCT functionality. -# ------------------------------------------------------------ -from __future__ import annotations - -import numpy as np -import pytest - -from core.aac_filterbank import _imdct, _mdct -from core.aac_types import FloatArray, TimeSignal, MdctCoeffs - - -def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None: - """ - Helper for consistent tolerances across tests. 
- """ - np.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - - -def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float: - """ - Estimate scalar gain g such that y ~= g*x in least-squares sense. - """ - denom = float(np.dot(x, x)) - if denom == 0.0: - return 0.0 - return float(np.dot(y, x) / denom) - - -tolerance = 1e-10 - -@pytest.mark.parametrize("N", [256, 2048]) -def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None: - """ - Consistency test in coefficient domain: - mdct(imdct(X)) ~= g * X - - For the chosen (non-orthonormal) scaling, g is expected to be close to 2. - """ - rng = np.random.default_rng(0) - K = N // 2 - - X: MdctCoeffs = rng.normal(size=K).astype(np.float64) - x: TimeSignal = _imdct(X) - X_hat: MdctCoeffs = _mdct(x) - - g = _estimate_gain(X_hat, X) - _assert_allclose(X_hat, g * X, rtol=tolerance, atol=tolerance) - _assert_allclose(np.array([g], dtype=np.float64), np.array([2.0], dtype=np.float64), rtol=tolerance, atol=tolerance) - - -@pytest.mark.parametrize("N", [256, 2048]) -def test_mdct_linearity(N: int) -> None: - """ - Linearity test: - mdct(a*x + b*y) == a*mdct(x) + b*mdct(y) - """ - rng = np.random.default_rng(1) - x: TimeSignal = rng.normal(size=N).astype(np.float64) - y: TimeSignal = rng.normal(size=N).astype(np.float64) - - a = 0.37 - b = -1.12 - - left: MdctCoeffs = _mdct(a * x + b * y) - right: MdctCoeffs = a * _mdct(x) + b * _mdct(y) - - _assert_allclose(left, right, rtol=tolerance, atol=tolerance) - - -@pytest.mark.parametrize("N", [256, 2048]) -def test_imdct_linearity(N: int) -> None: - """ - Linearity test for IMDCT: - imdct(a*X + b*Y) == a*imdct(X) + b*imdct(Y) - """ - rng = np.random.default_rng(2) - K = N // 2 - - X: MdctCoeffs = rng.normal(size=K).astype(np.float64) - Y: MdctCoeffs = rng.normal(size=K).astype(np.float64) - - a = -0.5 - b = 2.0 - - left: TimeSignal = _imdct(a * X + b * Y) - right: TimeSignal = a * _imdct(X) + b * _imdct(Y) - - _assert_allclose(left, right, rtol=tolerance, atol=tolerance) - - 
-@pytest.mark.parametrize("N", [256, 2048]) -def test_mdct_imdct_outputs_are_finite(N: int) -> None: - """ - Sanity test: no NaN/inf on random inputs. - """ - rng = np.random.default_rng(3) - K = N // 2 - - x: TimeSignal = rng.normal(size=N).astype(np.float64) - X: MdctCoeffs = rng.normal(size=K).astype(np.float64) - - X1 = _mdct(x) - x1 = _imdct(X) - - assert np.isfinite(X1).all() - assert np.isfinite(x1).all() diff --git a/source/level_1/level_1.py b/source/level_1/level_1.py index 0e0b4a7..84e72c6 100644 --- a/source/level_1/level_1.py +++ b/source/level_1/level_1.py @@ -22,13 +22,13 @@ from __future__ import annotations from pathlib import Path from typing import Union -import numpy as np import soundfile as sf from core.aac_types import AACSeq1, StereoSignal from core.aac_coder import aac_coder_1 as core_aac_coder_1 from core.aac_coder import aac_read_wav_stereo_48k from core.aac_decoder import aac_decoder_1 as core_aac_decoder_1 +from core.aac_snr_db import snr_db # ----------------------------------------------------------------------------- @@ -80,50 +80,6 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS # Demo (Level 1) # ----------------------------------------------------------------------------- -def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: - """ - Compute overall SNR (dB) over all samples and channels after aligning lengths. - - Parameters - ---------- - x_ref : StereoSignal - Reference stereo stream. - x_hat : StereoSignal - Reconstructed stereo stream. - - Returns - ------- - float - SNR in dB. - - Returns +inf if noise power is zero. - - Returns -inf if signal power is zero. 
- """ - x_ref = np.asarray(x_ref, dtype=np.float64) - x_hat = np.asarray(x_hat, dtype=np.float64) - - if x_ref.ndim == 1: - x_ref = x_ref.reshape(-1, 1) - if x_hat.ndim == 1: - x_hat = x_hat.reshape(-1, 1) - - n = min(x_ref.shape[0], x_hat.shape[0]) - c = min(x_ref.shape[1], x_hat.shape[1]) - - x_ref = x_ref[:n, :c] - x_hat = x_hat[:n, :c] - - err = x_ref - x_hat - ps = float(np.sum(x_ref * x_ref)) - pn = float(np.sum(err * err)) - - if pn <= 0.0: - return float("inf") - if ps <= 0.0: - return float("-inf") - - return float(10.0 * np.log10(ps / pn)) - - def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float: """ Demonstration for the Level-1 codec. @@ -158,12 +114,11 @@ def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> x_hat = aac_decoder_1(aac_seq_1, filename_out) # Optional sanity: ensure output file exists and is readable - x_hat_file, fs_hat = sf.read(str(filename_out), always_2d=True) - _ = x_hat_file + _, fs_hat = sf.read(str(filename_out), always_2d=True) if int(fs_hat) != 48000: raise ValueError("Decoded output sampling rate must be 48 kHz.") - return _snr_db(x_ref, x_hat) + return snr_db(x_ref, x_hat) # ----------------------------------------------------------------------------- @@ -172,11 +127,14 @@ def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> if __name__ == "__main__": # Example: - # python -m level_1.level_1 input.wav output.wav + # cd level_1 + # python -m level_1 input.wav output.wav + # or + # python -m level_1 material/LicorDeCalandraca.wav LicorDeCalandraca_out.wav import sys if len(sys.argv) != 3: - raise SystemExit("Usage: python -m level_1.level_1 ") + raise SystemExit("Usage: python -m level_1 ") in_wav = Path(sys.argv[1]) out_wav = Path(sys.argv[2]) diff --git a/source/level_1/material/LicorDeCalandraca.wav b/source/level_1/material/LicorDeCalandraca.wav new file mode 100644 index 0000000..a527e8c Binary files /dev/null and 
b/source/level_1/material/LicorDeCalandraca.wav differ diff --git a/source/material/LicorDeCalandraca_out.wav b/source/level_1/material/LicorDeCalandraca_out1.wav similarity index 100% rename from source/material/LicorDeCalandraca_out.wav rename to source/level_1/material/LicorDeCalandraca_out1.wav diff --git a/source/level_1/material/TableB219.mat b/source/level_1/material/TableB219.mat new file mode 100644 index 0000000..7b2e403 Binary files /dev/null and b/source/level_1/material/TableB219.mat differ diff --git a/source/level_1/material/huffCodebooks.mat b/source/level_1/material/huffCodebooks.mat new file mode 100644 index 0000000..a208b3d Binary files /dev/null and b/source/level_1/material/huffCodebooks.mat differ diff --git a/source/level_1/material/huff_utils.py b/source/level_1/material/huff_utils.py new file mode 100644 index 0000000..a8c7fe7 --- /dev/null +++ b/source/level_1/material/huff_utils.py @@ -0,0 +1,400 @@ +import numpy as np +import scipy.io as sio +import os + +# ------------------ LOAD LUT ------------------ + +def load_LUT(mat_filename=None): + """ + Loads the list of Huffman Codebooks (LUTs) + + Returns: + huffLUT : list (index 1..11 used, index 0 unused) + """ + if mat_filename is None: + current_dir = os.path.dirname(os.path.abspath(__file__)) + mat_filename = os.path.join(current_dir, "huffCodebooks.mat") + + mat = sio.loadmat(mat_filename) + + + huffCodebooks_raw = mat['huffCodebooks'].squeeze() + + huffCodebooks = [] + for i in range(11): + huffCodebooks.append(np.array(huffCodebooks_raw[i])) + + # Build inverse VLC tables + invTable = [None] * 11 + + for i in range(11): + h = huffCodebooks[i][:, 2].astype(int) # column 3 + hlength = huffCodebooks[i][:, 1].astype(int) # column 2 + + hbin = [] + for j in range(len(h)): + hbin.append(format(h[j], f'0{hlength[j]}b')) + + invTable[i] = vlc_table(hbin) + + # Build Huffman LUT dicts + huffLUT = [None] * 12 # index 0 unused + params = [ + (4, 1, True), + (4, 1, True), + (4, 2, False), + (4, 
2, False), + (2, 4, True), + (2, 4, True), + (2, 7, False), + (2, 7, False), + (2, 12, False), + (2, 12, False), + (2, 16, False), + ] + + for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1): + huffLUT[i] = { + 'LUT': huffCodebooks[i-1], + 'invTable': invTable[i-1], + 'codebook': i, + 'nTupleSize': nTupleSize, + 'maxAbsCodeVal': maxAbs, + 'signedValues': signed + } + + return huffLUT + +def vlc_table(code_array): + """ + codeArray: list of strings, each string is a Huffman codeword (e.g. '0101') + returns: + h : NumPy array of shape (num_nodes, 3) + columns: + [ next_if_0 , next_if_1 , symbol_index ] + """ + h = np.zeros((1, 3), dtype=int) + + for code_index, code in enumerate(code_array, start=1): + word = [int(bit) for bit in code] + h_index = 0 + + for bit in word: + k = bit + next_node = h[h_index, k] + if next_node == 0: + h = np.vstack([h, [0, 0, 0]]) + new_index = h.shape[0] - 1 + h[h_index, k] = new_index + h_index = new_index + else: + h_index = next_node + + h[h_index, 2] = code_index + + return h + +# ------------------ ENCODE ------------------ + +def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None): + """ + Huffman-encode a sequence of quantized coefficients. + + This function selects the appropriate Huffman codebook based on the + maximum absolute value of the input coefficients, encodes the coefficients + into a binary Huffman bitstream, and returns both the bitstream and the + selected codebook index. + + This is the Python equivalent of the MATLAB `encodeHuff.m` function used + in audio/image coding (e.g., scale factor band encoding). The input + coefficient sequence is grouped into fixed-size tuples as defined by + the chosen Huffman LUT. Zero-padding may be applied internally. + + Parameters + ---------- + coeff_sec : array_like of int + 1-D array of quantized integer coefficients to encode. + Typically corresponds to a "section" or scale-factor band. 
+ + huff_LUT_list : list + List of Huffman lookup-table dictionaries as returned by `loadLUT()`. + Index 1..11 correspond to valid Huffman codebooks. + Index 0 is unused. + + Returns + ------- + huffSec : str + Huffman-encoded bitstream represented as a string of '0' and '1' + characters. + + huffCodebook : int + Index (1..11) of the Huffman codebook used for encoding. + A value of 0 indicates a special all-zero section. + """ + if force_codebook is not None: + return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec) + + maxAbsVal = np.max(np.abs(coeff_sec)) + + if maxAbsVal == 0: + huffCodebook = 0 + huffSec = huff_LUT_code_0() + + elif maxAbsVal == 1: + candidates = [1, 2] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal == 2: + candidates = [3, 4] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (3, 4): + candidates = [5, 6] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (5, 6, 7): + candidates = [7, 8] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = 
candidates[1] + + elif maxAbsVal in (8, 9, 10, 11, 12): + candidates = [9, 10] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (13, 14, 15): + huffCodebook = 11 + huffSec = huff_LUT_code_1(huff_LUT_list[huffCodebook], coeff_sec) + + else: + huffCodebook = 11 + huffSec = huff_LUT_code_ESC(huff_LUT_list[huffCodebook], coeff_sec) + + return huffSec, huffCodebook + +def huff_LUT_code_1(huff_LUT, coeff_sec): + LUT = huff_LUT['LUT'] + nTupleSize = huff_LUT['nTupleSize'] + maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] + signedValues = huff_LUT['signedValues'] + + numTuples = int(np.ceil(len(coeff_sec) / nTupleSize)) + + if signedValues: + coeff = coeff_sec + maxAbsCodeVal + base = 2 * maxAbsCodeVal + 1 + else: + coeff = coeff_sec + base = maxAbsCodeVal + 1 + + coeffPad = np.zeros(numTuples * nTupleSize, dtype=int) + coeffPad[:len(coeff)] = coeff + + huffSec = [] + + powers = base ** np.arange(nTupleSize - 1, -1, -1) + + for i in range(numTuples): + nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize] + huffIndex = int(np.abs(nTuple) @ powers) + + hexVal = LUT[huffIndex, 2] + huffLen = LUT[huffIndex, 1] + + bits = format(int(hexVal), f'0{int(huffLen)}b') + + if signedValues: + huffSec.append(bits) + else: + signBits = ''.join('1' if v < 0 else '0' for v in nTuple) + huffSec.append(bits + signBits) + + return ''.join(huffSec) + +def huff_LUT_code_0(): + return '' + +def huff_LUT_code_ESC(huff_LUT, coeff_sec): + LUT = huff_LUT['LUT'] + nTupleSize = huff_LUT['nTupleSize'] + maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] + + numTuples = int(np.ceil(len(coeff_sec) / nTupleSize)) + base = maxAbsCodeVal + 1 + + coeffPad = np.zeros(numTuples * nTupleSize, dtype=int) + coeffPad[:len(coeff_sec)] = coeff_sec + + huffSec = [] + powers = 
base ** np.arange(nTupleSize - 1, -1, -1) + + for i in range(numTuples): + nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize] + + lnTuple = nTuple.astype(float) + lnTuple[lnTuple == 0] = np.finfo(float).eps + + N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int)) + N = np.maximum(0, N4 - 4) + esc = np.abs(nTuple) > 15 + + nTupleESC = nTuple.copy() + nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16 + + huffIndex = int(np.abs(nTupleESC) @ powers) + + hexVal = LUT[huffIndex, 2] + huffLen = LUT[huffIndex, 1] + + bits = format(int(hexVal), f'0{int(huffLen)}b') + + escSeq = '' + for k in range(nTupleSize): + if esc[k]: + escSeq += '1' * N[k] + escSeq += '0' + escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b') + + signBits = ''.join('1' if v < 0 else '0' for v in nTuple) + huffSec.append(bits + signBits + escSeq) + + return ''.join(huffSec) + +# ------------------ DECODE ------------------ + +def decode_huff(huff_sec, huff_LUT): + """ + Decode a Huffman-encoded stream. + + Parameters + ---------- + huff_sec : array-like of int or str + Huffman encoded stream as a sequence of 0 and 1 (string or list/array). + huff_LUT : dict + Huffman lookup table with keys: + - 'invTable': inverse table (numpy array) + - 'codebook': codebook number + - 'nTupleSize': tuple size + - 'maxAbsCodeVal': maximum absolute code value + - 'signedValues': True/False + + Returns + ------- + decCoeffs : list of int + Decoded quantized coefficients. 
+ """ + + h = huff_LUT['invTable'] + huffCodebook = huff_LUT['codebook'] + nTupleSize = huff_LUT['nTupleSize'] + maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] + signedValues = huff_LUT['signedValues'] + + # Convert string to array of ints + if isinstance(huff_sec, str): + huff_sec = np.array([int(b) for b in huff_sec]) + + eos = False + decCoeffs = [] + streamIndex = 0 + + while not eos: + wordbit = 0 + r = 0 # start at root + + # Decode Huffman word using inverse table + while True: + b = huff_sec[streamIndex + wordbit] + wordbit += 1 + rOld = r + r = h[rOld, b] + if h[r, 0] == 0 and h[r, 1] == 0: + symbolIndex = h[r, 2] - 1 # zero-based + streamIndex += wordbit + break + + # Decode n-tuple magnitudes + if signedValues: + base = 2 * maxAbsCodeVal + 1 + nTupleDec = [] + tmp = symbolIndex + for p in reversed(range(nTupleSize)): + val = tmp // (base ** p) + nTupleDec.append(val - maxAbsCodeVal) + tmp = tmp % (base ** p) + nTupleDec = np.array(nTupleDec) + else: + base = maxAbsCodeVal + 1 + nTupleDec = [] + tmp = symbolIndex + for p in reversed(range(nTupleSize)): + val = tmp // (base ** p) + nTupleDec.append(val) + tmp = tmp % (base ** p) + nTupleDec = np.array(nTupleDec) + + # Apply sign bits + nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize] + nTupleSign = -(np.sign(nTupleSignBits - 0.5)) + streamIndex += nTupleSize + nTupleDec = nTupleDec * nTupleSign + + # Handle escape sequences + escIndex = np.where(np.abs(nTupleDec) == 16)[0] + if huffCodebook == 11 and escIndex.size > 0: + for idx in escIndex: + N = 0 + b = huff_sec[streamIndex] + while b: + N += 1 + b = huff_sec[streamIndex + N] + streamIndex += N + N4 = N + 4 + escape_word = huff_sec[streamIndex:streamIndex + N4] + escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2) + nTupleDec[idx] = escape_value + streamIndex += N4 + 1 + # Apply signs again + nTupleDec[escIndex] *= nTupleSign[escIndex] + + decCoeffs.extend(nTupleDec.tolist()) + + if streamIndex >= len(huff_sec): + eos = True + + 
return decCoeffs + + diff --git a/source/level_2/core/aac_coder.py b/source/level_2/core/aac_coder.py new file mode 100644 index 0000000..0b6b939 --- /dev/null +++ b/source/level_2/core/aac_coder.py @@ -0,0 +1,284 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - AAC Coder (Core) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 1 AAC encoder orchestration. +# Keeps the same functional behavior as the original level_1 implementation: +# - Reads WAV via soundfile +# - Validates stereo and 48 kHz +# - Frames into 2048 samples with hop=1024 and zero padding at both ends +# - SSC decision uses next-frame attack detection +# - Filterbank analysis (MDCT) +# - Stores per-channel spectra in AACSeq1 schema: +# * ESH: (128, 8) +# * else: (1024, 1) +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Union + +import soundfile as sf + +from core.aac_configuration import WIN_TYPE +from core.aac_filterbank import aac_filter_bank +from core.aac_ssc import aac_SSC +from core.aac_tns import aac_tns +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Public helpers (useful for level_x demo wrappers) +# ----------------------------------------------------------------------------- + +def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]: + """ + Read a WAV file using soundfile and validate the Level-1 assumptions. + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename. + + Returns + ------- + x : StereoSignal (np.ndarray) + Stereo samples as float64, shape (N, 2). + fs : int + Sampling rate (Hz). Must be 48000. 
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split the stereo FrameF produced by aac_filter_bank() into the per-channel
    arrays required by the Level-1 AACSeq1 schema.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        Output of aac_filter_bank():
        - ESH: shape (128, 16), interleaved as [L0 R0 L1 R1 ... L7 R7]
        - otherwise: shape (1024, 2)

    Returns
    -------
    chl_f, chr_f : FrameChannelF
        Left and right channel coefficients:
        - ESH: shape (128, 8) each
        - otherwise: shape (1024, 1) each

    Raises
    ------
    ValueError
        If frame_f does not have the shape implied by frame_type.
    """
    if frame_type == "ESH":
        if frame_f.shape != (128, 16):
            raise ValueError("For ESH, frame_f must have shape (128, 16).")
        # Even columns carry the left channel, odd columns the right channel.
        return (
            np.ascontiguousarray(frame_f[:, 0::2], dtype=np.float64),
            np.ascontiguousarray(frame_f[:, 1::2], dtype=np.float64),
        )

    # Non-ESH frames keep the (1024, 1) per-channel layout of the Level-1 schema.
    if frame_f.shape != (1024, 2):
        raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")
    return (
        frame_f[:, 0:1].astype(np.float64, copy=False),
        frame_f[:, 1:2].astype(np.float64, copy=False),
    )
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
    """
    Level-2 AAC encoder (Level 1 analysis + TNS).

    Pipeline per frame:
    - frame with win=2048, hop=1024 over the zero-padded input
    - SSC decision using next-frame lookahead
    - MDCT filterbank analysis (stereo container)
    - unpack to per-channel spectra (shared Level-1 helper)
    - TNS applied independently per channel

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).

    Returns
    -------
    AACSeq2
        Encoded AAC sequence (Level 2 payload schema). For each frame i:
        - "frame_type": FrameType
        - "win_type": WinType
        - "chl"/"chr":
            - "frame_F": FrameChannelF (after TNS)
            - "tns_coeffs": TnsCoeffs

    Raises
    ------
    ValueError
        If the input is not stereo/48 kHz, or too short to frame.
    """
    x, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment assumes 48 kHz.

    hop = 1024
    win = 2048

    # Zero-pad hop samples at both ends (same framing as Level 1).
    pad = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad, x, pad])

    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq2 = []
    prev_frame_type: FrameType = "OLS"

    for i in range(K):
        start = i * hop

        frame_t: FrameT = x_pad[start : start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Lookahead frame for attack detection; zero-pad the tail if needed.
        next_t = x_pad[start + hop : start + hop + win, :]
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_SSC(frame_t, next_t, prev_frame_type)

        # Level 1 analysis (packed stereo container), then reuse the shared
        # Level-1 unpack helper instead of duplicating its logic here.
        # This also restores the shape validation the helper performs.
        frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)

        # Level 2: apply TNS per channel.
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)

        aac_seq.append(
            {
                "frame_type": frame_type,
                "win_type": WIN_TYPE,
                "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
                "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
            }
        )

        prev_frame_type = frame_type

    return aac_seq
0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7] \ No newline at end of file diff --git a/source/level_2/core/aac_decoder.py b/source/level_2/core/aac_decoder.py new file mode 100644 index 0000000..606ce78 --- /dev/null +++ b/source/level_2/core/aac_decoder.py @@ -0,0 +1,257 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Inverse AAC Coder (Core) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 1 AAC decoder orchestration (inverse of aac_coder_1()). +# Keeps the same functional behavior as the original level_1 implementation: +# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank() +# - IMDCT synthesis per frame +# - Overlap-add with hop=1024 +# - Remove encoder boundary padding: hop at start and hop at end +# +# Note: +# This core module returns the reconstructed samples. Writing to disk is kept +# in level_x demos. +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Union + +import soundfile as sf + +from core.aac_filterbank import aac_i_filter_bank +from core.aac_tns import aac_i_tns +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Public helpers (useful for level_x demo wrappers) +# ----------------------------------------------------------------------------- + +def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: + """ + Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo + FrameF container expected by aac_i_filter_bank(). + + Parameters + ---------- + frame_type : FrameType + "OLS" | "LSS" | "ESH" | "LPS". 
+ chl_f : FrameChannelF + Left channel coefficients: + - ESH: (128, 8) + - else: (1024, 1) + chr_f : FrameChannelF + Right channel coefficients: + - ESH: (128, 8) + - else: (1024, 1) + + Returns + ------- + FrameF + Stereo coefficients: + - ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7] + - else: (1024, 2) + """ + if frame_type == "ESH": + if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): + raise ValueError("ESH channel frame_F must have shape (128, 8).") + + frame_f = np.empty((128, 16), dtype=np.float64) + for j in range(8): + frame_f[:, 2 * j + 0] = chl_f[:, j] + frame_f[:, 2 * j + 1] = chr_f[:, j] + return frame_f + + # Non-ESH: expected (1024, 1) per channel in Level-1 schema. + if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1): + raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).") + + frame_f = np.empty((1024, 2), dtype=np.float64) + frame_f[:, 0] = chl_f[:, 0] + frame_f[:, 1] = chr_f[:, 0] + return frame_f + + +def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal: + """ + Remove the boundary padding that the Level-1 encoder adds: + hop samples at start and hop samples at end. + + Parameters + ---------- + y_pad : StereoSignal (np.ndarray) + Reconstructed padded stream, shape (N_pad, 2). + hop : int + Hop size in samples (default 1024). + + Returns + ------- + StereoSignal (np.ndarray) + Unpadded reconstructed stream, shape (N_pad - 2*hop, 2). + + Raises + ------ + ValueError + If y_pad is too short to unpad. + """ + if y_pad.shape[0] < 2 * hop: + raise ValueError("Decoded stream too short to unpad.") + return y_pad[hop:-hop, :] + + +# ----------------------------------------------------------------------------- +# Level 1 decoder (core) +# ----------------------------------------------------------------------------- + +def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: + """ + Level-1 AAC decoder (inverse of aac_coder_1()). 
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-2 AAC decoder (inverse of aac_coder_2).

    Same pipeline as the Level-1 decoder with an extra inverse-TNS stage:
    - per frame/channel: inverse TNS using the stored coefficients
    - re-pack to the stereo frame_F container
    - IMDCT + windowing
    - overlap-add across frames (hop = 1024)
    - remove the Level-1 padding (hop samples at start/end)
    - write the output WAV (48 kHz)

    Parameters
    ----------
    aac_seq_2 : AACSeq2
        Encoded sequence as produced by aac_coder_2().
    filename_out : Union[str, Path]
        Output WAV filename.

    Returns
    -------
    StereoSignal
        Decoded stereo samples, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If the sequence is empty or channel spectra have unexpected shapes.
    """
    filename_out = Path(filename_out)

    hop = 1024
    win = 2048
    n_frames = len(aac_seq_2)
    if n_frames <= 0:
        raise ValueError("aac_seq_2 must contain at least one frame.")

    # Full padded stream: last frame starts at (K-1)*hop and spans win samples.
    y_pad = np.zeros(((n_frames - 1) * hop + win, 2), dtype=np.float64)

    for idx, frame in enumerate(aac_seq_2):
        ftype: FrameType = frame["frame_type"]
        wtype: WinType = frame["win_type"]

        left_tns = np.asarray(frame["chl"]["frame_F"], dtype=np.float64)
        right_tns = np.asarray(frame["chr"]["frame_F"], dtype=np.float64)
        left_coeffs = np.asarray(frame["chl"]["tns_coeffs"], dtype=np.float64)
        right_coeffs = np.asarray(frame["chr"]["tns_coeffs"], dtype=np.float64)

        # Undo the TNS filtering before synthesis.
        left = aac_i_tns(left_tns, ftype, left_coeffs)
        right = aac_i_tns(right_tns, ftype, right_coeffs)

        # Re-pack per-channel spectra into the stereo container expected
        # by aac_i_filter_bank().
        if ftype == "ESH":
            if left.shape != (128, 8) or right.shape != (128, 8):
                raise ValueError("ESH channel frame_F must have shape (128, 8).")
            frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
            frame_f[:, 0::2] = left
            frame_f[:, 1::2] = right
        else:
            # Accept either (1024,) or (1024, 1) from the internal convention.
            if left.shape == (1024,):
                left = left.reshape(1024, 1)
            elif left.shape != (1024, 1):
                raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")

            if right.shape == (1024,):
                right = right.reshape(1024, 1)
            elif right.shape != (1024, 1):
                raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")

            frame_f = np.empty((1024, 2), dtype=np.float64)
            frame_f[:, 0] = left[:, 0]
            frame_f[:, 1] = right[:, 0]

        # Synthesize and overlap-add this frame's contribution.
        frame_t: FrameT = aac_i_filter_bank(frame_f, ftype, wtype)
        offset = idx * hop
        y_pad[offset : offset + win, :] += frame_t

    y = aac_remove_padding(y_pad, hop=hop)

    # Level-2 keeps the Level-1 assumption of 48 kHz output.
    sf.write(str(filename_out), y, 48000)
    return y
+ """ + n = np.arange(N, dtype=np.float64) + return np.sin((np.pi / N) * (n + 0.5)) + + +def _kbd_window(N: int, alpha: float) -> Window: + """ + Build a Kaiser-Bessel-Derived (KBD) window of length N. + + This follows the standard KBD construction used in AAC: + 1) Build a Kaiser kernel of length (N/2 + 1). + 2) Form the left half by cumulative summation, normalization, and sqrt. + 3) Mirror the left half to form the right half (symmetric full-length window). + + Notes + ----- + - N must be even (AAC uses N=2048 for long and N=256 for short). + - The assignment specifies alpha=6 for long windows and alpha=4 for short windows. + - The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context. + + Parameters + ---------- + N : int + Window length in samples (must be even). + alpha : float + KBD alpha parameter. + + Returns + ------- + Window + 1-D array of shape (N,) with dtype float64. + """ + half = N // 2 + + # Kaiser kernel length: half + 1 samples (0 .. half) + # beta = pi * alpha per the usual correspondence with the ISO definition + kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64) + + csum = np.cumsum(kernel) + denom = csum[-1] + + w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1 + w_right = w_left[::-1] # mirror for second half + + return np.concatenate([w_left, w_right]) + + +def _long_window(win_type: WinType) -> Window: + """ + Return the long AAC window (length 2048) for the selected window family. + + Parameters + ---------- + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (2048,) with dtype float64. + """ + if win_type == "SIN": + return _sin_window(2048) + if win_type == "KBD": + # Assignment-specific alpha values + return _kbd_window(2048, alpha=6.0) + raise ValueError(f"Invalid win_type: {win_type!r}") + + +def _short_window(win_type: WinType) -> Window: + """ + Return the short AAC window (length 256) for the selected window family. 
+ + Parameters + ---------- + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (256,) with dtype float64. + """ + if win_type == "SIN": + return _sin_window(256) + if win_type == "KBD": + # Assignment-specific alpha values + return _kbd_window(256, alpha=4.0) + raise ValueError(f"Invalid win_type: {win_type!r}") + + +def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window: + """ + Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS. + + In this assignment we assume a single window family is used globally + (no mixed KBD/SIN halves). Therefore, both the long and short windows + are drawn from the same family. + + For frame_type: + - "OLS": return the long window Wl (2048). + - "LSS": construct [Wl_left(1024), ones(448), Ws_right(128), zeros(448)]. + - "LPS": construct [zeros(448), Ws_left(128), ones(448), Wl_right(1024)]. + + Parameters + ---------- + frame_type : FrameType + One of "OLS", "LSS", "LPS". + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (2048,) with dtype float64. 
+ """ + wL = _long_window(win_type) # length 2048 + wS = _short_window(win_type) # length 256 + + if frame_type == "OLS": + return wL + + if frame_type == "LSS": + # 0..1023: left half of long window + # 1024..1471: ones (448 samples) + # 1472..1599: right half of short window (128 samples) + # 1600..2047: zeros (448 samples) + out = np.zeros(2048, dtype=np.float64) + out[0:1024] = wL[0:1024] + out[1024:1472] = 1.0 + out[1472:1600] = wS[128:256] + out[1600:2048] = 0.0 + return out + + if frame_type == "LPS": + # 0..447: zeros (448) + # 448..575: left half of short window (128) + # 576..1023: ones (448) + # 1024..2047: right half of long window (1024) + out = np.zeros(2048, dtype=np.float64) + out[0:448] = 0.0 + out[448:576] = wS[0:128] + out[576:1024] = 1.0 + out[1024:2048] = wL[1024:2048] + return out + + raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}") + + +def _mdct(s: TimeSignal) -> MdctCoeffs: + """ + MDCT (direct form) as specified in the assignment. + + Parameters + ---------- + s : TimeSignal + Windowed time samples, 1-D array of length N (N = 2048 or 256). + + Returns + ------- + MdctCoeffs + MDCT coefficients, 1-D array of length N/2. + + Definition + ---------- + X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)), + where n0 = (N/2 + 1)/2. + """ + s = np.asarray(s, dtype=np.float64).reshape(-1) + N = int(s.shape[0]) + if N not in (2048, 256): + raise ValueError("MDCT input length must be 2048 or 256.") + + n0 = (N / 2.0 + 1.0) / 2.0 + n = np.arange(N, dtype=np.float64) + n0 + k = np.arange(N // 2, dtype=np.float64) + 0.5 + + C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2) + X = 2.0 * (s @ C) # (N/2,) + return X + + +def _imdct(X: MdctCoeffs) -> TimeSignal: + """ + IMDCT (direct form) as specified in the assignment. + + Parameters + ---------- + X : MdctCoeffs + MDCT coefficients, 1-D array of length K (K = 1024 or 128). 
+ + Returns + ------- + TimeSignal + Reconstructed time samples, 1-D array of length N = 2K. + + Definition + ---------- + s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)), + where n0 = (N/2 + 1)/2. + """ + X = np.asarray(X, dtype=np.float64).reshape(-1) + K = int(X.shape[0]) + if K not in (1024, 128): + raise ValueError("IMDCT input length must be 1024 or 128.") + + N = 2 * K + n0 = (N / 2.0 + 1.0) / 2.0 + + n = np.arange(N, dtype=np.float64) + n0 + k = np.arange(K, dtype=np.float64) + 0.5 + + C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K) + s = (2.0 / N) * (C @ X) # (N,) + return s + + +def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF: + """ + ESH analysis for one channel. + + Parameters + ---------- + x_ch : FrameChannelT + Time-domain channel frame (expected shape: (2048,)). + win_type : WinType + Window family ("KBD" or "SIN"). + + Returns + ------- + FrameChannelF + Array of shape (128, 8). Column j contains the 128 MDCT coefficients + of the j-th short window. + """ + wS = _short_window(win_type) # (256,) + X_esh = np.empty((128, 8), dtype=np.float64) + + # ESH subwindows are taken from the central region: + # start positions: 448 + 128*j, j = 0..7 + for j in range(8): + start = 448 + 128 * j + seg = x_ch[start:start + 256] * wS # (256,) + X_esh[:, j] = _mdct(seg) # (128,) + + return X_esh + + +def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]: + """ + Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8). + + Parameters + ---------- + frame_F : FrameF + Packed ESH spectrum (expected shape: (128, 16)). + + Returns + ------- + left : FrameChannelF + Left channel spectrum, shape (128, 8). + right : FrameChannelF + Right channel spectrum, shape (128, 8). 
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis).

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    frame_F : FrameF
        MDCT coefficients:
        - OLS/LSS/LPS: shape (1024, 2), one column per channel.
        - ESH: shape (128, 16), subframes interleaved as [L0 R0 ... L7 R7].

    Raises
    ------
    ValueError
        If frame_T has the wrong shape or frame_type is invalid.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    left: FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
    right: FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type in ("OLS", "LSS", "LPS"):
        # Single long (or transition) window over the whole frame.
        w = _window_sequence(frame_type, win_type)
        return np.column_stack([_mdct(left * w), _mdct(right * w)])

    if frame_type == "ESH":
        # Interleave the eight short-block spectra channel by channel.
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = _filter_bank_esh_channel(left, win_type)
        packed[:, 1::2] = _filter_bank_esh_channel(right, win_type)
        return packed

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
+ """ + if frame_type in ("OLS", "LSS", "LPS"): + if frame_F.shape != (1024, 2): + raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).") + + w = _window_sequence(frame_type, win_type) + + xL = _imdct(frame_F[:, 0]) * w + xR = _imdct(frame_F[:, 1]) * w + + out = np.empty((2048, 2), dtype=np.float64) + out[:, 0] = xL + out[:, 1] = xR + return out + + if frame_type == "ESH": + if frame_F.shape != (128, 16): + raise ValueError("For ESH, frame_F must have shape (128, 16).") + + Xl, Xr = _unpack_esh(frame_F) + xL = _i_filter_bank_esh_channel(Xl, win_type) + xR = _i_filter_bank_esh_channel(Xr, win_type) + + out = np.empty((2048, 2), dtype=np.float64) + out[:, 0] = xL + out[:, 1] = xR + return out + + raise ValueError(f"Invalid frame_type: {frame_type!r}") diff --git a/source/level_2/core/aac_snr_db.py b/source/level_2/core/aac_snr_db.py new file mode 100644 index 0000000..21e5184 --- /dev/null +++ b/source/level_2/core/aac_snr_db.py @@ -0,0 +1,60 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - SNR dB calculator +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module implements SNR calculation in dB +# ------------------------------------------------------------ +from __future__ import annotations + +from core.aac_types import StereoSignal +import numpy as np + +def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Compute overall SNR (dB) over all samples and channels after aligning lengths. + + Parameters + ---------- + x_ref : StereoSignal + Reference stereo stream. + x_hat : StereoSignal + Reconstructed stereo stream. + + Returns + ------- + float + SNR in dB. + - Returns +inf if noise power is zero. + - Returns -inf if signal power is zero. 
+ """ + x_ref = np.asarray(x_ref, dtype=np.float64) + x_hat = np.asarray(x_hat, dtype=np.float64) + + if x_ref.ndim == 1: + x_ref = x_ref.reshape(-1, 1) + if x_hat.ndim == 1: + x_hat = x_hat.reshape(-1, 1) + + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) + + x_ref = x_ref[:n, :c] + x_hat = x_hat[:n, :c] + + err = x_ref - x_hat + ps = float(np.sum(x_ref * x_ref)) + pn = float(np.sum(err * err)) + + if pn <= 0.0: + return float("inf") + if ps <= 0.0: + return float("-inf") + + return float(10.0 * np.log10(ps / pn)) \ No newline at end of file diff --git a/source/level_2/core/aac_ssc.py b/source/level_2/core/aac_ssc.py new file mode 100644 index 0000000..926c854 --- /dev/null +++ b/source/level_2/core/aac_ssc.py @@ -0,0 +1,217 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Sequence Segmentation Control module +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Sequence Segmentation Control module (SSC). +# Selects and returns the frame type based on input parameters. 
+# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Dict, Tuple +from core.aac_types import FrameType, FrameT, FrameChannelT + +import numpy as np + +# ----------------------------------------------------------------------------- +# Private helpers for SSC +# ----------------------------------------------------------------------------- + +# See Table 1 in mm-2025-hw-v0.1.pdf +STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = { + ("OLS", "OLS"): "OLS", + ("OLS", "LSS"): "LSS", + ("OLS", "ESH"): "ESH", + ("OLS", "LPS"): "LPS", + ("LSS", "OLS"): "LSS", + ("LSS", "LSS"): "LSS", + ("LSS", "ESH"): "ESH", + ("LSS", "LPS"): "ESH", + ("ESH", "OLS"): "ESH", + ("ESH", "LSS"): "ESH", + ("ESH", "ESH"): "ESH", + ("ESH", "LPS"): "ESH", + ("LPS", "OLS"): "LPS", + ("LPS", "LSS"): "ESH", + ("LPS", "ESH"): "ESH", + ("LPS", "LPS"): "LPS", +} + + +def _detect_attack(next_frame_channel: FrameChannelT) -> bool: + """ + Detect whether the *next* frame (single channel) implies an attack, i.e. ESH + according to the assignment's criterion. + + Parameters + ---------- + next_frame_channel : FrameChannelT + One channel of next_frame_T (expected shape: (2048,)). + + Returns + ------- + bool + True if an attack is detected (=> next frame predicted ESH), else False. + + Notes + ----- + The criterion is implemented as described in the spec: + + 1) Apply the high-pass filter: + H(z) = (1 - z^-1) / (1 - 0.5 z^-1) + implemented in the time domain as: + y[n] = x[n] - x[n-1] + 0.5*y[n-1] + + 2) Split y into 16 segments of length 128 and compute segment energies s[l]. + + 3) Compute the ratio: + ds[l] = s[l] / s[l-1] + + 4) An attack exists if there exists l in {1..7} such that: + s[l] > 1e-3 and ds[l] > 10 + """ + # Local alias; expected to be a 1-D array of length 2048. + x = next_frame_channel + + # High-pass filter reference implementation (scalar recurrence). 
+ y = np.zeros_like(x) + prev_x = 0.0 + prev_y = 0.0 + for n in range(x.shape[0]): + xn = float(x[n]) + yn = (xn - prev_x) + 0.5 * prev_y + y[n] = yn + prev_x = xn + prev_y = yn + + # Segment energies over 16 blocks of 128 samples. + s = np.empty(16, dtype=np.float64) + for l in range(16): + a = l * 128 + b = (l + 1) * 128 + seg = y[a:b] + s[l] = float(np.sum(seg * seg)) + + # ds[l] for l>=1. For l=0 not defined, keep 0. + ds = np.zeros(16, dtype=np.float64) + eps = 1e-12 # Avoid division by zero without materially changing the logic. + for l in range(1, 16): + ds[l] = s[l] / max(s[l - 1], eps) + + # Spec: check l in {1..7}. + for l in range(1, 8): + if (s[l] > 1e-3) and (ds[l] > 10.0): + return True + + return False + + +def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType: + """ + Decide the current frame type for a single channel based on the previous + frame type and whether the next frame is predicted to be ESH. + + Rules (spec): + + - If prev is "LSS" => current is "ESH" + - If prev is "LPS" => current is "OLS" + - If prev is "OLS" => current is "LSS" if attack else "OLS" + - If prev is "ESH" => current is "ESH" if attack else "LPS" + + Parameters + ---------- + prev_frame_type : FrameType + Previous frame type (one of "OLS", "LSS", "ESH", "LPS"). + attack : bool + True if the next frame is predicted ESH for this channel. + + Returns + ------- + FrameType + The per-channel decision for the current frame. + + """ + if prev_frame_type == "LSS": + return "ESH" + if prev_frame_type == "LPS": + return "OLS" + if prev_frame_type == "OLS": + return "LSS" if attack else "OLS" + if prev_frame_type == "ESH": + return "ESH" if attack else "LPS" + + raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}") + + +def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType: + """ + Merge per-channel frame type decisions into one common frame type using + the stereo merge table from the spec. 
+ + Parameters + ---------- + ft_l : FrameType + Frame type decision for the left channel. + ft_r : FrameType + Frame type decision for the right channel. + + Returns + ------- + FrameType + The merged common frame type. + """ + try: + return STEREO_MERGE_TABLE[(ft_l, ft_r)] + except KeyError as e: + raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}") from e + + +# ----------------------------------------------------------------------------- +# Public Function prototypes (Level 1) +# ----------------------------------------------------------------------------- + +def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: + """ + Sequence Segmentation Control (SSC). + + Select and return the frame type for the current frame (i) based on: + - the current time-domain frame (stereo), + - the next time-domain frame (stereo), used for attack detection, + - the previous frame type. + + Parameters + ---------- + frame_T : FrameT + Current time-domain frame i (expected shape: (2048, 2)). + next_frame_T : FrameT + Next time-domain frame (i+1), used to decide transitions to/from ESH + (expected shape: (2048, 2)). + prev_frame_type : FrameType + Frame type chosen for the previous frame (i-1). + + Returns + ------- + FrameType + One of: "OLS", "LSS", "ESH", "LPS". + """ + if frame_T.shape != (2048, 2): + raise ValueError("frame_T must have shape (2048, 2).") + if next_frame_T.shape != (2048, 2): + raise ValueError("next_frame_T must have shape (2048, 2).") + + # Detect attack independently per channel on the next frame. + attack_l = _detect_attack(next_frame_T[:, 0]) + attack_r = _detect_attack(next_frame_T[:, 1]) + + # Decide per-channel type based on shared prev_frame_type. + ft_l = _decide_frame_type(prev_frame_type, attack_l) + ft_r = _decide_frame_type(prev_frame_type, attack_r) + + # Stereo merge as per the spec table. 
+ return _stereo_merge(ft_l, ft_r) diff --git a/source/level_2/core/aac_tns.py b/source/level_2/core/aac_tns.py new file mode 100644 index 0000000..29e9921 --- /dev/null +++ b/source/level_2/core/aac_tns.py @@ -0,0 +1,549 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Temporal Noise Shaping (TNS) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Temporal Noise Shaping (TNS) module (Level 2). +# +# Public API: +# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type) +# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs) +# +# Notes (per assignment): +# - TNS is applied per channel (not stereo). +# - For ESH, TNS is applied independently to each of the 8 short subframes. +# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short) +# provided in TableB219.mat. +# - Predictor order is fixed to p = 4. +# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1. +# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p +# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients. +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Tuple + +import numpy as np +from scipy.io import loadmat + +from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Private helpers +# ----------------------------------------------------------------------------- + +_B219_CACHE: dict[str, FloatArray] | None = None + + +def _load_b219_tables() -> dict[str, FloatArray]: + """ + Load TableB219.mat and cache the contents. 
+ + The project layout guarantees that a 'material' directory is discoverable + from the current working directory (tests and level_123 entrypoints). + + Returns + ------- + dict[str, FloatArray] + Keys: + - "B219a": long bands table (for K=1024 MDCT lines) + - "B219b": short bands table (for K=128 MDCT lines) + """ + global _B219_CACHE + if _B219_CACHE is not None: + return _B219_CACHE + + mat_path = Path("material") / "TableB219.mat" + if not mat_path.exists(): + raise FileNotFoundError("Could not locate material/TableB219.mat in the current working directory.") + + d = loadmat(str(mat_path)) + if "B219a" not in d or "B219b" not in d: + raise ValueError("TableB219.mat missing required variables B219a and/or B219b.") + + _B219_CACHE = { + "B219a": np.asarray(d["B219a"], dtype=np.float64), + "B219b": np.asarray(d["B219b"], dtype=np.float64), + } + return _B219_CACHE + + +def _band_ranges_for_kcount(k_count: int) -> BandRanges: + """ + Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count. + + Parameters + ---------- + k_count : int + Number of MDCT lines: + - 1024 for long frames + - 128 for short subframes (ESH) + + Returns + ------- + BandRanges (list[tuple[int, int]]) + Each tuple is (start_k, end_k) inclusive. 
+ """ + tables = _load_b219_tables() + if k_count == 1024: + tbl = tables["B219a"] + elif k_count == 128: + tbl = tables["B219b"] + else: + raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).") + + start = tbl[:, 1].astype(int) + end = tbl[:, 2].astype(int) + + ranges: list[tuple[int, int]] = [(int(s), int(e)) for s, e in zip(start, end)] + + for s, e in ranges: + if s < 0 or e < s or e >= k_count: + raise ValueError("Invalid band table ranges for given k_count.") + return ranges + + +# ----------------------------------------------------------------------------- +# Core DSP helpers +# ----------------------------------------------------------------------------- + +def _smooth_sw_inplace(sw: MdctCoeffs) -> None: + """ + Smooth Sw(k) to reduce discontinuities between adjacent Bark bands. + + The assignment applies two passes: + - Backward: Sw(k) = (Sw(k) + Sw(k+1))/2 + - Forward: Sw(k) = (Sw(k) + Sw(k-1))/2 + + Parameters + ---------- + sw : MdctCoeffs + 1-D array of length K (float64). Modified in-place. + """ + k_count = int(sw.shape[0]) + + for k in range(k_count - 2, -1, -1): + sw[k] = 0.5 * (sw[k] + sw[k + 1]) + + for k in range(1, k_count): + sw[k] = 0.5 * (sw[k] + sw[k - 1]) + + +def _compute_sw(x: MdctCoeffs) -> MdctCoeffs: + """ + Compute Sw(k) from band energies P(j) and apply boundary smoothing. + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT line array, length K. + + Returns + ------- + MdctCoeffs + Sw(k), 1-D array of length K, float64. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + k_count = int(x.shape[0]) + + bands = _band_ranges_for_kcount(k_count) + sw = np.zeros(k_count, dtype=np.float64) + + for s, e in bands: + seg = x[s : e + 1] + p_j = float(np.sum(seg * seg)) + sw_val = float(np.sqrt(p_j)) + sw[s : e + 1] = sw_val + + _smooth_sw_inplace(sw) + return sw + + +def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs: + """ + Autocorrelation r(m) for m=0..p. 
+ + Parameters + ---------- + x : MdctCoeffs + 1-D signal. + p : int + Maximum lag. + + Returns + ------- + MdctCoeffs + r, shape (p+1,), float64. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + n = int(x.shape[0]) + + r = np.zeros(p + 1, dtype=np.float64) + for m in range(p + 1): + r[m] = float(np.dot(x[m:], x[: n - m])) + return r + + +def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs: + """ + Solve Yule-Walker normal equations for LPC coefficients of order p. + + Parameters + ---------- + xw : MdctCoeffs + 1-D normalized sequence Xw(k). + p : int + Predictor order. + + Returns + ------- + MdctCoeffs + LPC coefficients a[0..p-1], shape (p,), float64. + """ + r = _autocorr(xw, p) + + R = np.empty((p, p), dtype=np.float64) + for i in range(p): + for j in range(p): + R[i, j] = r[abs(i - j)] + + rhs = r[1 : p + 1].reshape(p) + + reg = 1e-12 + R_reg = R + reg * np.eye(p, dtype=np.float64) + + a = np.linalg.solve(R_reg, rhs) + return a + + +def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs: + """ + Quantize LPC coefficients with uniform symmetric quantizer and clamp. + + Parameters + ---------- + a : MdctCoeffs + LPC coefficient array, shape (p,). + + Returns + ------- + MdctCoeffs + Quantized coefficients, shape (p,), float64. + """ + a = np.asarray(a, dtype=np.float64).reshape(-1) + q = np.round(a / QUANT_STEP) * QUANT_STEP + q = np.clip(q, -QUANT_MAX, QUANT_MAX) + return q.astype(np.float64, copy=False) + + +def _is_inverse_stable(a_q: MdctCoeffs) -> bool: + """ + Check stability of the inverse TNS filter H_TNS^{-1}. + + Forward filter: + H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p + + Inverse filter poles are roots of: + A(z) = 1 - a1 z^-1 - ... - ap z^-p + Multiply by z^p: + z^p - a1 z^{p-1} - ... - ap = 0 + + Stability condition: + all roots satisfy |z| < 1. + + Parameters + ---------- + a_q : MdctCoeffs + Quantized predictor coefficients, shape (p,). + + Returns + ------- + bool + True if stable, else False. 
+ """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + + # Polynomial in z: z^p - a1 z^{p-1} - ... - ap + poly = np.empty(p + 1, dtype=np.float64) + poly[0] = 1.0 + poly[1:] = -a_q + + roots = np.roots(poly) + + # Strictly inside unit circle for stability. Add tiny margin for numeric safety. + margin = 1e-12 + return bool(np.all(np.abs(roots) < (1.0 - margin))) + + +def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs: + """ + Make quantized predictor coefficients stable for inverse filtering. + + Policy: + - If already stable: return as-is. + - Else: iteratively shrink coefficients by gamma and re-quantize to the 0.1 grid. + - If still unstable after attempts: fall back to all-zero coefficients (disable TNS). + + Parameters + ---------- + a_q : MdctCoeffs + Quantized predictor coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Stable quantized coefficients, shape (p,). + """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + + if _is_inverse_stable(a_q): + return a_q + + # Try a few shrinking factors. Re-quantize after shrinking to keep coefficients on-grid. + gammas = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1) + + for g in gammas: + cand = _quantize_coeffs(g * a_q) + if _is_inverse_stable(cand): + return cand + + # Last resort: disable TNS for this vector + return np.zeros_like(a_q, dtype=np.float64) + + +def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: + """ + Apply forward TNS FIR filter: + y[k] = x[k] - sum_{l=1..p} a_l * x[k-l] + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT lines, length K. + a_q : MdctCoeffs + Quantized LPC coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Filtered MDCT lines y, length K. 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + k_count = int(x.shape[0]) + + y = np.zeros(k_count, dtype=np.float64) + for k in range(k_count): + acc = x[k] + for l in range(1, p + 1): + if k - l >= 0: + acc -= a_q[l - 1] * x[k - l] + y[k] = acc + return y + + +def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: + """ + Apply inverse TNS IIR filter: + x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l] + + Parameters + ---------- + y : MdctCoeffs + 1-D MDCT lines after TNS, length K. + a_q : MdctCoeffs + Quantized LPC coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Reconstructed MDCT lines x_hat, length K. + """ + y = np.asarray(y, dtype=np.float64).reshape(-1) + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + k_count = int(y.shape[0]) + + x_hat = np.zeros(k_count, dtype=np.float64) + for k in range(k_count): + acc = y[k] + for l in range(1, p + 1): + if k - l >= 0: + acc += a_q[l - 1] * x_hat[k - l] + x_hat[k] = acc + return x_hat + + +def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: + """ + TNS for a single MDCT vector (one long frame or one short subframe). + + Steps: + 1) Compute Sw(k) from Bark band energies and smooth it. + 2) Normalize: Xw(k) = X(k) / Sw(k) (safe when Sw=0). + 3) Compute LPC coefficients (order p=PRED_ORDER) on Xw. + 4) Quantize coefficients (4-bit symmetric, step QUANT_STEP). + 5) Apply FIR filter on original X(k) using quantized coefficients. + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT vector. + + Returns + ------- + y : MdctCoeffs + TNS-processed MDCT vector (same length). + a_q : MdctCoeffs + Quantized LPC coefficients, shape (PRED_ORDER,). 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + sw = _compute_sw(x) + + eps = 1e-12 + xw = np.where(sw > eps, x / sw, 0.0) + + a = _lpc_coeffs(xw, PRED_ORDER) + a_q = _quantize_coeffs(a) + + # Ensure inverse stability (assignment requirement) + a_q = _stabilize_quantized_coeffs(a_q) + + y = _apply_tns_fir(x, a_q) + return y, a_q + + + +# ----------------------------------------------------------------------------- +# Public Functions (Level 2) +# ----------------------------------------------------------------------------- + +def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]: + """ + Temporal Noise Shaping (TNS) for ONE channel. + + Parameters + ---------- + frame_F_in : FrameChannelF + Per-channel MDCT coefficients. + Expected (typical) shapes: + - If frame_type == "ESH": (128, 8) + - Else: (1024, 1) or (1024,) + + frame_type : FrameType + Frame type code ("OLS", "LSS", "ESH", "LPS"). + + Returns + ------- + frame_F_out : FrameChannelF + Per-channel MDCT coefficients after applying TNS. + Same shape convention as input. + + tns_coeffs : TnsCoeffs + Quantized TNS predictor coefficients. 
+ Expected shapes: + - If frame_type == "ESH": (PRED_ORDER, 8) + - Else: (PRED_ORDER, 1) + """ + x = np.asarray(frame_F_in, dtype=np.float64) + + if frame_type == "ESH": + if x.shape != (128, 8): + raise ValueError("For ESH, frame_F_in must have shape (128, 8).") + + y = np.empty_like(x, dtype=np.float64) + a_out = np.empty((PRED_ORDER, 8), dtype=np.float64) + + for j in range(8): + y[:, j], a_out[:, j] = _tns_one_vector(x[:, j]) + + return y, a_out + + if x.shape == (1024,): + x_vec = x + out_shape = (1024,) + elif x.shape == (1024, 1): + x_vec = x[:, 0] + out_shape = (1024, 1) + else: + raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') + + y_vec, a_q = _tns_one_vector(x_vec) + + if out_shape == (1024,): + y_out = y_vec + else: + y_out = y_vec.reshape(1024, 1) + + a_out = a_q.reshape(PRED_ORDER, 1) + return y_out, a_out + + +def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF: + """ + Inverse Temporal Noise Shaping (iTNS) for ONE channel. + + Parameters + ---------- + frame_F_in : FrameChannelF + Per-channel MDCT coefficients after TNS. + Expected (typical) shapes: + - If frame_type == "ESH": (128, 8) + - Else: (1024, 1) or (1024,) + + frame_type : FrameType + Frame type code ("OLS", "LSS", "ESH", "LPS"). + + tns_coeffs : TnsCoeffs + Quantized TNS predictor coefficients. + Expected shapes: + - If frame_type == "ESH": (PRED_ORDER, 8) + - Else: (PRED_ORDER, 1) + + Returns + ------- + FrameChannelF + Per-channel MDCT coefficients after inverse TNS. + Same shape convention as input frame_F_in. 
+ """ + x = np.asarray(frame_F_in, dtype=np.float64) + a = np.asarray(tns_coeffs, dtype=np.float64) + + if frame_type == "ESH": + if x.shape != (128, 8): + raise ValueError("For ESH, frame_F_in must have shape (128, 8).") + if a.shape != (PRED_ORDER, 8): + raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).") + + y = np.empty_like(x, dtype=np.float64) + for j in range(8): + y[:, j] = _apply_itns_iir(x[:, j], a[:, j]) + return y + + if a.shape != (PRED_ORDER, 1): + raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).") + + if x.shape == (1024,): + x_vec = x + out_shape = (1024,) + elif x.shape == (1024, 1): + x_vec = x[:, 0] + out_shape = (1024, 1) + else: + raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') + + y_vec = _apply_itns_iir(x_vec, a[:, 0]) + + if out_shape == (1024,): + return y_vec + return y_vec.reshape(1024, 1) diff --git a/source/level_2/core/aac_types.py b/source/level_2/core/aac_types.py new file mode 100644 index 0000000..b52891a --- /dev/null +++ b/source/level_2/core/aac_types.py @@ -0,0 +1,282 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Public Type Aliases +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module implements Public Type aliases +# ------------------------------------------------------------ +from __future__ import annotations + +from typing import List, Literal, TypeAlias, TypedDict +import numpy as np +from numpy.typing import NDArray + +# ----------------------------------------------------------------------------- +# Code enums (for readability; not intended to enforce shapes/lengths) +# ----------------------------------------------------------------------------- + +FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"] +""" +Frame type codes (AAC): +- "OLS": ONLY_LONG_SEQUENCE +- 
"LSS": LONG_START_SEQUENCE +- "ESH": EIGHT_SHORT_SEQUENCE +- "LPS": LONG_STOP_SEQUENCE +""" + +WinType: TypeAlias = Literal["KBD", "SIN"] +""" +Window type codes (AAC): +- "KBD": Kaiser-Bessel-Derived +- "SIN": sinusoid +""" + +ChannelKey: TypeAlias = Literal["chl", "chr"] +"""Channel dictionary keys used in Level payloads.""" + + +# ----------------------------------------------------------------------------- +# Array “semantic” aliases +# +# Goal: communicate meaning (time/frequency/window, stereo/channel) without +# forcing strict shapes in the type system. +# ----------------------------------------------------------------------------- + +FloatArray: TypeAlias = NDArray[np.float64] +""" +Generic float64 NumPy array. + +Note: +- We standardize internal numeric computations to float64 for stability and + reproducibility. External I/O can still be float32, but we convert at the + boundaries. +""" + +Window: TypeAlias = FloatArray +""" +Time-domain window (weighting sequence), 1-D. + +Typical lengths in this assignment: +- Long: 2048 +- Short: 256 +- Window sequences for LSS/LPS are also 2048 + +Expected shape: (N,) +dtype: float64 +""" + +TimeSignal: TypeAlias = FloatArray +""" +Time-domain signal samples, typically 1-D. + +Examples: +- Windowed MDCT input: shape (N,) +- IMDCT output: shape (N,) + +dtype: float64 +""" + +StereoSignal: TypeAlias = FloatArray +""" +Time-domain stereo signal stream. + +Expected (typical) shape: (N, 2) +- axis 0: time samples +- axis 1: channels [L, R] + +dtype: float64 +""" + +MdctCoeffs: TypeAlias = FloatArray +""" +MDCT coefficient vector, typically 1-D. + +Examples: +- Long: shape (1024,) +- Short: shape (128,) + +dtype: float64 +""" + +MdctFrameChannel: TypeAlias = FloatArray +""" +Per-channel MDCT container used in Level-1/2 sequences. 
+ +Typical shapes: +- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,) +- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) + +dtype: float64 + +Notes +----- +Some parts of the assignment store long-frame coefficients as a column vector +(1024, 1) to match MATLAB conventions. Internally you may also use (1024,) +when convenient, but the semantic meaning is identical. +""" + +TnsCoeffs: TypeAlias = FloatArray +""" +Quantized TNS predictor coefficients (one channel). + +Typical shapes (Level 2): +- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes) +- Else: (4, 1) (order p=4 for the long frame) + +dtype: float64 + +Notes +----- +The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1. +We store the quantized coefficient values as float64 (typically multiples of 0.1) +to keep the pipeline simple and readable. +""" + + +FrameT: TypeAlias = FloatArray +""" +Time-domain frame (stereo), as used by the filterbank input/output. + +Expected (typical) shape for stereo: (2048, 2) +- axis 0: time samples +- axis 1: channels [L, R] + +dtype: float64 +""" + +FrameChannelT: TypeAlias = FloatArray +""" +Time-domain single-channel frame. + +Expected (typical) shape: (2048,) + +dtype: float64 +""" + +FrameF: TypeAlias = FloatArray +""" +Frequency-domain frame (MDCT coefficients), stereo container. + +Typical shapes (Level 1): +- If frame_type in {"OLS","LSS","LPS"}: (1024, 2) +- If frame_type == "ESH": (128, 16) + +Rationale for ESH (128, 16): +- 8 short subframes per channel => 8 * 2 = 16 columns total +- Each short subframe per stereo is (128, 2), flattened into columns + in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R] + +dtype: float64 +""" + +FrameChannelF: TypeAlias = MdctFrameChannel +""" +Frequency-domain single-channel MDCT coefficients. 
+ +Typical shapes (Level 1/2): +- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,) +- If frame_type == "ESH": (128, 8) + +dtype: float64 +""" + +BandRanges: TypeAlias = list[tuple[int, int]] +""" +Bark-band index ranges [start, end] (inclusive) for MDCT lines. + +Used by TNS to map MDCT indices k to Bark bands. +""" + +# ----------------------------------------------------------------------------- +# Level 1 AAC sequence payload types +# ----------------------------------------------------------------------------- + +class AACChannelFrameF(TypedDict): + """ + Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1). + + Keys + ---- + frame_F: + The MDCT coefficients for ONE channel. + Typical shapes: + - ESH: (128, 8) (8 short subframes) + - else: (1024, 1) or (1024,) + """ + frame_F: FrameChannelF + + +class AACSeq1Frame(TypedDict): + """ + One frame dictionary element of aac_seq_1 (Level 1). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF + chr: AACChannelFrameF + + +AACSeq1: TypeAlias = List[AACSeq1Frame] +""" +AAC sequence for Level 1: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" +""" + + +# ----------------------------------------------------------------------------- +# Level 2 AAC sequence payload types (TNS) +# ----------------------------------------------------------------------------- + +class AACChannelFrameF2(TypedDict): + """ + Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2). + + Keys + ---- + frame_F: + The TNS-processed MDCT coefficients for ONE channel. + Typical shapes: + - ESH: (128, 8) + - else: (1024, 1) or (1024,) + tns_coeffs: + Quantized TNS predictor coefficients for ONE channel. + Typical shapes: + - ESH: (PRED_ORDER, 8) + - else: (PRED_ORDER, 1) + """ + frame_F: FrameChannelF + tns_coeffs: TnsCoeffs + + +class AACSeq2Frame(TypedDict): + """ + One frame dictionary element of aac_seq_2 (Level 2). 
+ """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF2 + chr: AACChannelFrameF2 + + +AACSeq2: TypeAlias = List[AACSeq2Frame] +""" +AAC sequence for Level 2: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" + +Level 2 adds: +- per-channel "tns_coeffs" +and stores: +- per-channel "frame_F" after applying TNS. +""" diff --git a/source/level_2/level_2.py b/source/level_2/level_2.py index eb5dc92..40adc1c 100644 --- a/source/level_2/level_2.py +++ b/source/level_2/level_2.py @@ -19,3 +19,125 @@ # ------------------------------------------------------------ from __future__ import annotations +from pathlib import Path +from typing import Union + +import soundfile as sf + +from core.aac_types import AACSeq2, StereoSignal +from core.aac_coder import aac_coder_2 as core_aac_coder_2 +from core.aac_coder import aac_read_wav_stereo_48k +from core.aac_decoder import aac_decoder_2 as core_aac_decoder_2 +from core.aac_snr_db import snr_db + +# ----------------------------------------------------------------------------- +# Public Level 2 API (wrappers) +# ----------------------------------------------------------------------------- + +def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: + """ + Level-2 AAC encoder (wrapper). + + Delegates to core implementation. + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename. + Assumption: stereo audio, sampling rate 48 kHz. + + Returns + ------- + AACSeq2 + List of encoded frames (Level 2 schema). + """ + return core_aac_coder_2(filename_in) + + +def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: + """ + Level-2 AAC decoder (wrapper). + + Delegates to core implementation. + + Parameters + ---------- + aac_seq_2 : AACSeq2 + Encoded sequence as produced by aac_coder_2(). + filename_out : Union[str, Path] + Output WAV filename. Assumption: 48 kHz, stereo. 
+ + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. + """ + return core_aac_decoder_2(aac_seq_2, filename_out) + + +# ----------------------------------------------------------------------------- +# Demo (Level 2) +# ----------------------------------------------------------------------------- + +def demo_aac_2(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float: + """ + Demonstration for the Level-2 codec. + + Runs: + - aac_coder_2(filename_in) + - aac_decoder_2(aac_seq_2, filename_out) + and computes total SNR between original and decoded audio. + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + filename_out : Union[str, Path] + Output WAV filename (stereo, 48 kHz). + + Returns + ------- + float + Overall SNR in dB. + """ + filename_in = Path(filename_in) + filename_out = Path(filename_out) + + # Read original audio (reference) with the same validation as the codec. 
+ x_ref, fs_ref = aac_read_wav_stereo_48k(filename_in) + if int(fs_ref) != 48000: + raise ValueError("Input sampling rate must be 48 kHz.") + + # Encode / decode + aac_seq_2 = aac_coder_2(filename_in) + x_hat = i_aac_coder_2(aac_seq_2, filename_out) + + # Optional sanity: ensure output file exists and is readable + _, fs_hat = sf.read(str(filename_out), always_2d=True) + if int(fs_hat) != 48000: + raise ValueError("Decoded output sampling rate must be 48 kHz.") + + return snr_db(x_ref, x_hat) + + +# ----------------------------------------------------------------------------- +# CLI +# ----------------------------------------------------------------------------- + +if __name__ == "__main__": + # Example: + # cd level_2 + # python -m level_2 input.wav output.wav + # or + # python -m level_2 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l2.wav + import sys + + if len(sys.argv) != 3: + raise SystemExit("Usage: python -m level_2 ") + + in_wav = Path(sys.argv[1]) + out_wav = Path(sys.argv[2]) + + print(f"Encoding/Decoding {in_wav} to {out_wav}") + snr = demo_aac_2(in_wav, out_wav) + print(f"SNR = {snr:.3f} dB") diff --git a/source/level_2/material/LicorDeCalandraca.wav b/source/level_2/material/LicorDeCalandraca.wav new file mode 100644 index 0000000..a527e8c Binary files /dev/null and b/source/level_2/material/LicorDeCalandraca.wav differ diff --git a/source/level_2/material/LicorDeCalandraca_out2.wav b/source/level_2/material/LicorDeCalandraca_out2.wav new file mode 100644 index 0000000..2630d3a Binary files /dev/null and b/source/level_2/material/LicorDeCalandraca_out2.wav differ diff --git a/source/level_2/material/TableB219.mat b/source/level_2/material/TableB219.mat new file mode 100644 index 0000000..7b2e403 Binary files /dev/null and b/source/level_2/material/TableB219.mat differ diff --git a/source/level_2/material/huffCodebooks.mat b/source/level_2/material/huffCodebooks.mat new file mode 100644 index 0000000..a208b3d Binary files /dev/null and 
b/source/level_2/material/huffCodebooks.mat differ diff --git a/source/level_2/material/huff_utils.py b/source/level_2/material/huff_utils.py new file mode 100644 index 0000000..a8c7fe7 --- /dev/null +++ b/source/level_2/material/huff_utils.py @@ -0,0 +1,400 @@ +import numpy as np +import scipy.io as sio +import os + +# ------------------ LOAD LUT ------------------ + +def load_LUT(mat_filename=None): + """ + Loads the list of Huffman Codebooks (LUTs) + + Returns: + huffLUT : list (index 1..11 used, index 0 unused) + """ + if mat_filename is None: + current_dir = os.path.dirname(os.path.abspath(__file__)) + mat_filename = os.path.join(current_dir, "huffCodebooks.mat") + + mat = sio.loadmat(mat_filename) + + + huffCodebooks_raw = mat['huffCodebooks'].squeeze() + + huffCodebooks = [] + for i in range(11): + huffCodebooks.append(np.array(huffCodebooks_raw[i])) + + # Build inverse VLC tables + invTable = [None] * 11 + + for i in range(11): + h = huffCodebooks[i][:, 2].astype(int) # column 3 + hlength = huffCodebooks[i][:, 1].astype(int) # column 2 + + hbin = [] + for j in range(len(h)): + hbin.append(format(h[j], f'0{hlength[j]}b')) + + invTable[i] = vlc_table(hbin) + + # Build Huffman LUT dicts + huffLUT = [None] * 12 # index 0 unused + params = [ + (4, 1, True), + (4, 1, True), + (4, 2, False), + (4, 2, False), + (2, 4, True), + (2, 4, True), + (2, 7, False), + (2, 7, False), + (2, 12, False), + (2, 12, False), + (2, 16, False), + ] + + for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1): + huffLUT[i] = { + 'LUT': huffCodebooks[i-1], + 'invTable': invTable[i-1], + 'codebook': i, + 'nTupleSize': nTupleSize, + 'maxAbsCodeVal': maxAbs, + 'signedValues': signed + } + + return huffLUT + +def vlc_table(code_array): + """ + codeArray: list of strings, each string is a Huffman codeword (e.g. 
'0101') + returns: + h : NumPy array of shape (num_nodes, 3) + columns: + [ next_if_0 , next_if_1 , symbol_index ] + """ + h = np.zeros((1, 3), dtype=int) + + for code_index, code in enumerate(code_array, start=1): + word = [int(bit) for bit in code] + h_index = 0 + + for bit in word: + k = bit + next_node = h[h_index, k] + if next_node == 0: + h = np.vstack([h, [0, 0, 0]]) + new_index = h.shape[0] - 1 + h[h_index, k] = new_index + h_index = new_index + else: + h_index = next_node + + h[h_index, 2] = code_index + + return h + +# ------------------ ENCODE ------------------ + +def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None): + """ + Huffman-encode a sequence of quantized coefficients. + + This function selects the appropriate Huffman codebook based on the + maximum absolute value of the input coefficients, encodes the coefficients + into a binary Huffman bitstream, and returns both the bitstream and the + selected codebook index. + + This is the Python equivalent of the MATLAB `encodeHuff.m` function used + in audio/image coding (e.g., scale factor band encoding). The input + coefficient sequence is grouped into fixed-size tuples as defined by + the chosen Huffman LUT. Zero-padding may be applied internally. + + Parameters + ---------- + coeff_sec : array_like of int + 1-D array of quantized integer coefficients to encode. + Typically corresponds to a "section" or scale-factor band. + + huff_LUT_list : list + List of Huffman lookup-table dictionaries as returned by `loadLUT()`. + Index 1..11 correspond to valid Huffman codebooks. + Index 0 is unused. + + Returns + ------- + huffSec : str + Huffman-encoded bitstream represented as a string of '0' and '1' + characters. + + huffCodebook : int + Index (1..11) of the Huffman codebook used for encoding. + A value of 0 indicates a special all-zero section. 
+ """ + if force_codebook is not None: + return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec) + + maxAbsVal = np.max(np.abs(coeff_sec)) + + if maxAbsVal == 0: + huffCodebook = 0 + huffSec = huff_LUT_code_0() + + elif maxAbsVal == 1: + candidates = [1, 2] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal == 2: + candidates = [3, 4] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (3, 4): + candidates = [5, 6] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (5, 6, 7): + candidates = [7, 8] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (8, 9, 10, 11, 12): + candidates = [9, 10] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (13, 14, 15): + huffCodebook = 11 + huffSec = 
def huff_LUT_code_1(huff_LUT, coeff_sec):
    """
    Encode one section of quantized coefficients with a single Huffman codebook.

    Parameters
    ----------
    huff_LUT : dict
        Codebook with keys:
        - 'LUT'           : table; column 1 = code length in bits,
                            column 2 = code value
        - 'nTupleSize'    : coefficients grouped per Huffman symbol
        - 'maxAbsCodeVal' : maximum absolute value the codebook covers
        - 'signedValues'  : True if the sign is part of the symbol
    coeff_sec : array-like of int
        Quantized coefficients of the section.

    Returns
    -------
    str
        Concatenated bit string of '0'/'1' characters.  For unsigned
        codebooks each symbol is followed by nTupleSize sign bits
        ('1' = negative).
    """
    LUT = huff_LUT['LUT']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    signedValues = huff_LUT['signedValues']

    numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))

    if signedValues:
        # Signed codebooks shift values into [0, 2*maxAbsCodeVal].
        coeff = np.asarray(coeff_sec) + maxAbsCodeVal
        base = 2 * maxAbsCodeVal + 1
        pad_value = maxAbsCodeVal  # shifted representation of 0
    else:
        coeff = np.asarray(coeff_sec)
        base = maxAbsCodeVal + 1
        pad_value = 0

    # Pad the last tuple with the encoding of zero.  (The previous code
    # padded with raw 0, which in the shifted signed domain decodes to
    # -maxAbsCodeVal instead of 0.)
    coeffPad = np.full(numTuples * nTupleSize, pad_value, dtype=int)
    coeffPad[:len(coeff)] = coeff

    powers = base ** np.arange(nTupleSize - 1, -1, -1)
    huffSec = []

    for i in range(numTuples):
        nTuple = coeffPad[i * nTupleSize:(i + 1) * nTupleSize]

        # Mixed-radix index of the |tuple| into the codebook table.
        huffIndex = int(np.abs(nTuple) @ powers)
        hexVal = LUT[huffIndex, 2]
        huffLen = LUT[huffIndex, 1]
        bits = format(int(hexVal), f'0{int(huffLen)}b')

        if signedValues:
            huffSec.append(bits)
        else:
            # Sign bits only exist for unsigned codebooks.
            signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
            huffSec.append(bits + signBits)

    return ''.join(huffSec)


def huff_LUT_code_0():
    """Codebook 0 (all-zero section): no bits are transmitted."""
    return ''


def huff_LUT_code_ESC(huff_LUT, coeff_sec):
    """
    Encode a section with codebook 11, using escape sequences for
    coefficients whose magnitude exceeds 15.

    Each escaped coefficient is clamped to the marker value 16 inside the
    Huffman symbol; the true magnitude is appended as
    '1' * N + '0' + (|x| - 2**N4) in N4 bits, with N4 = floor(log2(|x|))
    and N = N4 - 4.

    Parameters
    ----------
    huff_LUT : dict
        Codebook 11 table ('LUT', 'nTupleSize', 'maxAbsCodeVal').
    coeff_sec : array-like of int
        Quantized coefficients of the section.

    Returns
    -------
    str
        Concatenated bit string: code word + sign bits + escape payloads.
    """
    LUT = huff_LUT['LUT']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']

    numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
    base = maxAbsCodeVal + 1

    coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
    coeffPad[:len(coeff_sec)] = coeff_sec

    powers = base ** np.arange(nTupleSize - 1, -1, -1)
    huffSec = []

    for i in range(numTuples):
        nTuple = coeffPad[i * nTupleSize:(i + 1) * nTupleSize]

        # Bit position of the leading 1 of |x|; eps guards log2(0)
        # (zeros are never escaped, so their N4 is unused).
        lnTuple = nTuple.astype(float)
        lnTuple[lnTuple == 0] = np.finfo(float).eps
        N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int))
        N = np.maximum(0, N4 - 4)  # length of the unary '1' run
        esc = np.abs(nTuple) > 15

        # Escaped magnitudes are clamped to the escape marker 16.
        nTupleESC = nTuple.copy()
        nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16

        huffIndex = int(np.abs(nTupleESC) @ powers)
        bits = format(int(LUT[huffIndex, 2]), f'0{int(LUT[huffIndex, 1])}b')

        # Escape payload: N ones, a terminating '0', then |x| - 2**N4
        # in exactly N4 bits.
        escSeq = ''
        for k in range(nTupleSize):
            if esc[k]:
                escSeq += '1' * N[k] + '0'
                escSeq += format(abs(int(nTuple[k])) - (1 << int(N4[k])), f'0{int(N4[k])}b')

        signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
        huffSec.append(bits + signBits + escSeq)

    return ''.join(huffSec)

# ------------------ DECODE ------------------

def decode_huff(huff_sec, huff_LUT):
    """
    Decode a Huffman-encoded stream.

    Parameters
    ----------
    huff_sec : str or array-like of int
        Encoded bit stream as '0'/'1' characters or 0/1 integers.
    huff_LUT : dict
        Lookup table with keys:
        - 'invTable'      : inverse (decoding) table; row r holds
                            [child-on-0, child-on-1, symbolIndex + 1],
                            a leaf being a row whose two children are 0
        - 'codebook'      : codebook number (11 enables escape decoding)
        - 'nTupleSize'    : coefficients per Huffman symbol
        - 'maxAbsCodeVal' : maximum absolute value coded directly
        - 'signedValues'  : True if the sign is part of the symbol

    Returns
    -------
    list of int
        Decoded quantized coefficients.  If the encoder padded the last
        tuple, the padding zeros are also returned; the caller must
        truncate to the known section length.
    """
    h = huff_LUT['invTable']
    huffCodebook = huff_LUT['codebook']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    signedValues = huff_LUT['signedValues']

    if isinstance(huff_sec, str):
        huff_sec = np.array([int(b) for b in huff_sec])

    decCoeffs = []
    streamIndex = 0

    while streamIndex < len(huff_sec):
        # --- walk the inverse table until a leaf is reached -------------
        r = 0  # start at the root
        while True:
            r = h[r, huff_sec[streamIndex]]
            streamIndex += 1
            if h[r, 0] == 0 and h[r, 1] == 0:
                symbolIndex = h[r, 2] - 1  # table stores 1-based indices
                break

        # --- split the symbol index into an n-tuple of magnitudes -------
        if signedValues:
            base = 2 * maxAbsCodeVal + 1
            offset = maxAbsCodeVal  # undo the encoder's value shift
        else:
            base = maxAbsCodeVal + 1
            offset = 0
        nTupleDec = []
        tmp = symbolIndex
        for p in reversed(range(nTupleSize)):
            nTupleDec.append(tmp // base ** p - offset)
            tmp %= base ** p
        nTupleDec = np.array(nTupleDec)

        # --- sign bits: transmitted only for UNSIGNED codebooks ---------
        # (The previous code consumed nTupleSize sign bits unconditionally,
        # desynchronizing the stream for signed codebooks whose encoder
        # emits none.)
        if signedValues:
            nTupleSign = np.ones(nTupleSize, dtype=int)
        else:
            signBits = huff_sec[streamIndex:streamIndex + nTupleSize]
            nTupleSign = np.where(signBits == 1, -1, 1)
            streamIndex += nTupleSize
            nTupleDec = nTupleDec * nTupleSign

        # --- escape sequences (codebook 11 only) ------------------------
        escIndex = np.where(np.abs(nTupleDec) == 16)[0]
        if huffCodebook == 11 and escIndex.size > 0:
            for idx in escIndex:
                # Unary run of '1's gives N; skip the run AND its
                # terminating '0' before slicing the N4-bit mantissa.
                # (The previous slice started on the '0', shifting the
                # escape word by one bit.)
                N = 0
                while huff_sec[streamIndex + N]:
                    N += 1
                streamIndex += N + 1
                N4 = N + 4
                escape_word = huff_sec[streamIndex:streamIndex + N4]
                streamIndex += N4
                escape_value = (1 << N4) + int(''.join(map(str, escape_word)), 2)
                nTupleDec[idx] = escape_value
            # Restore the signs of the escaped coefficients.
            nTupleDec[escIndex] *= nTupleSign[escIndex]

        decCoeffs.extend(nTupleDec.tolist())

    return decCoeffs