From 8427d0e7214ff49e44f19c1ee1fcb82e6a31b10a Mon Sep 17 00:00:00 2001 From: Christos Choutouridis Date: Sun, 8 Feb 2026 17:22:23 +0200 Subject: [PATCH] Level_1: File restructure to support centralized development --- source/core/aac_coder.py | 198 ++++ source/core/aac_configuration.py | 22 + source/core/aac_decoder.py | 166 ++++ source/core/aac_filterbank.py | 454 +++++++++ source/core/aac_ssc.py | 217 +++++ source/core/aac_types.py | 193 ++++ source/core/tests/test_SSC.py | 234 +++++ .../tests/test_aac_coder_decoder.py} | 61 +- source/core/tests/test_filterbank.py | 269 ++++++ .../tests/test_filterbank_internal.py | 61 +- source/level_1/core/aac_coder.py | 198 ++++ source/level_1/core/aac_configuration.py | 22 + source/level_1/core/aac_decoder.py | 166 ++++ source/level_1/core/aac_filterbank.py | 454 +++++++++ source/level_1/core/aac_ssc.py | 217 +++++ source/level_1/core/aac_types.py | 193 ++++ source/level_1/core/tests/test_SSC.py | 234 +++++ .../core/tests/test_aac_coder_decoder.py | 156 ++++ source/level_1/core/tests/test_filterbank.py | 269 ++++++ .../core/tests/test_filterbank_internal.py | 117 +++ source/level_1/level_1.py | 859 ++---------------- source/level_1/tests/test_SSC.py | 199 ---- source/level_1/tests/test_filterbank.py | 235 ----- source/level_2/level_2.py | 21 + source/pytest.ini | 4 + 25 files changed, 3990 insertions(+), 1229 deletions(-) create mode 100644 source/core/aac_coder.py create mode 100644 source/core/aac_configuration.py create mode 100644 source/core/aac_decoder.py create mode 100644 source/core/aac_filterbank.py create mode 100644 source/core/aac_ssc.py create mode 100644 source/core/aac_types.py create mode 100644 source/core/tests/test_SSC.py rename source/{level_1/tests/test_aac_level1.py => core/tests/test_aac_coder_decoder.py} (62%) create mode 100644 source/core/tests/test_filterbank.py rename source/{level_1 => core}/tests/test_filterbank_internal.py (50%) create mode 100644 source/level_1/core/aac_coder.py create mode 
100644 source/level_1/core/aac_configuration.py create mode 100644 source/level_1/core/aac_decoder.py create mode 100644 source/level_1/core/aac_filterbank.py create mode 100644 source/level_1/core/aac_ssc.py create mode 100644 source/level_1/core/aac_types.py create mode 100644 source/level_1/core/tests/test_SSC.py create mode 100644 source/level_1/core/tests/test_aac_coder_decoder.py create mode 100644 source/level_1/core/tests/test_filterbank.py create mode 100644 source/level_1/core/tests/test_filterbank_internal.py delete mode 100644 source/level_1/tests/test_SSC.py delete mode 100644 source/level_1/tests/test_filterbank.py create mode 100644 source/level_2/level_2.py create mode 100644 source/pytest.ini diff --git a/source/core/aac_coder.py b/source/core/aac_coder.py new file mode 100644 index 0000000..837ac34 --- /dev/null +++ b/source/core/aac_coder.py @@ -0,0 +1,198 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - AAC Coder (Core) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 1 AAC encoder orchestration. 
+# Keeps the same functional behavior as the original level_1 implementation: +# - Reads WAV via soundfile +# - Validates stereo and 48 kHz +# - Frames into 2048 samples with hop=1024 and zero padding at both ends +# - SSC decision uses next-frame attack detection +# - Filterbank analysis (MDCT) +# - Stores per-channel spectra in AACSeq1 schema: +# * ESH: (128, 8) +# * else: (1024, 1) +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Union + +import soundfile as sf + +from core.aac_configuration import WIN_TYPE +from core.aac_filterbank import aac_filter_bank +from core.aac_ssc import aac_SSC +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Public helpers (useful for level_x demo wrappers) +# ----------------------------------------------------------------------------- + +def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]: + """ + Read a WAV file using soundfile and validate the Level-1 assumptions. + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename. + + Returns + ------- + x : StereoSignal (np.ndarray) + Stereo samples as float64, shape (N, 2). + fs : int + Sampling rate (Hz). Must be 48000. + + Raises + ------ + ValueError + If the input is not stereo or the sampling rate is not 48 kHz. 
+ """ + filename_in = Path(filename_in) + + x, fs = sf.read(str(filename_in), always_2d=True) + x = np.asarray(x, dtype=np.float64) + + if x.shape[1] != 2: + raise ValueError("Input must be stereo (2 channels).") + if int(fs) != 48000: + raise ValueError("Input sampling rate must be 48 kHz.") + + return x, int(fs) + + +def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]: + """ + Convert the stereo FrameF returned by aac_filter_bank() into per-channel arrays + as required by the Level-1 AACSeq1 schema. + + Parameters + ---------- + frame_type : FrameType + "OLS" | "LSS" | "ESH" | "LPS". + frame_f : FrameF + Output of aac_filter_bank(): + - If frame_type != "ESH": shape (1024, 2) + - If frame_type == "ESH": shape (128, 16) packed as [L0 R0 L1 R1 ... L7 R7] + + Returns + ------- + chl_f : FrameChannelF + Left channel coefficients: + - ESH: shape (128, 8) + - else: shape (1024, 1) + chr_f : FrameChannelF + Right channel coefficients: + - ESH: shape (128, 8) + - else: shape (1024, 1) + """ + if frame_type == "ESH": + if frame_f.shape != (128, 16): + raise ValueError("For ESH, frame_f must have shape (128, 16).") + + chl_f = np.empty((128, 8), dtype=np.float64) + chr_f = np.empty((128, 8), dtype=np.float64) + for j in range(8): + chl_f[:, j] = frame_f[:, 2 * j + 0] + chr_f[:, j] = frame_f[:, 2 * j + 1] + return chl_f, chr_f + + # Non-ESH: store as (1024, 1) as required by the original Level-1 schema. 
+ if frame_f.shape != (1024, 2): + raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).") + + chl_f = frame_f[:, 0:1].astype(np.float64, copy=False) + chr_f = frame_f[:, 1:2].astype(np.float64, copy=False) + return chl_f, chr_f + + + +# ----------------------------------------------------------------------------- +# Level 1 encoder +# ----------------------------------------------------------------------------- + +def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: + """ + Level-1 AAC encoder. + + This function preserves the behavior of the original level_1 implementation: + - Read stereo 48 kHz WAV + - Pad hop samples at start and hop samples at end + - Frame with win=2048, hop=1024 + - Use SSC with next-frame lookahead + - Apply filterbank analysis + - Store per-channel coefficients using AACSeq1 schema + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename. + Assumption: stereo audio, sampling rate 48 kHz. + + Returns + ------- + AACSeq1 + List of encoded frames (Level 1 schema). + """ + x, fs = aac_read_wav_stereo_48k(filename_in) + _ = fs # kept for clarity; The assignment assumes 48 kHz + + hop = 1024 + win = 2048 + + # Pad at the beginning to support the first overlap region. + # Tail padding is kept minimal; next-frame is padded on-the-fly when needed. + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + # Number of frames such that current frame fits; next frame will be padded if needed. + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + aac_seq: AACSeq1 = [] + prev_frame_type: FrameType = "OLS" + + win_type: WinType = WIN_TYPE + + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start:start + win, :] + if frame_t.shape != (win, 2): + # This should not happen due to K definition, but keep it explicit. 
+ raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop:start + hop + win, :] + + # Ensure next_t is always (2048, 2) by zero-padding at the tail. + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + frame_f = aac_filter_bank(frame_t, frame_type, win_type) + + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) + + aac_seq.append({ + "frame_type": frame_type, + "win_type": win_type, + "chl": {"frame_F": chl_f}, + "chr": {"frame_F": chr_f}, + }) + + prev_frame_type = frame_type + + return aac_seq diff --git a/source/core/aac_configuration.py b/source/core/aac_configuration.py new file mode 100644 index 0000000..262e884 --- /dev/null +++ b/source/core/aac_configuration.py @@ -0,0 +1,22 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Configuration +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module contains the global configurations +# +# ------------------------------------------------------------ +from __future__ import annotations + +# Imports +from core.aac_types import WinType + +# Window type +# Options: "SIN", "KBD" +WIN_TYPE: WinType = "SIN" \ No newline at end of file diff --git a/source/core/aac_decoder.py b/source/core/aac_decoder.py new file mode 100644 index 0000000..eb30011 --- /dev/null +++ b/source/core/aac_decoder.py @@ -0,0 +1,166 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Inverse AAC Coder (Core) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 1 AAC decoder orchestration (inverse of aac_coder_1()). 
+# Keeps the same functional behavior as the original level_1 implementation: +# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank() +# - IMDCT synthesis per frame +# - Overlap-add with hop=1024 +# - Remove encoder boundary padding: hop at start and hop at end +# +# Note: +# This core module returns the reconstructed samples. Writing to disk is kept +# in level_x demos. +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Union + +import soundfile as sf + +from core.aac_filterbank import aac_i_filter_bank +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Public helpers (useful for level_x demo wrappers) +# ----------------------------------------------------------------------------- + +def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: + """ + Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo + FrameF container expected by aac_i_filter_bank(). + + Parameters + ---------- + frame_type : FrameType + "OLS" | "LSS" | "ESH" | "LPS". + chl_f : FrameChannelF + Left channel coefficients: + - ESH: (128, 8) + - else: (1024, 1) + chr_f : FrameChannelF + Right channel coefficients: + - ESH: (128, 8) + - else: (1024, 1) + + Returns + ------- + FrameF + Stereo coefficients: + - ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7] + - else: (1024, 2) + """ + if frame_type == "ESH": + if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): + raise ValueError("ESH channel frame_F must have shape (128, 8).") + + frame_f = np.empty((128, 16), dtype=np.float64) + for j in range(8): + frame_f[:, 2 * j + 0] = chl_f[:, j] + frame_f[:, 2 * j + 1] = chr_f[:, j] + return frame_f + + # Non-ESH: expected (1024, 1) per channel in Level-1 schema. 
+ if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1): + raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).") + + frame_f = np.empty((1024, 2), dtype=np.float64) + frame_f[:, 0] = chl_f[:, 0] + frame_f[:, 1] = chr_f[:, 0] + return frame_f + + +def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal: + """ + Remove the boundary padding that the Level-1 encoder adds: + hop samples at start and hop samples at end. + + Parameters + ---------- + y_pad : StereoSignal (np.ndarray) + Reconstructed padded stream, shape (N_pad, 2). + hop : int + Hop size in samples (default 1024). + + Returns + ------- + StereoSignal (np.ndarray) + Unpadded reconstructed stream, shape (N_pad - 2*hop, 2). + + Raises + ------ + ValueError + If y_pad is too short to unpad. + """ + if y_pad.shape[0] < 2 * hop: + raise ValueError("Decoded stream too short to unpad.") + return y_pad[hop:-hop, :] + + +# ----------------------------------------------------------------------------- +# Level 1 decoder (core) +# ----------------------------------------------------------------------------- + +def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: + """ + Level-1 AAC decoder (inverse of aac_coder_1()). + + This function preserves the behavior of the original level_1 implementation: + - Reconstruct the full padded stream by overlap-adding K synthesized frames + - Remove hop padding at the beginning and hop padding at the end + - Write the reconstructed stereo WAV file (48 kHz) + - Return reconstructed stereo samples as float64 + + Parameters + ---------- + aac_seq_1 : AACSeq1 + Encoded sequence as produced by aac_coder_1(). + filename_out : Union[str, Path] + Output WAV filename. Assumption: 48 kHz, stereo. + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. 
+ """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_1) + + # Output includes the encoder padding region, so we reconstruct the full padded stream. + # For K frames: last frame starts at (K-1)*hop and spans win, + # so total length = (K-1)*hop + win. + n_pad = (K - 1) * hop + win + y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) + + for i, fr in enumerate(aac_seq_1): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) + chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) + + frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) + + start = i * hop + y_pad[start:start + win, :] += frame_t_hat + + y: StereoSignal = aac_remove_padding(y_pad, hop=hop) + + # Level 1 assumption: 48 kHz output. + sf.write(str(filename_out), y, 48000) + + return y diff --git a/source/core/aac_filterbank.py b/source/core/aac_filterbank.py new file mode 100644 index 0000000..60eb9c2 --- /dev/null +++ b/source/core/aac_filterbank.py @@ -0,0 +1,454 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Filterbank module +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking +# +# ------------------------------------------------------------ +from __future__ import annotations + +from core.aac_types import * + +from scipy.signal.windows import kaiser + +# Private helpers for Filterbank +# ------------------------------------------------------------ + +def _sin_window(N: int) -> Window: + """ + Build a sinusoidal (SIN) window of length N. 
+ + The AAC sinusoid window is: + w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N + + Parameters + ---------- + N : int + Window length in samples. + + Returns + ------- + Window + 1-D array of shape (N, ) with dtype float64. + """ + n = np.arange(N, dtype=np.float64) + return np.sin((np.pi / N) * (n + 0.5)) + + +def _kbd_window(N: int, alpha: float) -> Window: + """ + Build a Kaiser-Bessel-Derived (KBD) window of length N. + + This follows the standard KBD construction used in AAC: + 1) Build a Kaiser kernel of length (N/2 + 1). + 2) Form the left half by cumulative summation, normalization, and sqrt. + 3) Mirror the left half to form the right half (symmetric full-length window). + + Notes + ----- + - N must be even (AAC uses N=2048 for long and N=256 for short). + - The assignment specifies alpha=6 for long windows and alpha=4 for short windows. + - The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context. + + Parameters + ---------- + N : int + Window length in samples (must be even). + alpha : float + KBD alpha parameter. + + Returns + ------- + Window + 1-D array of shape (N,) with dtype float64. + """ + half = N // 2 + + # Kaiser kernel length: half + 1 samples (0 .. half) + # beta = pi * alpha per the usual correspondence with the ISO definition + kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64) + + csum = np.cumsum(kernel) + denom = csum[-1] + + w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1 + w_right = w_left[::-1] # mirror for second half + + return np.concatenate([w_left, w_right]) + + +def _long_window(win_type: WinType) -> Window: + """ + Return the long AAC window (length 2048) for the selected window family. + + Parameters + ---------- + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (2048,) with dtype float64. 
+ """ + if win_type == "SIN": + return _sin_window(2048) + if win_type == "KBD": + # Assignment-specific alpha values + return _kbd_window(2048, alpha=6.0) + raise ValueError(f"Invalid win_type: {win_type!r}") + + +def _short_window(win_type: WinType) -> Window: + """ + Return the short AAC window (length 256) for the selected window family. + + Parameters + ---------- + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (256,) with dtype float64. + """ + if win_type == "SIN": + return _sin_window(256) + if win_type == "KBD": + # Assignment-specific alpha values + return _kbd_window(256, alpha=4.0) + raise ValueError(f"Invalid win_type: {win_type!r}") + + +def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window: + """ + Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS. + + In this assignment we assume a single window family is used globally + (no mixed KBD/SIN halves). Therefore, both the long and short windows + are drawn from the same family. + + For frame_type: + - "OLS": return the long window Wl (2048). + - "LSS": construct [Wl_left(1024), ones(448), Ws_right(128), zeros(448)]. + - "LPS": construct [zeros(448), Ws_left(128), ones(448), Wl_right(1024)]. + + Parameters + ---------- + frame_type : FrameType + One of "OLS", "LSS", "LPS". + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (2048,) with dtype float64. 
+ """ + wL = _long_window(win_type) # length 2048 + wS = _short_window(win_type) # length 256 + + if frame_type == "OLS": + return wL + + if frame_type == "LSS": + # 0..1023: left half of long window + # 1024..1471: ones (448 samples) + # 1472..1599: right half of short window (128 samples) + # 1600..2047: zeros (448 samples) + out = np.zeros(2048, dtype=np.float64) + out[0:1024] = wL[0:1024] + out[1024:1472] = 1.0 + out[1472:1600] = wS[128:256] + out[1600:2048] = 0.0 + return out + + if frame_type == "LPS": + # 0..447: zeros (448) + # 448..575: left half of short window (128) + # 576..1023: ones (448) + # 1024..2047: right half of long window (1024) + out = np.zeros(2048, dtype=np.float64) + out[0:448] = 0.0 + out[448:576] = wS[0:128] + out[576:1024] = 1.0 + out[1024:2048] = wL[1024:2048] + return out + + raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}") + + +def _mdct(s: TimeSignal) -> MdctCoeffs: + """ + MDCT (direct form) as specified in the assignment. + + Parameters + ---------- + s : TimeSignal + Windowed time samples, 1-D array of length N (N = 2048 or 256). + + Returns + ------- + MdctCoeffs + MDCT coefficients, 1-D array of length N/2. + + Definition + ---------- + X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)), + where n0 = (N/2 + 1)/2. + """ + s = np.asarray(s, dtype=np.float64).reshape(-1) + N = int(s.shape[0]) + if N not in (2048, 256): + raise ValueError("MDCT input length must be 2048 or 256.") + + n0 = (N / 2.0 + 1.0) / 2.0 + n = np.arange(N, dtype=np.float64) + n0 + k = np.arange(N // 2, dtype=np.float64) + 0.5 + + C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2) + X = 2.0 * (s @ C) # (N/2,) + return X + + +def _imdct(X: MdctCoeffs) -> TimeSignal: + """ + IMDCT (direct form) as specified in the assignment. + + Parameters + ---------- + X : MdctCoeffs + MDCT coefficients, 1-D array of length K (K = 1024 or 128). 
+ + Returns + ------- + TimeSignal + Reconstructed time samples, 1-D array of length N = 2K. + + Definition + ---------- + s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)), + where n0 = (N/2 + 1)/2. + """ + X = np.asarray(X, dtype=np.float64).reshape(-1) + K = int(X.shape[0]) + if K not in (1024, 128): + raise ValueError("IMDCT input length must be 1024 or 128.") + + N = 2 * K + n0 = (N / 2.0 + 1.0) / 2.0 + + n = np.arange(N, dtype=np.float64) + n0 + k = np.arange(K, dtype=np.float64) + 0.5 + + C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K) + s = (2.0 / N) * (C @ X) # (N,) + return s + + +def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF: + """ + ESH analysis for one channel. + + Parameters + ---------- + x_ch : FrameChannelT + Time-domain channel frame (expected shape: (2048,)). + win_type : WinType + Window family ("KBD" or "SIN"). + + Returns + ------- + FrameChannelF + Array of shape (128, 8). Column j contains the 128 MDCT coefficients + of the j-th short window. + """ + wS = _short_window(win_type) # (256,) + X_esh = np.empty((128, 8), dtype=np.float64) + + # ESH subwindows are taken from the central region: + # start positions: 448 + 128*j, j = 0..7 + for j in range(8): + start = 448 + 128 * j + seg = x_ch[start:start + 256] * wS # (256,) + X_esh[:, j] = _mdct(seg) # (128,) + + return X_esh + + +def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]: + """ + Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8). + + Parameters + ---------- + frame_F : FrameF + Packed ESH spectrum (expected shape: (128, 16)). + + Returns + ------- + left : FrameChannelF + Left channel spectrum, shape (128, 8). + right : FrameChannelF + Right channel spectrum, shape (128, 8). 
+ + Notes + ----- + Inverse mapping of the packing used in aac_filter_bank(): + packed[:, 2*j] = left[:, j] + packed[:, 2*j+1] = right[:, j] + """ + if frame_F.shape != (128, 16): + raise ValueError("ESH frame_F must have shape (128, 16).") + + left = np.empty((128, 8), dtype=np.float64) + right = np.empty((128, 8), dtype=np.float64) + for j in range(8): + left[:, j] = frame_F[:, 2 * j + 0] + right[:, j] = frame_F[:, 2 * j + 1] + return left, right + + +def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT: + """ + ESH synthesis for one channel. + + Parameters + ---------- + X_esh : FrameChannelF + MDCT coefficients for 8 short windows (expected shape: (128, 8)). + win_type : WinType + Window family ("KBD" or "SIN"). + + Returns + ------- + FrameChannelT + Time-domain channel contribution, shape (2048,). + This is already overlap-added internally for the 8 short blocks and + ready for OLA at the caller level. + """ + if X_esh.shape != (128, 8): + raise ValueError("X_esh must have shape (128, 8).") + + wS = _short_window(win_type) # (256,) + out = np.zeros(2048, dtype=np.float64) + + # Each short IMDCT returns 256 samples. Place them at: + # start = 448 + 128*j, j=0..7 (50% overlap) + for j in range(8): + seg = _imdct(X_esh[:, j]) * wS # (256,) + start = 448 + 128 * j + out[start:start + 256] += seg + + return out + + +# ----------------------------------------------------------------------------- +# Public Function prototypes (Level 1) +# ----------------------------------------------------------------------------- + +def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF: + """ + Filterbank stage (MDCT analysis). + + Parameters + ---------- + frame_T : FrameT + Time-domain frame, stereo, shape (2048, 2). + frame_type : FrameType + Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS"). + win_type : WinType + Window type ("KBD" or "SIN") used for the current frame. 
+ + Returns + ------- + frame_F : FrameF + Frequency-domain MDCT coefficients: + - If frame_type in {"OLS","LSS","LPS"}: array shape (1024, 2) + containing MDCT coefficients for both channels. + - If frame_type == "ESH": contains 8 subframes, each subframe has shape (128,2), + placed in columns according to subframe order, i.e. overall shape (128, 16). + """ + if frame_T.shape != (2048, 2): + raise ValueError("frame_T must have shape (2048, 2).") + + xL :FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False) + xR :FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False) + + if frame_type in ("OLS", "LSS", "LPS"): + w = _window_sequence(frame_type, win_type) # length 2048 + XL = _mdct(xL * w) # length 1024 + XR = _mdct(xR * w) # length 1024 + out = np.empty((1024, 2), dtype=np.float64) + out[:, 0] = XL + out[:, 1] = XR + return out + + if frame_type == "ESH": + Xl = _filter_bank_esh_channel(xL, win_type) # (128, 8) + Xr = _filter_bank_esh_channel(xR, win_type) # (128, 8) + + # Pack into (128, 16): each subframe as (128,2) placed in columns + out = np.empty((128, 16), dtype=np.float64) + for j in range(8): + out[:, 2 * j + 0] = Xl[:, j] + out[:, 2 * j + 1] = Xr[:, j] + return out + + raise ValueError(f"Invalid frame_type: {frame_type!r}") + + +def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT: + """ + Inverse filterbank (IMDCT synthesis). + + Parameters + ---------- + frame_F : FrameF + Frequency-domain MDCT coefficients as produced by filter_bank(). + frame_type : FrameType + Frame type ("OLS"|"LSS"|"ESH"|"LPS"). + win_type : WinType + Window type ("KBD" or "SIN"). + + Returns + ------- + frame_T : FrameT + Reconstructed time-domain frame, stereo, shape (2048, 2). 
+ """ + if frame_type in ("OLS", "LSS", "LPS"): + if frame_F.shape != (1024, 2): + raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).") + + w = _window_sequence(frame_type, win_type) + + xL = _imdct(frame_F[:, 0]) * w + xR = _imdct(frame_F[:, 1]) * w + + out = np.empty((2048, 2), dtype=np.float64) + out[:, 0] = xL + out[:, 1] = xR + return out + + if frame_type == "ESH": + if frame_F.shape != (128, 16): + raise ValueError("For ESH, frame_F must have shape (128, 16).") + + Xl, Xr = _unpack_esh(frame_F) + xL = _i_filter_bank_esh_channel(Xl, win_type) + xR = _i_filter_bank_esh_channel(Xr, win_type) + + out = np.empty((2048, 2), dtype=np.float64) + out[:, 0] = xL + out[:, 1] = xR + return out + + raise ValueError(f"Invalid frame_type: {frame_type!r}") diff --git a/source/core/aac_ssc.py b/source/core/aac_ssc.py new file mode 100644 index 0000000..926c854 --- /dev/null +++ b/source/core/aac_ssc.py @@ -0,0 +1,217 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Sequence Segmentation Control module +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Sequence Segmentation Control module (SSC). +# Selects and returns the frame type based on input parameters. 
+# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Dict, Tuple +from core.aac_types import FrameType, FrameT, FrameChannelT + +import numpy as np + +# ----------------------------------------------------------------------------- +# Private helpers for SSC +# ----------------------------------------------------------------------------- + +# See Table 1 in mm-2025-hw-v0.1.pdf +STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = { + ("OLS", "OLS"): "OLS", + ("OLS", "LSS"): "LSS", + ("OLS", "ESH"): "ESH", + ("OLS", "LPS"): "LPS", + ("LSS", "OLS"): "LSS", + ("LSS", "LSS"): "LSS", + ("LSS", "ESH"): "ESH", + ("LSS", "LPS"): "ESH", + ("ESH", "OLS"): "ESH", + ("ESH", "LSS"): "ESH", + ("ESH", "ESH"): "ESH", + ("ESH", "LPS"): "ESH", + ("LPS", "OLS"): "LPS", + ("LPS", "LSS"): "ESH", + ("LPS", "ESH"): "ESH", + ("LPS", "LPS"): "LPS", +} + + +def _detect_attack(next_frame_channel: FrameChannelT) -> bool: + """ + Detect whether the *next* frame (single channel) implies an attack, i.e. ESH + according to the assignment's criterion. + + Parameters + ---------- + next_frame_channel : FrameChannelT + One channel of next_frame_T (expected shape: (2048,)). + + Returns + ------- + bool + True if an attack is detected (=> next frame predicted ESH), else False. + + Notes + ----- + The criterion is implemented as described in the spec: + + 1) Apply the high-pass filter: + H(z) = (1 - z^-1) / (1 - 0.5 z^-1) + implemented in the time domain as: + y[n] = x[n] - x[n-1] + 0.5*y[n-1] + + 2) Split y into 16 segments of length 128 and compute segment energies s[l]. + + 3) Compute the ratio: + ds[l] = s[l] / s[l-1] + + 4) An attack exists if there exists l in {1..7} such that: + s[l] > 1e-3 and ds[l] > 10 + """ + # Local alias; expected to be a 1-D array of length 2048. + x = next_frame_channel + + # High-pass filter reference implementation (scalar recurrence). 
+ y = np.zeros_like(x) + prev_x = 0.0 + prev_y = 0.0 + for n in range(x.shape[0]): + xn = float(x[n]) + yn = (xn - prev_x) + 0.5 * prev_y + y[n] = yn + prev_x = xn + prev_y = yn + + # Segment energies over 16 blocks of 128 samples. + s = np.empty(16, dtype=np.float64) + for l in range(16): + a = l * 128 + b = (l + 1) * 128 + seg = y[a:b] + s[l] = float(np.sum(seg * seg)) + + # ds[l] for l>=1. For l=0 not defined, keep 0. + ds = np.zeros(16, dtype=np.float64) + eps = 1e-12 # Avoid division by zero without materially changing the logic. + for l in range(1, 16): + ds[l] = s[l] / max(s[l - 1], eps) + + # Spec: check l in {1..7}. + for l in range(1, 8): + if (s[l] > 1e-3) and (ds[l] > 10.0): + return True + + return False + + +def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType: + """ + Decide the current frame type for a single channel based on the previous + frame type and whether the next frame is predicted to be ESH. + + Rules (spec): + + - If prev is "LSS" => current is "ESH" + - If prev is "LPS" => current is "OLS" + - If prev is "OLS" => current is "LSS" if attack else "OLS" + - If prev is "ESH" => current is "ESH" if attack else "LPS" + + Parameters + ---------- + prev_frame_type : FrameType + Previous frame type (one of "OLS", "LSS", "ESH", "LPS"). + attack : bool + True if the next frame is predicted ESH for this channel. + + Returns + ------- + FrameType + The per-channel decision for the current frame. + + """ + if prev_frame_type == "LSS": + return "ESH" + if prev_frame_type == "LPS": + return "OLS" + if prev_frame_type == "OLS": + return "LSS" if attack else "OLS" + if prev_frame_type == "ESH": + return "ESH" if attack else "LPS" + + raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}") + + +def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType: + """ + Merge per-channel frame type decisions into one common frame type using + the stereo merge table from the spec. 
+ + Parameters + ---------- + ft_l : FrameType + Frame type decision for the left channel. + ft_r : FrameType + Frame type decision for the right channel. + + Returns + ------- + FrameType + The merged common frame type. + """ + try: + return STEREO_MERGE_TABLE[(ft_l, ft_r)] + except KeyError as e: + raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}") from e + + +# ----------------------------------------------------------------------------- +# Public Function prototypes (Level 1) +# ----------------------------------------------------------------------------- + +def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: + """ + Sequence Segmentation Control (SSC). + + Select and return the frame type for the current frame (i) based on: + - the current time-domain frame (stereo), + - the next time-domain frame (stereo), used for attack detection, + - the previous frame type. + + Parameters + ---------- + frame_T : FrameT + Current time-domain frame i (expected shape: (2048, 2)). + next_frame_T : FrameT + Next time-domain frame (i+1), used to decide transitions to/from ESH + (expected shape: (2048, 2)). + prev_frame_type : FrameType + Frame type chosen for the previous frame (i-1). + + Returns + ------- + FrameType + One of: "OLS", "LSS", "ESH", "LPS". + """ + if frame_T.shape != (2048, 2): + raise ValueError("frame_T must have shape (2048, 2).") + if next_frame_T.shape != (2048, 2): + raise ValueError("next_frame_T must have shape (2048, 2).") + + # Detect attack independently per channel on the next frame. + attack_l = _detect_attack(next_frame_T[:, 0]) + attack_r = _detect_attack(next_frame_T[:, 1]) + + # Decide per-channel type based on shared prev_frame_type. + ft_l = _decide_frame_type(prev_frame_type, attack_l) + ft_r = _decide_frame_type(prev_frame_type, attack_r) + + # Stereo merge as per the spec table. 
+ return _stereo_merge(ft_l, ft_r) diff --git a/source/core/aac_types.py b/source/core/aac_types.py new file mode 100644 index 0000000..8094163 --- /dev/null +++ b/source/core/aac_types.py @@ -0,0 +1,193 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Public Type Aliases +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module implements Public Type aliases +# +# ------------------------------------------------------------ +from __future__ import annotations + +from typing import List, Literal, TypeAlias, TypedDict +import numpy as np +from numpy.typing import NDArray + +# ----------------------------------------------------------------------------- +# Code enums (for readability; not intended to enforce shapes/lengths) +# ----------------------------------------------------------------------------- + +FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"] +""" +Frame type codes (AAC): +- "OLS": ONLY_LONG_SEQUENCE +- "LSS": LONG_START_SEQUENCE +- "ESH": EIGHT_SHORT_SEQUENCE +- "LPS": LONG_STOP_SEQUENCE +""" + +WinType: TypeAlias = Literal["KBD", "SIN"] +""" +Window type codes (AAC): +- "KBD": Kaiser-Bessel-Derived +- "SIN": sinusoid +""" + +ChannelKey: TypeAlias = Literal["chl", "chr"] +"""Channel dictionary keys used in Level 1 payloads.""" + + +# ----------------------------------------------------------------------------- +# Array “semantic” aliases +# +# Goal: communicate meaning (time/frequency/window, stereo/channel) without +# forcing strict shapes in the type system. +# ----------------------------------------------------------------------------- + +FloatArray: TypeAlias = NDArray[np.float64] +""" +Generic float64 NumPy array. + +Note: +- We standardize internal numeric computations to float64 for stability and + reproducibility. 
External I/O can still be float32, but we convert at the + boundaries. +""" + +Window: TypeAlias = FloatArray +""" +Time-domain window (weighting sequence), 1-D. + +Typical lengths in this assignment: +- Long: 2048 +- Short: 256 +- Window sequences for LSS/LPS are also 2048 + +Expected shape: (N,) +dtype: float64 +""" + +TimeSignal: TypeAlias = FloatArray +""" +Time-domain signal samples, typically 1-D. + +Examples: +- Windowed MDCT input: shape (N,) +- IMDCT output: shape (N,) + +dtype: float64 +""" + +StereoSignal: TypeAlias = FloatArray +""" +Time-domain stereo signal stream. + +Expected (typical) shape: (N, 2) +- axis 0: time samples +- axis 1: channels [L, R] + +dtype: float64 +""" + +MdctCoeffs: TypeAlias = FloatArray +""" +MDCT coefficient vector, typically 1-D. + +Examples: +- Long: shape (1024,) +- Short: shape (128,) + +dtype: float64 +""" + + +FrameT: TypeAlias = FloatArray +""" +Time-domain frame (stereo), as used by the filterbank input/output. + +Expected (typical) shape for stereo: (2048, 2) +- axis 0: time samples +- axis 1: channels [L, R] + +dtype: float64 +""" + +FrameChannelT: TypeAlias = FloatArray +""" +Time-domain single-channel frame. + +Expected (typical) shape: (2048,) + +dtype: float64 +""" + +FrameF: TypeAlias = FloatArray +""" +Frequency-domain frame (MDCT coefficients), stereo container. + +Typical shapes (Level 1): +- If frame_type in {"OLS","LSS","LPS"}: (1024, 2) +- If frame_type == "ESH": (128, 16) + +Rationale for ESH (128, 16): +- 8 short subframes per channel => 8 * 2 = 16 columns total +- Each short subframe per stereo is (128, 2), flattened into columns + in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R] + +dtype: float64 +""" + +FrameChannelF: TypeAlias = FloatArray +""" +Frequency-domain single-channel frame (MDCT coefficients). 
+ +Typical shapes (Level 1): +- If frame_type in {"OLS","LSS","LPS"}: (1024,) +- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) + +dtype: float64 +""" + + +# ----------------------------------------------------------------------------- +# Level 1 AAC sequence payload types +# ----------------------------------------------------------------------------- + +class AACChannelFrameF(TypedDict): + """ + Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1). + + Keys + ---- + frame_F: + The MDCT coefficients for ONE channel. + Typical shapes: + - ESH: (128, 8) (8 short subframes) + - else: (1024, ) + """ + frame_F: FrameChannelF + + +class AACSeq1Frame(TypedDict): + """ + One frame dictionary element of aac_seq_1 (Level 1). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF + chr: AACChannelFrameF + + +AACSeq1: TypeAlias = List[AACSeq1Frame] +""" +AAC sequence for Level 1: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" +""" diff --git a/source/core/tests/test_SSC.py b/source/core/tests/test_SSC.py new file mode 100644 index 0000000..91bcf21 --- /dev/null +++ b/source/core/tests/test_SSC.py @@ -0,0 +1,234 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Sequence Segmentation Control Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for Sequence Segmentation Control module (SSC). 
+# ------------------------------------------------------------ + +from __future__ import annotations + +import numpy as np + +from core.aac_ssc import aac_SSC +from core.aac_types import FrameT + +# ----------------------------------------------------------------------------- +# Helper fixtures for SSC +# ----------------------------------------------------------------------------- + +def _next_frame_no_attack() -> FrameT: + """ + Build a next_frame_T that must NOT trigger ESH detection. + + Uses exact zeros so all segment energies are zero and the condition + s[l] > 1e-3 cannot hold for any l. + """ + return np.zeros((2048, 2), dtype=np.float64) + + +def _next_frame_strong_attack( + *, + attack_left: bool, + attack_right: bool, + segment_l: int = 4, + baseline: float = 1e-6, + burst_amp: float = 1.0, +) -> FrameT: + """ + Build a next_frame_T (2048x2) that should trigger ESH detection on selected channels. + + Attack criterion (spec): + Attack exists if there exists l in {1..7} such that: + s[l] > 1e-3 and ds[l] > 10, + where s[l] is the energy of segment l (length 128) after high-pass filtering, + and ds[l] = s[l] / s[l-1]. + + Construction: + - A small baseline is added everywhere to avoid relying on the epsilon guard in ds, + keeping ds behavior stable/reproducible. + - A strong burst is added inside a chosen segment l in 1..7. + """ + if not (1 <= segment_l <= 7): + raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.") + + x = np.full((2048, 2), baseline, dtype=np.float64) + + a = segment_l * 128 + b = (segment_l + 1) * 128 + + if attack_left: + x[a:b, 0] += burst_amp + if attack_right: + x[a:b, 1] += burst_amp + + return x + + +def _next_frame_below_s_threshold( + *, + left: bool, + right: bool, + segment_l: int = 4, + impulse_amp: float = 0.01, +) -> FrameT: + """ + Construct a next_frame_T where s[l] is below 1e-3, so ESH must NOT be triggered, + even if the ratio ds[l] could be large. 
+ + We place a single impulse of amplitude 'impulse_amp' inside one segment. + Approx. segment energy: s[l] ~= impulse_amp^2. + + Example: + impulse_amp = 0.01 => s[l] ~= 1e-4 < 1e-3 + """ + if not (1 <= segment_l <= 7): + raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.") + + x = np.zeros((2048, 2), dtype=np.float64) + + idx = segment_l * 128 + 10 # inside segment l + if left: + x[idx, 0] = impulse_amp + if right: + x[idx, 1] = impulse_amp + + return x + + +# ----------------------------------------------------------------------------- +# 1) Fixed/mandatory cases (prev frame type forces current type) +# ----------------------------------------------------------------------------- + +def test_ssc_fixed_cases_prev_lss_and_lps() -> None: + """ + Spec: + - If prev was LSS => current MUST be ESH + - If prev was LPS => current MUST be OLS + independent of attack detection on (i+1). + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + + next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True) + + out1 = aac_SSC(frame_t, next_attack, "LSS") + assert out1 == "ESH" + + out2 = aac_SSC(frame_t, next_attack, "LPS") + assert out2 == "OLS" + + +# ----------------------------------------------------------------------------- +# 2) Cases requiring next-frame ESH prediction (attack computation) +# ----------------------------------------------------------------------------- + +def test_prev_ols_next_not_esh_returns_ols() -> None: + """ + If prev=OLS, current is: + - LSS iff (i+1) is predicted ESH + - else OLS + Here: no attack => expect OLS. 
+ """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_no_attack() + + out = aac_SSC(frame_t, next_t, "OLS") + assert out == "OLS" + + +def test_prev_ols_next_esh_both_channels_returns_lss() -> None: + """ + prev=OLS and next predicted ESH for both channels: + per-channel: LSS, LSS + merged: LSS + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) + + out = aac_SSC(frame_t, next_t, "OLS") + assert out == "LSS" + + +def test_prev_ols_next_esh_one_channel_returns_lss() -> None: + """ + prev=OLS: + - one channel predicts ESH => LSS + - other channel predicts not ESH => OLS + Merge table: OLS + LSS => LSS (either side). + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + + next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) + out1 = aac_SSC(frame_t, next1_t, "OLS") + assert out1 == "LSS" + + next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) + out2 = aac_SSC(frame_t, next2_t, "OLS") + assert out2 == "LSS" + + +def test_prev_esh_next_esh_both_channels_returns_esh() -> None: + """ + prev=ESH and next predicted ESH for both channels: + per-channel: ESH, ESH + merged: ESH + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) + + out = aac_SSC(frame_t, next_t, "ESH") + assert out == "ESH" + + +def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None: + """ + prev=ESH and next not predicted ESH for both channels: + per-channel: LPS, LPS + merged: LPS + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_no_attack() + + out = aac_SSC(frame_t, next_t, "ESH") + assert out == "LPS" + + +def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None: + """ + prev=ESH: + - one channel predicts ESH => ESH + - other channel predicts not ESH => LPS + Merge table: ESH + LPS => ESH 
(either side). + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + + next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) + out1 = aac_SSC(frame_t, next1_t, "ESH") + assert out1 == "ESH" + + next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) + out2 = aac_SSC(frame_t, next2_t, "ESH") + assert out2 == "ESH" + + +def test_threshold_s_must_exceed_1e_3() -> None: + """ + Spec: next frame is predicted ESH only if: + s[l] > 1e-3 AND ds[l] > 10 + for some l in 1..7. + + This test checks the necessity of the s[l] threshold: + - Create a frame with s[l] ~= 1e-4 < 1e-3 (single impulse with amp 0.01). + - Expect: not classified as ESH -> for prev=OLS return OLS. + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01) + + out = aac_SSC(frame_t, next_t, "OLS") + assert out == "OLS" diff --git a/source/level_1/tests/test_aac_level1.py b/source/core/tests/test_aac_coder_decoder.py similarity index 62% rename from source/level_1/tests/test_aac_level1.py rename to source/core/tests/test_aac_coder_decoder.py index 2757cdb..e8bb669 100644 --- a/source/level_1/tests/test_aac_level1.py +++ b/source/core/tests/test_aac_coder_decoder.py @@ -1,3 +1,16 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - AAC Coder/DecoderTests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for AAC Coder/Decoder module. 
+# ------------------------------------------------------------ from __future__ import annotations from pathlib import Path @@ -6,18 +19,36 @@ import numpy as np import pytest import soundfile as sf -from level_1.level_1 import aac_coder_1, i_aac_coder_1 +from core.aac_coder import aac_coder_1 +from core.aac_decoder import aac_decoder_1 +from core.aac_types import * + # Helper "fixtures" for aac_coder_1 / i_aac_coder_1 # ----------------------------------------------------------------------------- -def _snr_db(x_ref: np.ndarray, x_hat: np.ndarray) -> float: +def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: """ Compute overall SNR (dB) over all samples and channels after aligning lengths. + + Parameters + ---------- + x_ref : StereoSignal + Reference signal, shape (N, 2) typical. + x_hat : StereoSignal + Reconstructed signal, shape (M, 2) typical. + + Returns + ------- + float + SNR in dB. + - Returns +inf if noise power is zero. + - Returns -inf if signal power is zero. """ x_ref = np.asarray(x_ref, dtype=np.float64) x_hat = np.asarray(x_hat, dtype=np.float64) + # Be conservative: align lengths and common channels. if x_ref.ndim == 1: x_ref = x_ref.reshape(-1, 1) if x_hat.ndim == 1: @@ -36,7 +67,7 @@ def _snr_db(x_ref: np.ndarray, x_hat: np.ndarray) -> float: if pn <= 0.0: return float("inf") if ps <= 0.0: - return -float("inf") + return float("-inf") return float(10.0 * np.log10(ps / pn)) @@ -49,9 +80,9 @@ def tmp_stereo_wav(tmp_path: Path) -> Path: rng = np.random.default_rng(123) fs = 48000 - # ~1 second of audio, keep small for test speed + # ~1 second of audio (kept small for test speed). 
n = fs - x = rng.normal(size=(n, 2)).astype(np.float64) + x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64) wav_path = tmp_path / "in.wav" sf.write(str(wav_path), x, fs) @@ -63,7 +94,7 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: Module-level contract test: Ensure aac_seq_1 follows the expected schema and per-frame shapes. """ - aac_seq = aac_coder_1(tmp_stereo_wav) + aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav) assert isinstance(aac_seq, list) assert len(aac_seq) > 0 @@ -88,8 +119,8 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: assert "frame_F" in fr["chl"] assert "frame_F" in fr["chr"] - chl_f = np.asarray(fr["chl"]["frame_F"]) - chr_f = np.asarray(fr["chr"]["frame_F"]) + chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) + chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) if frame_type == "ESH": assert chl_f.shape == (128, 8) @@ -101,23 +132,25 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None: """ - End-to-end module test: + End-to-end test: Encode + decode and check SNR is very high (numerical-noise only). - Threshold is intentionally loose to avoid fragility. + + The threshold is intentionally loose to avoid fragility across platforms/BLAS. 
""" x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True) - assert fs == 48000 + x_ref = np.asarray(x_ref, dtype=np.float64) + assert int(fs) == 48000 out_wav = tmp_path / "out.wav" aac_seq = aac_coder_1(tmp_stereo_wav) - x_hat = i_aac_coder_1(aac_seq, out_wav) + x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav) # Basic sanity: output file exists and is readable assert out_wav.exists() x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True) - assert fs_hat == 48000 + assert int(fs_hat) == 48000 - # SNR computed against the array returned by i_aac_coder_1 (should match file, but not required) + # SNR against returned array (file should match closely, but we do not require it here). snr = _snr_db(x_ref, x_hat) assert snr > 80.0 diff --git a/source/core/tests/test_filterbank.py b/source/core/tests/test_filterbank.py new file mode 100644 index 0000000..ad2bd45 --- /dev/null +++ b/source/core/tests/test_filterbank.py @@ -0,0 +1,269 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Filterbank Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for Filterbank module. +# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Sequence +import pytest + +from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank +from core.aac_types import * + +# Helper fixtures for filterbank +# ----------------------------------------------------------------------------- + +def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal: + """ + Analyze-synthesize each frame and overlap-add with hop=1024. + + Parameters + ---------- + x : StereoSignal + Input stereo stream, expected shape (N, 2). + frame_types : Sequence[FrameType] + Length K sequence of frame types for frames starting at i*1024. 
+ win_type : WinType + Window type ("SIN" or "KBD"). + + Returns + ------- + StereoSignal + Reconstructed stereo stream, same shape as x (N, 2). + """ + hop = 1024 + win = 2048 + K = len(frame_types) + + y: StereoSignal = np.zeros_like(x, dtype=np.float64) + + for i in range(K): + start = i * hop + frame_t: FrameT = x[start:start + win, :] + frame_f: FrameF = aac_filter_bank(frame_t, frame_types[i], win_type) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_types[i], win_type) + y[start:start + win, :] += frame_t_hat + + return y + + +def _snr_db(x: StereoSignal, y: StereoSignal) -> float: + """ + Compute SNR in dB over all samples/channels. + """ + err = x - y + ps = float(np.sum(x * x)) + pn = float(np.sum(err * err)) + if pn <= 0.0: + return float("inf") + if ps <= 0.0: + return float("-inf") + return 10.0 * float(np.log10(ps / pn)) + + +# ----------------------------------------------------------------------------- +# Forward filterbank tests +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None: + """ + Contract test: for OLS/LSS/LPS, aac_filter_bank returns shape (1024, 2). + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_f = aac_filter_bank(frame_t, frame_type, win_type) + assert frame_f.shape == (1024, 2) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_shapes_esh(win_type: WinType) -> None: + """ + Contract test: for ESH, aac_filter_bank returns shape (128, 16). 
+ """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_f = aac_filter_bank(frame_t, "ESH", win_type) + assert frame_f.shape == (128, 16) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None: + """ + Behavior test: for OLS (representative long-sequence), channels are independent. + If right channel is zero and left is random, right spectrum should be near zero. + """ + rng = np.random.default_rng(0) + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_t[:, 0] = rng.normal(size=2048) + + frame_f = aac_filter_bank(frame_t, "OLS", win_type) + + assert np.max(np.abs(frame_f[:, 1])) < 1e-9 + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_channel_isolation_esh(win_type: WinType) -> None: + """ + Behavior test: for ESH, channels are independent. + If right channel is zero and left is random, all odd columns (right) should be near zero. + """ + rng = np.random.default_rng(1) + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_t[:, 0] = rng.normal(size=2048) + + frame_f = aac_filter_bank(frame_t, "ESH", win_type) + + right_cols = frame_f[:, 1::2] # columns 1,3,5,...,15 + assert np.max(np.abs(right_cols)) < 1e-9 + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None: + """ + Spec-driven behavior test: + ESH uses only the central region [448, 1600), split into 8 overlapping + windows of length 256 with 50% overlap. + + Therefore, changing samples outside [448, 1600) must not affect the output. 
+ """ + rng = np.random.default_rng(2) + + frame_a: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_b: FrameT = np.zeros((2048, 2), dtype=np.float64) + + center = rng.normal(size=(1152, 2)) + frame_a[448:1600, :] = center + frame_b[448:1600, :] = center + + frame_b[0:448, :] = rng.normal(size=(448, 2)) + frame_b[1600:2048, :] = rng.normal(size=(448, 2)) + + fa = aac_filter_bank(frame_a, "ESH", win_type) + fb = aac_filter_bank(frame_b, "ESH", win_type) + + # Use a tiny tolerance to avoid flaky failures due to floating-point minutiae. + np.testing.assert_allclose(fa, fb, rtol=0.0, atol=1e-12) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_output_is_finite(win_type: WinType) -> None: + """ + Sanity test: output must not contain NaN or inf for representative cases. + """ + rng = np.random.default_rng(3) + frame_t: FrameT = rng.normal(size=(2048, 2)).astype(np.float64) + + for frame_type in ("OLS", "LSS", "ESH", "LPS"): + frame_f = aac_filter_bank(frame_t, frame_type, win_type) + assert np.isfinite(frame_f).all() + + +# ----------------------------------------------------------------------------- +# Reverse i_filterbank tests +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None: + """ + Contract test: for OLS/LSS/LPS, aac_i_filter_bank returns shape (2048, 2). + """ + frame_f: FrameF = np.zeros((1024, 2), dtype=np.float64) + for frame_type in ("OLS", "LSS", "LPS"): + frame_t = aac_i_filter_bank(frame_f, frame_type, win_type) + assert frame_t.shape == (2048, 2) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ifilterbank_shapes_esh(win_type: WinType) -> None: + """ + Contract test: for ESH, aac_i_filter_bank returns shape (2048, 2). 
+ """ + frame_f: FrameF = np.zeros((128, 16), dtype=np.float64) + frame_t = aac_i_filter_bank(frame_f, "ESH", win_type) + assert frame_t.shape == (2048, 2) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None: + """ + Sanity test: per-frame analysis+synthesis must produce finite outputs. + """ + rng = np.random.default_rng(0) + frame_t: FrameT = rng.normal(size=(2048, 2)).astype(np.float64) + + for frame_type in ("OLS", "LSS", "ESH", "LPS"): + frame_f = aac_filter_bank(frame_t, frame_type, win_type) + frame_t_hat = aac_i_filter_bank(frame_f, frame_type, win_type) + assert np.isfinite(frame_t_hat).all() + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None: + """ + Module-level test: + OLS analysis+synthesis with hop=1024 must reconstruct with high SNR + in the steady-state region. + """ + rng = np.random.default_rng(1) + + K = 6 + N = 1024 * (K + 1) + x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) + + y = _ola_reconstruct(x, ["OLS"] * K, win_type) + + a = 1024 + b = N - 1024 + snr = _snr_db(x[a:b, :], y[a:b, :]) + assert snr > 50.0 + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None: + """ + Module-level test: + ESH analysis+synthesis with hop=1024 must reconstruct with high SNR + in the steady-state region. 
+ """ + rng = np.random.default_rng(2) + + K = 6 + N = 1024 * (K + 1) + x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) + + y = _ola_reconstruct(x, ["ESH"] * K, win_type) + + a = 1024 + b = N - 1024 + snr = _snr_db(x[a:b, :], y[a:b, :]) + assert snr > 45.0 + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None: + """ + Transition sequence test matching the windowing logic: + OLS -> LSS -> ESH -> LPS -> OLS -> OLS + """ + rng = np.random.default_rng(3) + + frame_types: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"] + K = len(frame_types) + N = 1024 * (K + 1) + x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) + + y = _ola_reconstruct(x, frame_types, win_type) + + a = 1024 + b = N - 1024 + snr = _snr_db(x[a:b, :], y[a:b, :]) + assert snr > 40.0 diff --git a/source/level_1/tests/test_filterbank_internal.py b/source/core/tests/test_filterbank_internal.py similarity index 50% rename from source/level_1/tests/test_filterbank_internal.py rename to source/core/tests/test_filterbank_internal.py index fbf09c2..e092ad1 100644 --- a/source/level_1/tests/test_filterbank_internal.py +++ b/source/core/tests/test_filterbank_internal.py @@ -1,16 +1,33 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Filterbank internal (mdct) Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for Filterbank internal MDCT/IMDCT functionality. 
+# ------------------------------------------------------------ +from __future__ import annotations + import numpy as np import pytest -from level_1.level_1 import _imdct, _mdct +from core.aac_filterbank import _imdct, _mdct +from core.aac_types import FloatArray, TimeSignal, MdctCoeffs -# Helper "fixtures" for filterbank internals (MDCT/IMDCT) -# ----------------------------------------------------------------------------- -def _assert_allclose(a: np.ndarray, b: np.ndarray, *, rtol: float, atol: float) -> None: - # Helper for consistent tolerances across tests. +def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None: + """ + Helper for consistent tolerances across tests. + """ np.testing.assert_allclose(a, b, rtol=rtol, atol=atol) -def _estimate_gain(y: np.ndarray, x: np.ndarray) -> float: + +def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float: """ Estimate scalar gain g such that y ~= g*x in least-squares sense. """ @@ -28,18 +45,18 @@ def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None: Consistency test in coefficient domain: mdct(imdct(X)) ~= g * X - For our chosen (non-orthonormal) scaling, g is expected to be close to 2. + For the chosen (non-orthonormal) scaling, g is expected to be close to 2. 
""" rng = np.random.default_rng(0) K = N // 2 - X = rng.normal(size=K).astype(np.float64) - x = _imdct(X) - X_hat = _mdct(x) + X: MdctCoeffs = rng.normal(size=K).astype(np.float64) + x: TimeSignal = _imdct(X) + X_hat: MdctCoeffs = _mdct(x) g = _estimate_gain(X_hat, X) _assert_allclose(X_hat, g * X, rtol=tolerance, atol=tolerance) - _assert_allclose(np.array([g]), np.array([2.0]), rtol=tolerance, atol=tolerance) + _assert_allclose(np.array([g], dtype=np.float64), np.array([2.0], dtype=np.float64), rtol=tolerance, atol=tolerance) @pytest.mark.parametrize("N", [256, 2048]) @@ -47,18 +64,16 @@ def test_mdct_linearity(N: int) -> None: """ Linearity test: mdct(a*x + b*y) == a*mdct(x) + b*mdct(y) - - This should hold up to numerical error. """ rng = np.random.default_rng(1) - x = rng.normal(size=N).astype(np.float64) - y = rng.normal(size=N).astype(np.float64) + x: TimeSignal = rng.normal(size=N).astype(np.float64) + y: TimeSignal = rng.normal(size=N).astype(np.float64) a = 0.37 b = -1.12 - left = _mdct(a * x + b * y) - right = a * _mdct(x) + b * _mdct(y) + left: MdctCoeffs = _mdct(a * x + b * y) + right: MdctCoeffs = a * _mdct(x) + b * _mdct(y) _assert_allclose(left, right, rtol=tolerance, atol=tolerance) @@ -72,14 +87,14 @@ def test_imdct_linearity(N: int) -> None: rng = np.random.default_rng(2) K = N // 2 - X = rng.normal(size=K).astype(np.float64) - Y = rng.normal(size=K).astype(np.float64) + X: MdctCoeffs = rng.normal(size=K).astype(np.float64) + Y: MdctCoeffs = rng.normal(size=K).astype(np.float64) a = -0.5 b = 2.0 - left = _imdct(a * X + b * Y) - right = a * _imdct(X) + b * _imdct(Y) + left: TimeSignal = _imdct(a * X + b * Y) + right: TimeSignal = a * _imdct(X) + b * _imdct(Y) _assert_allclose(left, right, rtol=tolerance, atol=tolerance) @@ -92,8 +107,8 @@ def test_mdct_imdct_outputs_are_finite(N: int) -> None: rng = np.random.default_rng(3) K = N // 2 - x = rng.normal(size=N).astype(np.float64) - X = rng.normal(size=K).astype(np.float64) + x: TimeSignal = 
rng.normal(size=N).astype(np.float64) + X: MdctCoeffs = rng.normal(size=K).astype(np.float64) X1 = _mdct(x) x1 = _imdct(X) diff --git a/source/level_1/core/aac_coder.py b/source/level_1/core/aac_coder.py new file mode 100644 index 0000000..837ac34 --- /dev/null +++ b/source/level_1/core/aac_coder.py @@ -0,0 +1,198 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - AAC Coder (Core) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 1 AAC encoder orchestration. +# Keeps the same functional behavior as the original level_1 implementation: +# - Reads WAV via soundfile +# - Validates stereo and 48 kHz +# - Frames into 2048 samples with hop=1024 and zero padding at both ends +# - SSC decision uses next-frame attack detection +# - Filterbank analysis (MDCT) +# - Stores per-channel spectra in AACSeq1 schema: +# * ESH: (128, 8) +# * else: (1024, 1) +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Union + +import soundfile as sf + +from core.aac_configuration import WIN_TYPE +from core.aac_filterbank import aac_filter_bank +from core.aac_ssc import aac_SSC +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Public helpers (useful for level_x demo wrappers) +# ----------------------------------------------------------------------------- + +def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]: + """ + Read a WAV file using soundfile and validate the Level-1 assumptions. + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename. + + Returns + ------- + x : StereoSignal (np.ndarray) + Stereo samples as float64, shape (N, 2). + fs : int + Sampling rate (Hz). Must be 48000. 
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split the stereo FrameF returned by aac_filter_bank() into per-channel
    arrays as required by the Level-1 AACSeq1 schema.

    Both returned arrays are always freshly allocated float64 copies, so the
    stored sequence never aliases the caller's ``frame_f`` buffer.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        Output of aac_filter_bank():
        - If frame_type != "ESH": shape (1024, 2)
        - If frame_type == "ESH": shape (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]

    Returns
    -------
    chl_f : FrameChannelF
        Left channel coefficients:
        - ESH: shape (128, 8)
        - else: shape (1024, 1)
    chr_f : FrameChannelF
        Right channel coefficients:
        - ESH: shape (128, 8)
        - else: shape (1024, 1)

    Raises
    ------
    ValueError
        If frame_f does not have the shape implied by frame_type.
    """
    coeffs = np.asarray(frame_f, dtype=np.float64)

    if frame_type == "ESH":
        if coeffs.shape != (128, 16):
            raise ValueError("For ESH, frame_f must have shape (128, 16).")

        # Even columns hold the left sub-frames, odd columns the right ones.
        return coeffs[:, 0::2].copy(), coeffs[:, 1::2].copy()

    # Non-ESH: store as (1024, 1) as required by the original Level-1 schema.
    if coeffs.shape != (1024, 2):
        raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")

    # Width-1 slices keep the column axis; copy() detaches them from frame_f.
    return coeffs[:, 0:1].copy(), coeffs[:, 1:2].copy()
+ raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop:start + hop + win, :] + + # Ensure next_t is always (2048, 2) by zero-padding at the tail. + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_SSC(frame_t, next_t, prev_frame_type) + frame_f = aac_filter_bank(frame_t, frame_type, win_type) + + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) + + aac_seq.append({ + "frame_type": frame_type, + "win_type": win_type, + "chl": {"frame_F": chl_f}, + "chr": {"frame_F": chr_f}, + }) + + prev_frame_type = frame_type + + return aac_seq diff --git a/source/level_1/core/aac_configuration.py b/source/level_1/core/aac_configuration.py new file mode 100644 index 0000000..262e884 --- /dev/null +++ b/source/level_1/core/aac_configuration.py @@ -0,0 +1,22 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Configuration +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module contains the global configurations +# +# ------------------------------------------------------------ +from __future__ import annotations + +# Imports +from core.aac_types import WinType + +# Window type +# Options: "SIN", "KBD" +WIN_TYPE: WinType = "SIN" \ No newline at end of file diff --git a/source/level_1/core/aac_decoder.py b/source/level_1/core/aac_decoder.py new file mode 100644 index 0000000..eb30011 --- /dev/null +++ b/source/level_1/core/aac_decoder.py @@ -0,0 +1,166 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Inverse AAC Coder (Core) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 1 AAC decoder 
orchestration (inverse of aac_coder_1()). +# Keeps the same functional behavior as the original level_1 implementation: +# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank() +# - IMDCT synthesis per frame +# - Overlap-add with hop=1024 +# - Remove encoder boundary padding: hop at start and hop at end +# +# Note: +# This core module returns the reconstructed samples. Writing to disk is kept +# in level_x demos. +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Union + +import soundfile as sf + +from core.aac_filterbank import aac_i_filter_bank +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Public helpers (useful for level_x demo wrappers) +# ----------------------------------------------------------------------------- + +def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: + """ + Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo + FrameF container expected by aac_i_filter_bank(). + + Parameters + ---------- + frame_type : FrameType + "OLS" | "LSS" | "ESH" | "LPS". + chl_f : FrameChannelF + Left channel coefficients: + - ESH: (128, 8) + - else: (1024, 1) + chr_f : FrameChannelF + Right channel coefficients: + - ESH: (128, 8) + - else: (1024, 1) + + Returns + ------- + FrameF + Stereo coefficients: + - ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7] + - else: (1024, 2) + """ + if frame_type == "ESH": + if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): + raise ValueError("ESH channel frame_F must have shape (128, 8).") + + frame_f = np.empty((128, 16), dtype=np.float64) + for j in range(8): + frame_f[:, 2 * j + 0] = chl_f[:, j] + frame_f[:, 2 * j + 1] = chr_f[:, j] + return frame_f + + # Non-ESH: expected (1024, 1) per channel in Level-1 schema. 
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Remove the boundary padding that the Level-1 encoder adds:
    hop samples at start and hop samples at end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024). Must be non-negative; hop == 0
        returns the whole stream.

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).

    Raises
    ------
    ValueError
        If hop is negative or y_pad is too short to unpad.
    """
    if hop < 0:
        raise ValueError("hop must be non-negative.")

    n_pad = y_pad.shape[0]
    if n_pad < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")

    # Use an explicit end index: y_pad[hop:-hop] would be empty for hop == 0,
    # because -0 is interpreted as index 0.
    return y_pad[hop:n_pad - hop, :]
+ """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_1) + + # Output includes the encoder padding region, so we reconstruct the full padded stream. + # For K frames: last frame starts at (K-1)*hop and spans win, + # so total length = (K-1)*hop + win. + n_pad = (K - 1) * hop + win + y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) + + for i, fr in enumerate(aac_seq_1): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) + chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) + + frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) + + start = i * hop + y_pad[start:start + win, :] += frame_t_hat + + y: StereoSignal = aac_remove_padding(y_pad, hop=hop) + + # Level 1 assumption: 48 kHz output. + sf.write(str(filename_out), y, 48000) + + return y diff --git a/source/level_1/core/aac_filterbank.py b/source/level_1/core/aac_filterbank.py new file mode 100644 index 0000000..60eb9c2 --- /dev/null +++ b/source/level_1/core/aac_filterbank.py @@ -0,0 +1,454 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Filterbank module +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking +# +# ------------------------------------------------------------ +from __future__ import annotations + +from core.aac_types import * + +from scipy.signal.windows import kaiser + +# Private helpers for Filterbank +# ------------------------------------------------------------ + +def _sin_window(N: int) -> Window: + """ + Build a sinusoidal (SIN) window of length N. 
+ + The AAC sinusoid window is: + w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N + + Parameters + ---------- + N : int + Window length in samples. + + Returns + ------- + Window + 1-D array of shape (N, ) with dtype float64. + """ + n = np.arange(N, dtype=np.float64) + return np.sin((np.pi / N) * (n + 0.5)) + + +def _kbd_window(N: int, alpha: float) -> Window: + """ + Build a Kaiser-Bessel-Derived (KBD) window of length N. + + This follows the standard KBD construction used in AAC: + 1) Build a Kaiser kernel of length (N/2 + 1). + 2) Form the left half by cumulative summation, normalization, and sqrt. + 3) Mirror the left half to form the right half (symmetric full-length window). + + Notes + ----- + - N must be even (AAC uses N=2048 for long and N=256 for short). + - The assignment specifies alpha=6 for long windows and alpha=4 for short windows. + - The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context. + + Parameters + ---------- + N : int + Window length in samples (must be even). + alpha : float + KBD alpha parameter. + + Returns + ------- + Window + 1-D array of shape (N,) with dtype float64. + """ + half = N // 2 + + # Kaiser kernel length: half + 1 samples (0 .. half) + # beta = pi * alpha per the usual correspondence with the ISO definition + kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64) + + csum = np.cumsum(kernel) + denom = csum[-1] + + w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1 + w_right = w_left[::-1] # mirror for second half + + return np.concatenate([w_left, w_right]) + + +def _long_window(win_type: WinType) -> Window: + """ + Return the long AAC window (length 2048) for the selected window family. + + Parameters + ---------- + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (2048,) with dtype float64. 
+ """ + if win_type == "SIN": + return _sin_window(2048) + if win_type == "KBD": + # Assignment-specific alpha values + return _kbd_window(2048, alpha=6.0) + raise ValueError(f"Invalid win_type: {win_type!r}") + + +def _short_window(win_type: WinType) -> Window: + """ + Return the short AAC window (length 256) for the selected window family. + + Parameters + ---------- + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (256,) with dtype float64. + """ + if win_type == "SIN": + return _sin_window(256) + if win_type == "KBD": + # Assignment-specific alpha values + return _kbd_window(256, alpha=4.0) + raise ValueError(f"Invalid win_type: {win_type!r}") + + +def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window: + """ + Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS. + + In this assignment we assume a single window family is used globally + (no mixed KBD/SIN halves). Therefore, both the long and short windows + are drawn from the same family. + + For frame_type: + - "OLS": return the long window Wl (2048). + - "LSS": construct [Wl_left(1024), ones(448), Ws_right(128), zeros(448)]. + - "LPS": construct [zeros(448), Ws_left(128), ones(448), Wl_right(1024)]. + + Parameters + ---------- + frame_type : FrameType + One of "OLS", "LSS", "LPS". + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (2048,) with dtype float64. 
+ """ + wL = _long_window(win_type) # length 2048 + wS = _short_window(win_type) # length 256 + + if frame_type == "OLS": + return wL + + if frame_type == "LSS": + # 0..1023: left half of long window + # 1024..1471: ones (448 samples) + # 1472..1599: right half of short window (128 samples) + # 1600..2047: zeros (448 samples) + out = np.zeros(2048, dtype=np.float64) + out[0:1024] = wL[0:1024] + out[1024:1472] = 1.0 + out[1472:1600] = wS[128:256] + out[1600:2048] = 0.0 + return out + + if frame_type == "LPS": + # 0..447: zeros (448) + # 448..575: left half of short window (128) + # 576..1023: ones (448) + # 1024..2047: right half of long window (1024) + out = np.zeros(2048, dtype=np.float64) + out[0:448] = 0.0 + out[448:576] = wS[0:128] + out[576:1024] = 1.0 + out[1024:2048] = wL[1024:2048] + return out + + raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}") + + +def _mdct(s: TimeSignal) -> MdctCoeffs: + """ + MDCT (direct form) as specified in the assignment. + + Parameters + ---------- + s : TimeSignal + Windowed time samples, 1-D array of length N (N = 2048 or 256). + + Returns + ------- + MdctCoeffs + MDCT coefficients, 1-D array of length N/2. + + Definition + ---------- + X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)), + where n0 = (N/2 + 1)/2. + """ + s = np.asarray(s, dtype=np.float64).reshape(-1) + N = int(s.shape[0]) + if N not in (2048, 256): + raise ValueError("MDCT input length must be 2048 or 256.") + + n0 = (N / 2.0 + 1.0) / 2.0 + n = np.arange(N, dtype=np.float64) + n0 + k = np.arange(N // 2, dtype=np.float64) + 0.5 + + C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2) + X = 2.0 * (s @ C) # (N/2,) + return X + + +def _imdct(X: MdctCoeffs) -> TimeSignal: + """ + IMDCT (direct form) as specified in the assignment. + + Parameters + ---------- + X : MdctCoeffs + MDCT coefficients, 1-D array of length K (K = 1024 or 128). 
+ + Returns + ------- + TimeSignal + Reconstructed time samples, 1-D array of length N = 2K. + + Definition + ---------- + s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)), + where n0 = (N/2 + 1)/2. + """ + X = np.asarray(X, dtype=np.float64).reshape(-1) + K = int(X.shape[0]) + if K not in (1024, 128): + raise ValueError("IMDCT input length must be 1024 or 128.") + + N = 2 * K + n0 = (N / 2.0 + 1.0) / 2.0 + + n = np.arange(N, dtype=np.float64) + n0 + k = np.arange(K, dtype=np.float64) + 0.5 + + C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K) + s = (2.0 / N) * (C @ X) # (N,) + return s + + +def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF: + """ + ESH analysis for one channel. + + Parameters + ---------- + x_ch : FrameChannelT + Time-domain channel frame (expected shape: (2048,)). + win_type : WinType + Window family ("KBD" or "SIN"). + + Returns + ------- + FrameChannelF + Array of shape (128, 8). Column j contains the 128 MDCT coefficients + of the j-th short window. + """ + wS = _short_window(win_type) # (256,) + X_esh = np.empty((128, 8), dtype=np.float64) + + # ESH subwindows are taken from the central region: + # start positions: 448 + 128*j, j = 0..7 + for j in range(8): + start = 448 + 128 * j + seg = x_ch[start:start + 256] * wS # (256,) + X_esh[:, j] = _mdct(seg) # (128,) + + return X_esh + + +def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]: + """ + Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8). + + Parameters + ---------- + frame_F : FrameF + Packed ESH spectrum (expected shape: (128, 16)). + + Returns + ------- + left : FrameChannelF + Left channel spectrum, shape (128, 8). + right : FrameChannelF + Right channel spectrum, shape (128, 8). 
+ + Notes + ----- + Inverse mapping of the packing used in aac_filter_bank(): + packed[:, 2*j] = left[:, j] + packed[:, 2*j+1] = right[:, j] + """ + if frame_F.shape != (128, 16): + raise ValueError("ESH frame_F must have shape (128, 16).") + + left = np.empty((128, 8), dtype=np.float64) + right = np.empty((128, 8), dtype=np.float64) + for j in range(8): + left[:, j] = frame_F[:, 2 * j + 0] + right[:, j] = frame_F[:, 2 * j + 1] + return left, right + + +def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT: + """ + ESH synthesis for one channel. + + Parameters + ---------- + X_esh : FrameChannelF + MDCT coefficients for 8 short windows (expected shape: (128, 8)). + win_type : WinType + Window family ("KBD" or "SIN"). + + Returns + ------- + FrameChannelT + Time-domain channel contribution, shape (2048,). + This is already overlap-added internally for the 8 short blocks and + ready for OLA at the caller level. + """ + if X_esh.shape != (128, 8): + raise ValueError("X_esh must have shape (128, 8).") + + wS = _short_window(win_type) # (256,) + out = np.zeros(2048, dtype=np.float64) + + # Each short IMDCT returns 256 samples. Place them at: + # start = 448 + 128*j, j=0..7 (50% overlap) + for j in range(8): + seg = _imdct(X_esh[:, j]) * wS # (256,) + start = 448 + 128 * j + out[start:start + 256] += seg + + return out + + +# ----------------------------------------------------------------------------- +# Public Function prototypes (Level 1) +# ----------------------------------------------------------------------------- + +def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF: + """ + Filterbank stage (MDCT analysis). + + Parameters + ---------- + frame_T : FrameT + Time-domain frame, stereo, shape (2048, 2). + frame_type : FrameType + Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS"). + win_type : WinType + Window type ("KBD" or "SIN") used for the current frame. 
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis).

    Parameters
    ----------
    frame_F : FrameF
        Frequency-domain MDCT coefficients as produced by aac_filter_bank():
        (1024, 2) for "OLS"/"LSS"/"LPS", (128, 16) for "ESH".
    frame_type : FrameType
        Frame type ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Reconstructed time-domain frame, stereo, shape (2048, 2).

    Raises
    ------
    ValueError
        If frame_type is unknown or frame_F has the wrong shape for it.
    """
    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")

        spec_left, spec_right = _unpack_esh(frame_F)
        left_t = _i_filter_bank_esh_channel(spec_left, win_type)
        right_t = _i_filter_bank_esh_channel(spec_right, win_type)
    elif frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")

        # The same window sequence is used for analysis and synthesis.
        window = _window_sequence(frame_type, win_type)
        left_t = _imdct(frame_F[:, 0]) * window
        right_t = _imdct(frame_F[:, 1]) * window
    else:
        raise ValueError(f"Invalid frame_type: {frame_type!r}")

    frame_T = np.empty((2048, 2), dtype=np.float64)
    frame_T[:, 0] = left_t
    frame_T[:, 1] = right_t
    return frame_T
+# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Dict, Tuple +from core.aac_types import FrameType, FrameT, FrameChannelT + +import numpy as np + +# ----------------------------------------------------------------------------- +# Private helpers for SSC +# ----------------------------------------------------------------------------- + +# See Table 1 in mm-2025-hw-v0.1.pdf +STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = { + ("OLS", "OLS"): "OLS", + ("OLS", "LSS"): "LSS", + ("OLS", "ESH"): "ESH", + ("OLS", "LPS"): "LPS", + ("LSS", "OLS"): "LSS", + ("LSS", "LSS"): "LSS", + ("LSS", "ESH"): "ESH", + ("LSS", "LPS"): "ESH", + ("ESH", "OLS"): "ESH", + ("ESH", "LSS"): "ESH", + ("ESH", "ESH"): "ESH", + ("ESH", "LPS"): "ESH", + ("LPS", "OLS"): "LPS", + ("LPS", "LSS"): "ESH", + ("LPS", "ESH"): "ESH", + ("LPS", "LPS"): "LPS", +} + + +def _detect_attack(next_frame_channel: FrameChannelT) -> bool: + """ + Detect whether the *next* frame (single channel) implies an attack, i.e. ESH + according to the assignment's criterion. + + Parameters + ---------- + next_frame_channel : FrameChannelT + One channel of next_frame_T (expected shape: (2048,)). + + Returns + ------- + bool + True if an attack is detected (=> next frame predicted ESH), else False. + + Notes + ----- + The criterion is implemented as described in the spec: + + 1) Apply the high-pass filter: + H(z) = (1 - z^-1) / (1 - 0.5 z^-1) + implemented in the time domain as: + y[n] = x[n] - x[n-1] + 0.5*y[n-1] + + 2) Split y into 16 segments of length 128 and compute segment energies s[l]. + + 3) Compute the ratio: + ds[l] = s[l] / s[l-1] + + 4) An attack exists if there exists l in {1..7} such that: + s[l] > 1e-3 and ds[l] > 10 + """ + # Local alias; expected to be a 1-D array of length 2048. + x = next_frame_channel + + # High-pass filter reference implementation (scalar recurrence). 
+ y = np.zeros_like(x) + prev_x = 0.0 + prev_y = 0.0 + for n in range(x.shape[0]): + xn = float(x[n]) + yn = (xn - prev_x) + 0.5 * prev_y + y[n] = yn + prev_x = xn + prev_y = yn + + # Segment energies over 16 blocks of 128 samples. + s = np.empty(16, dtype=np.float64) + for l in range(16): + a = l * 128 + b = (l + 1) * 128 + seg = y[a:b] + s[l] = float(np.sum(seg * seg)) + + # ds[l] for l>=1. For l=0 not defined, keep 0. + ds = np.zeros(16, dtype=np.float64) + eps = 1e-12 # Avoid division by zero without materially changing the logic. + for l in range(1, 16): + ds[l] = s[l] / max(s[l - 1], eps) + + # Spec: check l in {1..7}. + for l in range(1, 8): + if (s[l] > 1e-3) and (ds[l] > 10.0): + return True + + return False + + +def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType: + """ + Decide the current frame type for a single channel based on the previous + frame type and whether the next frame is predicted to be ESH. + + Rules (spec): + + - If prev is "LSS" => current is "ESH" + - If prev is "LPS" => current is "OLS" + - If prev is "OLS" => current is "LSS" if attack else "OLS" + - If prev is "ESH" => current is "ESH" if attack else "LPS" + + Parameters + ---------- + prev_frame_type : FrameType + Previous frame type (one of "OLS", "LSS", "ESH", "LPS"). + attack : bool + True if the next frame is predicted ESH for this channel. + + Returns + ------- + FrameType + The per-channel decision for the current frame. + + """ + if prev_frame_type == "LSS": + return "ESH" + if prev_frame_type == "LPS": + return "OLS" + if prev_frame_type == "OLS": + return "LSS" if attack else "OLS" + if prev_frame_type == "ESH": + return "ESH" if attack else "LPS" + + raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}") + + +def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType: + """ + Merge per-channel frame type decisions into one common frame type using + the stereo merge table from the spec. 
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).

    Choose the frame type of the current frame from the previous frame type
    and an attack test on each channel of the next frame; the two per-channel
    decisions are then merged into a single stereo decision.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i, shape (2048, 2). Only its shape is
        checked here; the decision itself uses next_frame_T.
    next_frame_T : FrameT
        Next time-domain frame (i+1), shape (2048, 2), used for attack
        detection.
    prev_frame_type : FrameType
        Frame type chosen for the previous frame (i-1).

    Returns
    -------
    FrameType
        One of: "OLS", "LSS", "ESH", "LPS".

    Raises
    ------
    ValueError
        If either frame does not have shape (2048, 2).
    """
    expected_shape = (2048, 2)
    if frame_T.shape != expected_shape:
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != expected_shape:
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Column 0 is the left channel, column 1 the right channel; both channels
    # share the same prev_frame_type.
    decisions = []
    for channel in (0, 1):
        attack = _detect_attack(next_frame_T[:, channel])
        decisions.append(_decide_frame_type(prev_frame_type, attack))

    # Stereo merge as per the spec table.
    return _stereo_merge(decisions[0], decisions[1])
External I/O can still be float32, but we convert at the + boundaries. +""" + +Window: TypeAlias = FloatArray +""" +Time-domain window (weighting sequence), 1-D. + +Typical lengths in this assignment: +- Long: 2048 +- Short: 256 +- Window sequences for LSS/LPS are also 2048 + +Expected shape: (N,) +dtype: float64 +""" + +TimeSignal: TypeAlias = FloatArray +""" +Time-domain signal samples, typically 1-D. + +Examples: +- Windowed MDCT input: shape (N,) +- IMDCT output: shape (N,) + +dtype: float64 +""" + +StereoSignal: TypeAlias = FloatArray +""" +Time-domain stereo signal stream. + +Expected (typical) shape: (N, 2) +- axis 0: time samples +- axis 1: channels [L, R] + +dtype: float64 +""" + +MdctCoeffs: TypeAlias = FloatArray +""" +MDCT coefficient vector, typically 1-D. + +Examples: +- Long: shape (1024,) +- Short: shape (128,) + +dtype: float64 +""" + + +FrameT: TypeAlias = FloatArray +""" +Time-domain frame (stereo), as used by the filterbank input/output. + +Expected (typical) shape for stereo: (2048, 2) +- axis 0: time samples +- axis 1: channels [L, R] + +dtype: float64 +""" + +FrameChannelT: TypeAlias = FloatArray +""" +Time-domain single-channel frame. + +Expected (typical) shape: (2048,) + +dtype: float64 +""" + +FrameF: TypeAlias = FloatArray +""" +Frequency-domain frame (MDCT coefficients), stereo container. + +Typical shapes (Level 1): +- If frame_type in {"OLS","LSS","LPS"}: (1024, 2) +- If frame_type == "ESH": (128, 16) + +Rationale for ESH (128, 16): +- 8 short subframes per channel => 8 * 2 = 16 columns total +- Each short subframe per stereo is (128, 2), flattened into columns + in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R] + +dtype: float64 +""" + +FrameChannelF: TypeAlias = FloatArray +""" +Frequency-domain single-channel frame (MDCT coefficients). 
class AACChannelFrameF(TypedDict):
    """
    Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).

    Keys
    ----
    frame_F:
        The MDCT coefficients for ONE channel.
        Shapes as stored by the Level-1 encoder and required by the decoder:
        - ESH: (128, 8)  (8 short subframes)
        - else: (1024, 1)  (a single column, not a flat (1024,) vector)
    """
    frame_F: FrameChannelF
+# ------------------------------------------------------------ + +from __future__ import annotations + +import numpy as np + +from core.aac_ssc import aac_SSC +from core.aac_types import FrameT + +# ----------------------------------------------------------------------------- +# Helper fixtures for SSC +# ----------------------------------------------------------------------------- + +def _next_frame_no_attack() -> FrameT: + """ + Build a next_frame_T that must NOT trigger ESH detection. + + Uses exact zeros so all segment energies are zero and the condition + s[l] > 1e-3 cannot hold for any l. + """ + return np.zeros((2048, 2), dtype=np.float64) + + +def _next_frame_strong_attack( + *, + attack_left: bool, + attack_right: bool, + segment_l: int = 4, + baseline: float = 1e-6, + burst_amp: float = 1.0, +) -> FrameT: + """ + Build a next_frame_T (2048x2) that should trigger ESH detection on selected channels. + + Attack criterion (spec): + Attack exists if there exists l in {1..7} such that: + s[l] > 1e-3 and ds[l] > 10, + where s[l] is the energy of segment l (length 128) after high-pass filtering, + and ds[l] = s[l] / s[l-1]. + + Construction: + - A small baseline is added everywhere to avoid relying on the epsilon guard in ds, + keeping ds behavior stable/reproducible. + - A strong burst is added inside a chosen segment l in 1..7. + """ + if not (1 <= segment_l <= 7): + raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.") + + x = np.full((2048, 2), baseline, dtype=np.float64) + + a = segment_l * 128 + b = (segment_l + 1) * 128 + + if attack_left: + x[a:b, 0] += burst_amp + if attack_right: + x[a:b, 1] += burst_amp + + return x + + +def _next_frame_below_s_threshold( + *, + left: bool, + right: bool, + segment_l: int = 4, + impulse_amp: float = 0.01, +) -> FrameT: + """ + Construct a next_frame_T where s[l] is below 1e-3, so ESH must NOT be triggered, + even if the ratio ds[l] could be large. 
+ + We place a single impulse of amplitude 'impulse_amp' inside one segment. + Approx. segment energy: s[l] ~= impulse_amp^2. + + Example: + impulse_amp = 0.01 => s[l] ~= 1e-4 < 1e-3 + """ + if not (1 <= segment_l <= 7): + raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.") + + x = np.zeros((2048, 2), dtype=np.float64) + + idx = segment_l * 128 + 10 # inside segment l + if left: + x[idx, 0] = impulse_amp + if right: + x[idx, 1] = impulse_amp + + return x + + +# ----------------------------------------------------------------------------- +# 1) Fixed/mandatory cases (prev frame type forces current type) +# ----------------------------------------------------------------------------- + +def test_ssc_fixed_cases_prev_lss_and_lps() -> None: + """ + Spec: + - If prev was LSS => current MUST be ESH + - If prev was LPS => current MUST be OLS + independent of attack detection on (i+1). + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + + next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True) + + out1 = aac_SSC(frame_t, next_attack, "LSS") + assert out1 == "ESH" + + out2 = aac_SSC(frame_t, next_attack, "LPS") + assert out2 == "OLS" + + +# ----------------------------------------------------------------------------- +# 2) Cases requiring next-frame ESH prediction (attack computation) +# ----------------------------------------------------------------------------- + +def test_prev_ols_next_not_esh_returns_ols() -> None: + """ + If prev=OLS, current is: + - LSS iff (i+1) is predicted ESH + - else OLS + Here: no attack => expect OLS. 
+ """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_no_attack() + + out = aac_SSC(frame_t, next_t, "OLS") + assert out == "OLS" + + +def test_prev_ols_next_esh_both_channels_returns_lss() -> None: + """ + prev=OLS and next predicted ESH for both channels: + per-channel: LSS, LSS + merged: LSS + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) + + out = aac_SSC(frame_t, next_t, "OLS") + assert out == "LSS" + + +def test_prev_ols_next_esh_one_channel_returns_lss() -> None: + """ + prev=OLS: + - one channel predicts ESH => LSS + - other channel predicts not ESH => OLS + Merge table: OLS + LSS => LSS (either side). + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + + next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) + out1 = aac_SSC(frame_t, next1_t, "OLS") + assert out1 == "LSS" + + next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) + out2 = aac_SSC(frame_t, next2_t, "OLS") + assert out2 == "LSS" + + +def test_prev_esh_next_esh_both_channels_returns_esh() -> None: + """ + prev=ESH and next predicted ESH for both channels: + per-channel: ESH, ESH + merged: ESH + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) + + out = aac_SSC(frame_t, next_t, "ESH") + assert out == "ESH" + + +def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None: + """ + prev=ESH and next not predicted ESH for both channels: + per-channel: LPS, LPS + merged: LPS + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_no_attack() + + out = aac_SSC(frame_t, next_t, "ESH") + assert out == "LPS" + + +def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None: + """ + prev=ESH: + - one channel predicts ESH => ESH + - other channel predicts not ESH => LPS + Merge table: ESH + LPS => ESH 
(either side). + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + + next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) + out1 = aac_SSC(frame_t, next1_t, "ESH") + assert out1 == "ESH" + + next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) + out2 = aac_SSC(frame_t, next2_t, "ESH") + assert out2 == "ESH" + + +def test_threshold_s_must_exceed_1e_3() -> None: + """ + Spec: next frame is predicted ESH only if: + s[l] > 1e-3 AND ds[l] > 10 + for some l in 1..7. + + This test checks the necessity of the s[l] threshold: + - Create a frame with s[l] ~= 1e-4 < 1e-3 (single impulse with amp 0.01). + - Expect: not classified as ESH -> for prev=OLS return OLS. + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + next_t = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01) + + out = aac_SSC(frame_t, next_t, "OLS") + assert out == "OLS" diff --git a/source/level_1/core/tests/test_aac_coder_decoder.py b/source/level_1/core/tests/test_aac_coder_decoder.py new file mode 100644 index 0000000..e8bb669 --- /dev/null +++ b/source/level_1/core/tests/test_aac_coder_decoder.py @@ -0,0 +1,156 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - AAC Coder/Decoder Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for AAC Coder/Decoder module. 
+# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest +import soundfile as sf + +from core.aac_coder import aac_coder_1 +from core.aac_decoder import aac_decoder_1 +from core.aac_types import * + + +# Helper "fixtures" for aac_coder_1 / aac_decoder_1 +# ----------------------------------------------------------------------------- + +def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Compute overall SNR (dB) over all samples and channels after aligning lengths. + + Parameters + ---------- + x_ref : StereoSignal + Reference signal, shape (N, 2) typical. + x_hat : StereoSignal + Reconstructed signal, shape (M, 2) typical. + + Returns + ------- + float + SNR in dB. + - Returns +inf if noise power is zero. + - Returns -inf if signal power is zero. + """ + x_ref = np.asarray(x_ref, dtype=np.float64) + x_hat = np.asarray(x_hat, dtype=np.float64) + + # Be conservative: align lengths and common channels. + if x_ref.ndim == 1: + x_ref = x_ref.reshape(-1, 1) + if x_hat.ndim == 1: + x_hat = x_hat.reshape(-1, 1) + + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) + + x_ref = x_ref[:n, :c] + x_hat = x_hat[:n, :c] + + err = x_ref - x_hat + ps = float(np.sum(x_ref * x_ref)) + pn = float(np.sum(err * err)) + + if pn <= 0.0: + return float("inf") + if ps <= 0.0: + return float("-inf") + + return float(10.0 * np.log10(ps / pn)) + + +@pytest.fixture() +def tmp_stereo_wav(tmp_path: Path) -> Path: + """ + Create a temporary 48 kHz stereo WAV with random samples. + """ + rng = np.random.default_rng(123) + fs = 48000 + + # ~1 second of audio (kept small for test speed). 
+ n = fs + x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64) + + wav_path = tmp_path / "in.wav" + sf.write(str(wav_path), x, fs) + return wav_path + + +def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: + """ + Module-level contract test: + Ensure aac_seq_1 follows the expected schema and per-frame shapes. + """ + aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav) + + assert isinstance(aac_seq, list) + assert len(aac_seq) > 0 + + for fr in aac_seq: + assert isinstance(fr, dict) + + # Required keys + assert "frame_type" in fr + assert "win_type" in fr + assert "chl" in fr + assert "chr" in fr + + frame_type = fr["frame_type"] + win_type = fr["win_type"] + + assert frame_type in ("OLS", "LSS", "ESH", "LPS") + assert win_type in ("SIN", "KBD") + + assert isinstance(fr["chl"], dict) + assert isinstance(fr["chr"], dict) + assert "frame_F" in fr["chl"] + assert "frame_F" in fr["chr"] + + chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) + chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) + + if frame_type == "ESH": + assert chl_f.shape == (128, 8) + assert chr_f.shape == (128, 8) + else: + assert chl_f.shape == (1024, 1) + assert chr_f.shape == (1024, 1) + + +def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None: + """ + End-to-end test: + Encode + decode and check SNR is very high (numerical-noise only). + + The threshold is intentionally loose to avoid fragility across platforms/BLAS. 
+ """ + x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True) + x_ref = np.asarray(x_ref, dtype=np.float64) + assert int(fs) == 48000 + + out_wav = tmp_path / "out.wav" + + aac_seq = aac_coder_1(tmp_stereo_wav) + x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav) + + # Basic sanity: output file exists and is readable + assert out_wav.exists() + x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True) + assert int(fs_hat) == 48000 + + # SNR against returned array (file should match closely, but we do not require it here). + snr = _snr_db(x_ref, x_hat) + assert snr > 80.0 diff --git a/source/level_1/core/tests/test_filterbank.py b/source/level_1/core/tests/test_filterbank.py new file mode 100644 index 0000000..ad2bd45 --- /dev/null +++ b/source/level_1/core/tests/test_filterbank.py @@ -0,0 +1,269 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Filterbank Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for Filterbank module. +# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Sequence +import pytest + +from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank +from core.aac_types import * + +# Helper fixtures for filterbank +# ----------------------------------------------------------------------------- + +def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal: + """ + Analyze-synthesize each frame and overlap-add with hop=1024. + + Parameters + ---------- + x : StereoSignal + Input stereo stream, expected shape (N, 2). + frame_types : Sequence[FrameType] + Length K sequence of frame types for frames starting at i*1024. + win_type : WinType + Window type ("SIN" or "KBD"). 
+ + Returns + ------- + StereoSignal + Reconstructed stereo stream, same shape as x (N, 2). + """ + hop = 1024 + win = 2048 + K = len(frame_types) + + y: StereoSignal = np.zeros_like(x, dtype=np.float64) + + for i in range(K): + start = i * hop + frame_t: FrameT = x[start:start + win, :] + frame_f: FrameF = aac_filter_bank(frame_t, frame_types[i], win_type) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_types[i], win_type) + y[start:start + win, :] += frame_t_hat + + return y + + +def _snr_db(x: StereoSignal, y: StereoSignal) -> float: + """ + Compute SNR in dB over all samples/channels. + """ + err = x - y + ps = float(np.sum(x * x)) + pn = float(np.sum(err * err)) + if pn <= 0.0: + return float("inf") + if ps <= 0.0: + return float("-inf") + return 10.0 * float(np.log10(ps / pn)) + + +# ----------------------------------------------------------------------------- +# Forward filterbank tests +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) +def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None: + """ + Contract test: for OLS/LSS/LPS, aac_filter_bank returns shape (1024, 2). + """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_f = aac_filter_bank(frame_t, frame_type, win_type) + assert frame_f.shape == (1024, 2) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_shapes_esh(win_type: WinType) -> None: + """ + Contract test: for ESH, aac_filter_bank returns shape (128, 16). 
+ """ + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_f = aac_filter_bank(frame_t, "ESH", win_type) + assert frame_f.shape == (128, 16) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None: + """ + Behavior test: for OLS (representative long-sequence), channels are independent. + If right channel is zero and left is random, right spectrum should be near zero. + """ + rng = np.random.default_rng(0) + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_t[:, 0] = rng.normal(size=2048) + + frame_f = aac_filter_bank(frame_t, "OLS", win_type) + + assert np.max(np.abs(frame_f[:, 1])) < 1e-9 + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_channel_isolation_esh(win_type: WinType) -> None: + """ + Behavior test: for ESH, channels are independent. + If right channel is zero and left is random, all odd columns (right) should be near zero. + """ + rng = np.random.default_rng(1) + frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_t[:, 0] = rng.normal(size=2048) + + frame_f = aac_filter_bank(frame_t, "ESH", win_type) + + right_cols = frame_f[:, 1::2] # columns 1,3,5,...,15 + assert np.max(np.abs(right_cols)) < 1e-9 + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None: + """ + Spec-driven behavior test: + ESH uses only the central region [448, 1600), split into 8 overlapping + windows of length 256 with 50% overlap. + + Therefore, changing samples outside [448, 1600) must not affect the output. 
+ """ + rng = np.random.default_rng(2) + + frame_a: FrameT = np.zeros((2048, 2), dtype=np.float64) + frame_b: FrameT = np.zeros((2048, 2), dtype=np.float64) + + center = rng.normal(size=(1152, 2)) + frame_a[448:1600, :] = center + frame_b[448:1600, :] = center + + frame_b[0:448, :] = rng.normal(size=(448, 2)) + frame_b[1600:2048, :] = rng.normal(size=(448, 2)) + + fa = aac_filter_bank(frame_a, "ESH", win_type) + fb = aac_filter_bank(frame_b, "ESH", win_type) + + # Use a tiny tolerance to avoid flaky failures due to floating-point minutiae. + np.testing.assert_allclose(fa, fb, rtol=0.0, atol=1e-12) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_filterbank_output_is_finite(win_type: WinType) -> None: + """ + Sanity test: output must not contain NaN or inf for representative cases. + """ + rng = np.random.default_rng(3) + frame_t: FrameT = rng.normal(size=(2048, 2)).astype(np.float64) + + for frame_type in ("OLS", "LSS", "ESH", "LPS"): + frame_f = aac_filter_bank(frame_t, frame_type, win_type) + assert np.isfinite(frame_f).all() + + +# ----------------------------------------------------------------------------- +# Reverse i_filterbank tests +# ----------------------------------------------------------------------------- + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None: + """ + Contract test: for OLS/LSS/LPS, aac_i_filter_bank returns shape (2048, 2). + """ + frame_f: FrameF = np.zeros((1024, 2), dtype=np.float64) + for frame_type in ("OLS", "LSS", "LPS"): + frame_t = aac_i_filter_bank(frame_f, frame_type, win_type) + assert frame_t.shape == (2048, 2) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ifilterbank_shapes_esh(win_type: WinType) -> None: + """ + Contract test: for ESH, aac_i_filter_bank returns shape (2048, 2). 
+ """ + frame_f: FrameF = np.zeros((128, 16), dtype=np.float64) + frame_t = aac_i_filter_bank(frame_f, "ESH", win_type) + assert frame_t.shape == (2048, 2) + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None: + """ + Sanity test: per-frame analysis+synthesis must produce finite outputs. + """ + rng = np.random.default_rng(0) + frame_t: FrameT = rng.normal(size=(2048, 2)).astype(np.float64) + + for frame_type in ("OLS", "LSS", "ESH", "LPS"): + frame_f = aac_filter_bank(frame_t, frame_type, win_type) + frame_t_hat = aac_i_filter_bank(frame_f, frame_type, win_type) + assert np.isfinite(frame_t_hat).all() + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None: + """ + Module-level test: + OLS analysis+synthesis with hop=1024 must reconstruct with high SNR + in the steady-state region. + """ + rng = np.random.default_rng(1) + + K = 6 + N = 1024 * (K + 1) + x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) + + y = _ola_reconstruct(x, ["OLS"] * K, win_type) + + a = 1024 + b = N - 1024 + snr = _snr_db(x[a:b, :], y[a:b, :]) + assert snr > 50.0 + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None: + """ + Module-level test: + ESH analysis+synthesis with hop=1024 must reconstruct with high SNR + in the steady-state region. 
+ """ + rng = np.random.default_rng(2) + + K = 6 + N = 1024 * (K + 1) + x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) + + y = _ola_reconstruct(x, ["ESH"] * K, win_type) + + a = 1024 + b = N - 1024 + snr = _snr_db(x[a:b, :], y[a:b, :]) + assert snr > 45.0 + + +@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) +def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None: + """ + Transition sequence test matching the windowing logic: + OLS -> LSS -> ESH -> LPS -> OLS -> OLS + """ + rng = np.random.default_rng(3) + + frame_types: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"] + K = len(frame_types) + N = 1024 * (K + 1) + x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64) + + y = _ola_reconstruct(x, frame_types, win_type) + + a = 1024 + b = N - 1024 + snr = _snr_db(x[a:b, :], y[a:b, :]) + assert snr > 40.0 diff --git a/source/level_1/core/tests/test_filterbank_internal.py b/source/level_1/core/tests/test_filterbank_internal.py new file mode 100644 index 0000000..e092ad1 --- /dev/null +++ b/source/level_1/core/tests/test_filterbank_internal.py @@ -0,0 +1,117 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Filterbank internal (mdct) Tests +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Tests for Filterbank internal MDCT/IMDCT functionality. +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np +import pytest + +from core.aac_filterbank import _imdct, _mdct +from core.aac_types import FloatArray, TimeSignal, MdctCoeffs + + +def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None: + """ + Helper for consistent tolerances across tests. 
+ """ + np.testing.assert_allclose(a, b, rtol=rtol, atol=atol) + + +def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float: + """ + Estimate scalar gain g such that y ~= g*x in least-squares sense. + """ + denom = float(np.dot(x, x)) + if denom == 0.0: + return 0.0 + return float(np.dot(y, x) / denom) + + +tolerance = 1e-10 + +@pytest.mark.parametrize("N", [256, 2048]) +def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None: + """ + Consistency test in coefficient domain: + mdct(imdct(X)) ~= g * X + + For the chosen (non-orthonormal) scaling, g is expected to be close to 2. + """ + rng = np.random.default_rng(0) + K = N // 2 + + X: MdctCoeffs = rng.normal(size=K).astype(np.float64) + x: TimeSignal = _imdct(X) + X_hat: MdctCoeffs = _mdct(x) + + g = _estimate_gain(X_hat, X) + _assert_allclose(X_hat, g * X, rtol=tolerance, atol=tolerance) + _assert_allclose(np.array([g], dtype=np.float64), np.array([2.0], dtype=np.float64), rtol=tolerance, atol=tolerance) + + +@pytest.mark.parametrize("N", [256, 2048]) +def test_mdct_linearity(N: int) -> None: + """ + Linearity test: + mdct(a*x + b*y) == a*mdct(x) + b*mdct(y) + """ + rng = np.random.default_rng(1) + x: TimeSignal = rng.normal(size=N).astype(np.float64) + y: TimeSignal = rng.normal(size=N).astype(np.float64) + + a = 0.37 + b = -1.12 + + left: MdctCoeffs = _mdct(a * x + b * y) + right: MdctCoeffs = a * _mdct(x) + b * _mdct(y) + + _assert_allclose(left, right, rtol=tolerance, atol=tolerance) + + +@pytest.mark.parametrize("N", [256, 2048]) +def test_imdct_linearity(N: int) -> None: + """ + Linearity test for IMDCT: + imdct(a*X + b*Y) == a*imdct(X) + b*imdct(Y) + """ + rng = np.random.default_rng(2) + K = N // 2 + + X: MdctCoeffs = rng.normal(size=K).astype(np.float64) + Y: MdctCoeffs = rng.normal(size=K).astype(np.float64) + + a = -0.5 + b = 2.0 + + left: TimeSignal = _imdct(a * X + b * Y) + right: TimeSignal = a * _imdct(X) + b * _imdct(Y) + + _assert_allclose(left, right, rtol=tolerance, atol=tolerance) + + 
+@pytest.mark.parametrize("N", [256, 2048]) +def test_mdct_imdct_outputs_are_finite(N: int) -> None: + """ + Sanity test: no NaN/inf on random inputs. + """ + rng = np.random.default_rng(3) + K = N // 2 + + x: TimeSignal = rng.normal(size=N).astype(np.float64) + X: MdctCoeffs = rng.normal(size=K).astype(np.float64) + + X1 = _mdct(x) + x1 = _imdct(X) + + assert np.isfinite(X1).all() + assert np.isfinite(x1).all() diff --git a/source/level_1/level_1.py b/source/level_1/level_1.py index af764c2..0e0b4a7 100644 --- a/source/level_1/level_1.py +++ b/source/level_1/level_1.py @@ -1,843 +1,186 @@ -#! /usr/bin/env python - +# ------------------------------------------------------------ +# AAC Coder/Decoder - Level 1 Wrappers + Demo +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 1 wrapper module. +# +# This file provides: +# - Thin wrappers for Level 1 API functions (encode/decode) that delegate +# to the corresponding core implementations. +# - A demo function that runs end-to-end and computes SNR. +# - A small CLI entrypoint for convenience. 
+# ------------------------------------------------------------ from __future__ import annotations from pathlib import Path -from typing import Dict, Tuple, List, Literal, TypedDict, Union +from typing import Union import numpy as np import soundfile as sf -from scipy.signal.windows import kaiser -# -------------------------------- -# Public Type aliases (Level 1) -# -------------------------------- +from core.aac_types import AACSeq1, StereoSignal +from core.aac_coder import aac_coder_1 as core_aac_coder_1 +from core.aac_coder import aac_read_wav_stereo_48k +from core.aac_decoder import aac_decoder_1 as core_aac_decoder_1 -FrameType = Literal["OLS", "LSS", "ESH", "LPS"] -""" -Frame type codes: -- "OLS": ONLY_LONG_SEQUENCE -- "LSS": LONG_START_SEQUENCE -- "ESH": EIGHT_SHORT_SEQUENCE -- "LPS": LONG_STOP_SEQUENCE -""" - -WinType = Literal["KBD", "SIN"] -""" -Window type codes: -- "KBD": Kaiser-Bessel-Derived -- "SIN": sinusoid -""" - -FrameT = np.ndarray -""" -Time-domain frame. -Expected shape: (2048, 2) for stereo (two channels). -dtype: float (e.g., float32/float64). -""" - -FrameChannelT = np.ndarray -""" -Time-domain single channel frame. -Expected shape: (2048,). -dtype: float (e.g., float32/float64). -""" - - -FrameF = np.ndarray -""" -Frequency-domain frame (MDCT coefficients). -As per spec (Level 1): -- If frame_type in {"OLS","LSS","LPS"}: shape (1024, 2) -- If frame_type == "ESH": shape (128, 16) where 8 subframes x 2 channels - are placed in columns according to the subframe order (i.e., each subframe is (128,2)). 
-""" - -ChannelKey = Literal["chl", "chr"] - - -class AACChannelFrameF(TypedDict): - """Channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).""" - frame_F: np.ndarray - # frame_F for one channel: - # - ESH: shape (128, 8) - # - else: shape (1024, 1) - - -class AACSeq1Frame(TypedDict): - """One frame dictionary of aac_seq_1 (Level 1).""" - frame_type: FrameType - win_type: WinType - chl: AACChannelFrameF - chr: AACChannelFrameF - - -AACSeq1 = List[AACSeq1Frame] -"""AAC sequence for Level 1: -List of length K (K = number of frames). -Each element is a dict with keys: -- "frame_type", "win_type", "chl", "chr" -""" - -# Global Options -# ----------------------------------------------------------------------------- - -# Window type -# Options: "SIN", "KBD" -WIN_TYPE: WinType = "SIN" - - -# Private helpers for SSC -# ----------------------------------------------------------------------------- - -# See Table 1 in mm-2025-hw-v0.1.pdf -STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = { - ("OLS", "OLS"): "OLS", - ("OLS", "LSS"): "LSS", - ("OLS", "ESH"): "ESH", - ("OLS", "LPS"): "LPS", - ("LSS", "OLS"): "LSS", - ("LSS", "LSS"): "LSS", - ("LSS", "ESH"): "ESH", - ("LSS", "LPS"): "ESH", - ("ESH", "OLS"): "ESH", - ("ESH", "LSS"): "ESH", - ("ESH", "ESH"): "ESH", - ("ESH", "LPS"): "ESH", - ("LPS", "OLS"): "LPS", - ("LPS", "LSS"): "ESH", - ("LPS", "ESH"): "ESH", - ("LPS", "LPS"): "LPS", -} - -def _detect_attack(next_frame_channel: FrameChannelT) -> bool: - """ - Detect if next frame (single channel) implies ESH according to the spec's attack criterion. - - Parameters - ---------- - next_frame_channel : FrameChannelT - One channel of next_frame_T (shape: (2048,), dtype float). - - Returns - ------- - attack : bool - True if an attack is detected (=> next frame predicted ESH), else False. 
- - Notes - ----- - The spec describes: - - - High-pass filter applied to next_frame_channel - - Split into 16 segments of length 128 - - Compute segment energies s(l) - - Compute ds(l) = s(l) / s(l-1) - - Attack exists if there exists l in {1..7} such that: - s(l) > 1e-3 and ds(l) > 10 - """ - x = next_frame_channel # local alias, x assumed to be a 1-D array of length 2048 - - # High-pass filter H(z) = (1 - z^-1) / (1 - 0.5 z^-1) - # Implemented as: y[n] = x[n] - x[n-1] + 0.5*y[n-1] - y = np.zeros_like(x) - prev_x = 0.0 - prev_y = 0.0 - for n in range(x.shape[0]): - xn = float(x[n]) - yn = (xn - prev_x) + 0.5 * prev_y - y[n] = yn - prev_x = xn - prev_y = yn - - # Segment energies over 16 blocks of 128 samples. - s = np.empty(16, dtype=np.float64) - for l in range(16): - a = l * 128 - b = (l + 1) * 128 - seg = y[a:b] - s[l] = float(np.sum(seg * seg)) - - # ds(l) for l>=1. For l=0 not defined, keep 0. - ds = np.zeros(16, dtype=np.float64) - eps = 1e-12 # avoid division by zero without changing logic materially - for l in range(1, 16): - ds[l] = s[l] / max(s[l - 1], eps) - - # Spec: check l in {1..7} - for l in range(1, 8): - if (s[l] > 1e-3) and (ds[l] > 10.0): - return True - - return False - - -def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType: - """ - Decide current frame type for a single channel based on prev_frame_type and next-frame attack. - - Parameters - ---------- - prev_frame_type : FrameType - Previous frame type (one of "OLS","LSS","ESH","LPS"). - attack : bool - Whether next frame is predicted ESH for this channel. - - Returns - ------- - frame_type : FrameType - The per-channel decision for the current frame. 
- - Rules (spec) - ------------ - - If prev is "LSS" => current is "ESH" (fixed) - - If prev is "LPS" => current is "OLS" (fixed) - - If prev is "OLS" => current is "LSS" if attack else "OLS" - - If prev is "ESH" => current is "ESH" if attack else "LPS" - """ - if prev_frame_type == "LSS": - return "ESH" - if prev_frame_type == "LPS": - return "OLS" - if prev_frame_type == "OLS": - return "LSS" if attack else "OLS" - if prev_frame_type == "ESH": - return "ESH" if attack else "LPS" - - raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}") - - -def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType: - """ - Merge per-channel frame types into one common frame type using the spec table. - - Parameters - ---------- - ft_l : FrameType - Frame type decision for channel 0 (left). - ft_r : FrameType - Frame type decision for channel 1 (right). - - Returns - ------- - common : FrameType - The common final frame type. - """ - try: - return STEREO_MERGE_TABLE[(ft_l, ft_r)] - except KeyError as e: - raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}") from e - - - -# Private helpers for Filterbank -# ----------------------------------------------------------------------------- - -def _sin_window(N: int) -> np.ndarray: - """ - Sine window (full length N). - w[n] = sin(pi/N * (n + 0.5)), 0 <= n < N - """ - n = np.arange(N, dtype=np.float64) - return np.sin((np.pi / N) * (n + 0.5)) - - -def _kbd_window(N: int, alpha: float) -> np.ndarray: - """ - Kaiser-Bessel-Derived (KBD) window (full length N). - - This follows the standard KBD construction: - - Build Kaiser kernel of length N/2 + 1 - - Use cumulative sum and sqrt normalization to form left and right halves - """ - half = N // 2 - - # Kaiser kernel length: half + 1 samples (0 .. 
half) - # beta = pi * alpha per the usual correspondence with the ISO definition - kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64) - - csum = np.cumsum(kernel) - denom = csum[-1] - - w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1 - w_right = w_left[::-1] # mirror for second half - - return np.concatenate([w_left, w_right]) - - -def _long_window(win_type: WinType) -> np.ndarray: - """ - Long window (length 2048) for the selected win_type. - """ - if win_type == "SIN": - return _sin_window(2048) - if win_type == "KBD": - # Assignment-specific alpha values - return _kbd_window(2048, alpha=6.0) - raise ValueError(f"Invalid win_type: {win_type!r}") - - -def _short_window(win_type: WinType) -> np.ndarray: - """ - Short window (length 256) for the selected win_type. - """ - if win_type == "SIN": - return _sin_window(256) - if win_type == "KBD": - # Assignment-specific alpha values - return _kbd_window(256, alpha=4.0) - raise ValueError(f"Invalid win_type: {win_type!r}") - - -def _window_sequence(frame_type: FrameType, win_type: WinType) -> np.ndarray: - """ - Build the 2048-sample window sequence for OLS/LSS/LPS. - - We follow the simplified assumption: - - The same window shape (KBD or SIN) is used globally (no mixed halves). - - Therefore, the left and right halves are drawn from the same family. 
- """ - wL = _long_window(win_type) # length 2048 - wS = _short_window(win_type) # length 256 - - if frame_type == "OLS": - return wL - - if frame_type == "LSS": - # 0..1023: left half of long window - # 1024..1471: ones (448 samples) - # 1472..1599: right half of short window (128 samples) - # 1600..2047: zeros (448 samples) - out = np.zeros(2048, dtype=np.float64) - out[0:1024] = wL[0:1024] - out[1024:1472] = 1.0 - out[1472:1600] = wS[128:256] - out[1600:2048] = 0.0 - return out - - if frame_type == "LPS": - # 0..447: zeros (448) - # 448..575: left half of short window (128) - # 576..1023: ones (448) - # 1024..2047: right half of long window (1024) - out = np.zeros(2048, dtype=np.float64) - out[0:448] = 0.0 - out[448:576] = wS[0:128] - out[576:1024] = 1.0 - out[1024:2048] = wL[1024:2048] - return out - - raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}") - - -def _mdct(s: np.ndarray) -> np.ndarray: - """ - MDCT (direct form) as given in the assignment. - - Input: - s: windowed time samples of length N (N = 2048 or 256) - - Output: - X: MDCT coefficients of length N/2 - - Definition: - X[k] = 2 * sum_{n=0 .. N-1} s[n] * cos(2*pi/N * (n + n0) * (k + 1/2)) - where n0 = (N/2 + 1)/2 - """ - s = np.asarray(s, dtype=np.float64) - N = int(s.shape[0]) - if N not in (2048, 256): - raise ValueError("MDCT input length must be 2048 or 256.") - - n0 = (N / 2.0 + 1.0) / 2.0 - - n = np.arange(N, dtype=np.float64) + n0 - k = np.arange(N // 2, dtype=np.float64) + 0.5 - - # Cosine matrix: shape (N, N/2) - C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) - X = 2.0 * (s @ C) - - return X - -def _imdct(X: np.ndarray) -> np.ndarray: - """ - IMDCT (direct form) as given in the assignment. - - Input: - X: MDCT coefficients of length N/2 (N = 2048 or 256) - - Output: - s: time samples of length N - - Definition: - s[n] = (2/N) * sum_{k=0 .. 
N/2-1} X[k] * cos(2*pi/N * (n + n0) * (k + 1/2)) - where n0 = (N/2 + 1)/2 - """ - X = np.asarray(X, dtype=np.float64).reshape(-1) - K = int(X.shape[0]) - if K not in (1024, 128): - raise ValueError("IMDCT input length must be 1024 or 128.") - - N = 2 * K - n0 = (N / 2.0 + 1.0) / 2.0 - - n = np.arange(N, dtype=np.float64) + n0 - k = np.arange(K, dtype=np.float64) + 0.5 - - C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K) - s = (2.0 / N) * (C @ X) - - return s - - -def _filter_bank_esh_channel(x_ch: np.ndarray, win_type: WinType) -> np.ndarray: - """ - ESH analysis for one channel. - - Returns: - X_esh: shape (128, 8), where each column is the 128 MDCT coeffs of one short window. - """ - wS = _short_window(win_type) - X_esh = np.empty((128, 8), dtype=np.float64) - - # ESH subwindows are taken from the central region: - # start positions: 448 + 128*j, j = 0..7 - for j in range(8): - start = 448 + 128 * j - seg = x_ch[start:start + 256] * wS - X_esh[:, j] = _mdct(seg) - - return X_esh - - - - -def _unpack_esh(frame_F: np.ndarray) -> tuple[np.ndarray, np.ndarray]: - """ - Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8). - - Mapping is the inverse of the packing used in filter_bank(): - out[:, 2*j] = left[:, j] - out[:, 2*j+1] = right[:, j] - """ - if frame_F.shape != (128, 16): - raise ValueError("ESH frame_F must have shape (128, 16).") - - left = np.empty((128, 8), dtype=np.float64) - right = np.empty((128, 8), dtype=np.float64) - for j in range(8): - left[:, j] = frame_F[:, 2 * j + 0] - right[:, j] = frame_F[:, 2 * j + 1] - return left, right - - -def _i_filter_bank_esh_channel(X_esh: np.ndarray, win_type: WinType) -> np.ndarray: - """ - ESH synthesis for one channel. - - Input: - X_esh: (128, 8) MDCT coeffs for 8 short windows - - Output: - x_ch: (2048, ) time-domain frame contribution (windowed), - ready for OLA at the caller level. 
- """ - if X_esh.shape != (128, 8): - raise ValueError("X_esh must have shape (128, 8).") - - wS = _short_window(win_type) - out = np.zeros(2048, dtype=np.float64) - - # Each short IMDCT returns 256 samples. Place them at: - # start = 448 + 128*j, j=0..7 (50% overlap) - for j in range(8): - seg = _imdct(X_esh[:, j]) * wS # (256,) - start = 448 + 128 * j - out[start:start + 256] += seg - - return out # ----------------------------------------------------------------------------- -# Public Function prototypes (Level 1) +# Public Level 1 API (wrappers) # ----------------------------------------------------------------------------- -def SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: - """ - Sequence Segmentation Control (SSC). - Selects and returns the frame type for the current frame (i) based on input parameters. - - Parameters - ------- - frame_T: FrameT - current time-domain frame i, stereo, shape (2048, 2) - next_frame_T: FrameT - next time-domain frame (i+1), stereo, shape (2048, 2) - (used to decide transitions to/from ESH) - prev_frame_type: FrameType - frame type chosen for the previous frame (i-1) - - Returns - ------- - frame_type : FrameType - - "OLS" (ONLY_LONG_SEQUENCE) - - "LSS" (LONG_START_SEQUENCE) - - "ESH" (EIGHT_SHORT_SEQUENCE) - - "LPS" (LONG_STOP_SEQUENCE) - """ - if frame_T.shape != (2048, 2): - raise ValueError("frame_T must have shape (2048, 2).") - if next_frame_T.shape != (2048, 2): - raise ValueError("next_frame_T must have shape (2048, 2).") - - # Detect attack independently per channel on next frame. - attack_l = _detect_attack(next_frame_T[:, 0]) - attack_r = _detect_attack(next_frame_T[:, 1]) - - # Decide per-channel type based on shared prev_frame_type. - ft_l = _decide_frame_type(prev_frame_type, attack_l) - ft_r = _decide_frame_type(prev_frame_type, attack_r) - - # Stereo merge as per Table 1. 
- return _stereo_merge(ft_l, ft_r) - - -def filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF: - """ - Filterbank stage (MDCT analysis). - - Parameters - ---------- - frame_T : FrameT - Time-domain frame, stereo, shape (2048, 2). - frame_type : FrameType - Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS"). - win_type : WinType - Window type ("KBD" or "SIN") used for the current frame. - - Returns - ------- - frame_F : FrameF - Frequency-domain MDCT coefficients: - - If frame_type in {"OLS","LSS","LPS"}: array shape (1024, 2) - containing MDCT coefficients for both channels. - - If frame_type == "ESH": contains 8 subframes, each subframe has shape (128,2), - placed in columns according to subframe order, i.e. overall shape (128, 16). - """ - if frame_T.shape != (2048, 2): - raise ValueError("frame_T must have shape (2048, 2).") - - xL = frame_T[:, 0].astype(np.float64, copy=False) - xR = frame_T[:, 1].astype(np.float64, copy=False) - - if frame_type in ("OLS", "LSS", "LPS"): - w = _window_sequence(frame_type, win_type) # length 2048 - XL = _mdct(xL * w) # length 1024 - XR = _mdct(xR * w) # length 1024 - out = np.empty((1024, 2), dtype=np.float64) - out[:, 0] = XL - out[:, 1] = XR - return out - - if frame_type == "ESH": - Xl = _filter_bank_esh_channel(xL, win_type) # (128, 8) - Xr = _filter_bank_esh_channel(xR, win_type) # (128, 8) - - # Pack into (128, 16): each subframe as (128,2) placed in columns - out = np.empty((128, 16), dtype=np.float64) - for j in range(8): - out[:, 2 * j + 0] = Xl[:, j] - out[:, 2 * j + 1] = Xr[:, j] - return out - - raise ValueError(f"Invalid frame_type: {frame_type!r}") - - -def i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT: - """ - Inverse filterbank (IMDCT synthesis). - - Parameters - ---------- - frame_F : FrameF - Frequency-domain MDCT coefficients as produced by filter_bank(). - frame_type : FrameType - Frame type ("OLS"|"LSS"|"ESH"|"LPS"). 
- win_type : WinType - Window type ("KBD" or "SIN"). - - Returns - ------- - frame_T : FrameT - Reconstructed time-domain frame, stereo, shape (2048, 2). - """ - if frame_type in ("OLS", "LSS", "LPS"): - if frame_F.shape != (1024, 2): - raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).") - - w = _window_sequence(frame_type, win_type) - - xL = _imdct(frame_F[:, 0]) * w - xR = _imdct(frame_F[:, 1]) * w - - out = np.empty((2048, 2), dtype=np.float64) - out[:, 0] = xL - out[:, 1] = xR - return out - - if frame_type == "ESH": - if frame_F.shape != (128, 16): - raise ValueError("For ESH, frame_F must have shape (128, 16).") - - Xl, Xr = _unpack_esh(frame_F) - xL = _i_filter_bank_esh_channel(Xl, win_type) - xR = _i_filter_bank_esh_channel(Xr, win_type) - - out = np.empty((2048, 2), dtype=np.float64) - out[:, 0] = xL - out[:, 1] = xR - return out - - raise ValueError(f"Invalid frame_type: {frame_type!r}") - - def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: """ - Level-1 AAC encoder. + Level-1 AAC encoder (wrapper). + + Delegates to core implementation. Parameters ---------- - filename_in : str | Path + filename_in : Union[str, Path] Input WAV filename. Assumption: stereo audio, sampling rate 48 kHz. Returns ------- - aac_seq_1 : AACSeq1 - List of K encoded frames. - For each i: - - - aac_seq_1[i]["frame_type"]: FrameType - - aac_seq_1[i]["win_type"]: WinType - - aac_seq_1[i]["chl"]["frame_F"]: - - ESH: shape (128, 8) - - else: shape (1024, 1) - - aac_seq_1[i]["chr"]["frame_F"]: - - ESH: shape (128, 8) - - else: shape (1024, 1) + AACSeq1 + List of encoded frames (Level 1 schema). 
""" - filename_in = Path(filename_in) - - x, fs = sf.read(str(filename_in), always_2d=True) - x = np.asarray(x, dtype=np.float64) - - if x.shape[1] != 2: - raise ValueError("Input must be stereo (2 channels).") - if fs != 48000: - raise ValueError("Input sampling rate must be 48 kHz.") - - hop = 1024 - win = 2048 - - # Pad at the beginning to support the first overlap region. - # Tail padding is kept minimal; next-frame is padded on-the-fly when needed. - pad_pre = np.zeros((hop, 2), dtype=np.float64) - pad_post = np.zeros((hop, 2), dtype=np.float64) - x_pad = np.vstack([pad_pre, x, pad_post]) - - # Number of frames such that current frame fits; next frame will be padded if needed. - K = int((x_pad.shape[0] - win) // hop + 1) - if K <= 0: - raise ValueError("Input too short for framing.") - - aac_seq: AACSeq1 = [] - prev_frame_type: FrameType = "OLS" - - for i in range(K): - start = i * hop - - frame_t: FrameT = x_pad[start:start + win, :] - if frame_t.shape != (win, 2): - # This should not happen due to K definition, but we keep it explicit. - raise ValueError("Internal framing error: frame_t has wrong shape.") - - next_t = x_pad[start + hop:start + hop + win, :] - - # Ensure next_t is always (2048,2) by zero-padding at the tail. - if next_t.shape[0] < win: - tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) - next_t = np.vstack([next_t, tail]) - - frame_type = SSC(frame_t, next_t, prev_frame_type) - frame_f = filter_bank(frame_t, frame_type, WIN_TYPE) - - # Store per-channel as required by AACSeq1 schema - if frame_type == "ESH": - # frame_f: (128, 16) packed as [L0 R0 L1 R1 ... 
L7 R7] - chl_f = np.empty((128, 8), dtype=np.float64) - chr_f = np.empty((128, 8), dtype=np.float64) - for j in range(8): - chl_f[:, j] = frame_f[:, 2 * j + 0] - chr_f[:, j] = frame_f[:, 2 * j + 1] - else: - # frame_f: (1024, 2) - chl_f = frame_f[:, 0:1].astype(np.float64, copy=False) - chr_f = frame_f[:, 1:2].astype(np.float64, copy=False) - - aac_seq.append({ - "frame_type": frame_type, - "win_type": WIN_TYPE, - "chl": {"frame_F": chl_f}, - "chr": {"frame_F": chr_f}, - }) - prev_frame_type = frame_type - return aac_seq + return core_aac_coder_1(filename_in) -def i_aac_coder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> np.ndarray: +def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: """ - Level-1 AAC decoder (inverse of aac_coder_1()). + Level-1 AAC decoder (wrapper). + + Delegates to core implementation. Parameters ---------- aac_seq_1 : AACSeq1 Encoded sequence as produced by aac_coder_1(). - filename_out : str | Path - Output WAV filename. - Assumption: stereo audio, sampling rate 48 kHz. + filename_out : Union[str, Path] + Output WAV filename. Assumption: 48 kHz, stereo. Returns ------- - x : np.ndarray - Decoded audio samples (time-domain). - Expected shape: (N, 2) for stereo (N depends on input length). + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. """ - filename_out = Path(filename_out) + return core_aac_decoder_1(aac_seq_1, filename_out) - hop = 1024 - win = 2048 - K = len(aac_seq_1) - # Output includes the encoder padding region, so we reconstruct - # full padded stream. 
For K frames: last frame starts at (K-1)*hop and spans win, - # so total length = (K-1)*hop + win - n_pad = (K - 1) * hop + win - y_pad = np.zeros((n_pad, 2), dtype=np.float64) +# ----------------------------------------------------------------------------- +# Demo (Level 1) +# ----------------------------------------------------------------------------- - for i, fr in enumerate(aac_seq_1): - frame_type = fr["frame_type"] - win_type = fr["win_type"] +def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Compute overall SNR (dB) over all samples and channels after aligning lengths. - chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) - chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) + Parameters + ---------- + x_ref : StereoSignal + Reference stereo stream. + x_hat : StereoSignal + Reconstructed stereo stream. - # Re-pack into the format expected by i_filter_bank() - if frame_type == "ESH": - if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): - raise ValueError("ESH channel frame_F must have shape (128, 8).") + Returns + ------- + float + SNR in dB. + - Returns +inf if noise power is zero. + - Returns -inf if signal power is zero. 
+ """ + x_ref = np.asarray(x_ref, dtype=np.float64) + x_hat = np.asarray(x_hat, dtype=np.float64) - frame_f = np.empty((128, 16), dtype=np.float64) - for j in range(8): - frame_f[:, 2 * j + 0] = chl_f[:, j] - frame_f[:, 2 * j + 1] = chr_f[:, j] - else: - if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1): - raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).") + if x_ref.ndim == 1: + x_ref = x_ref.reshape(-1, 1) + if x_hat.ndim == 1: + x_hat = x_hat.reshape(-1, 1) - frame_f = np.empty((1024, 2), dtype=np.float64) - frame_f[:, 0] = chl_f[:, 0] - frame_f[:, 1] = chr_f[:, 0] + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) - frame_t_hat = i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) + x_ref = x_ref[:n, :c] + x_hat = x_hat[:n, :c] - start = i * hop - y_pad[start:start + win, :] += frame_t_hat + err = x_ref - x_hat + ps = float(np.sum(x_ref * x_ref)) + pn = float(np.sum(err * err)) - # Remove boundary padding that encoder adds: hop samples at start and hop at end. - if y_pad.shape[0] < 2 * hop: - raise ValueError("Decoded stream too short to unpad.") + if pn <= 0.0: + return float("inf") + if ps <= 0.0: + return float("-inf") - y = y_pad[hop:-hop, :] - - sf.write(str(filename_out), y, 48000) - return y + return float(10.0 * np.log10(ps / pn)) def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float: """ - Demonstration for Level-1 codec. + Demonstration for the Level-1 codec. Runs: - aac_coder_1(filename_in) - - i_aac_coder_1(aac_seq_1, filename_out) + - aac_decoder_1(aac_seq_1, filename_out) and computes total SNR between original and decoded audio. Parameters ---------- - filename_in : str | Path + filename_in : Union[str, Path] Input WAV filename (stereo, 48 kHz). - filename_out : str | Path + filename_out : Union[str, Path] Output WAV filename (stereo, 48 kHz). Returns ------- - SNR : float - Overall Signal-to-Noise Ratio in dB. + float + Overall SNR in dB. 
""" filename_in = Path(filename_in) filename_out = Path(filename_out) - # Read original audio (reference) - x_ref, fs_ref = sf.read(str(filename_in), always_2d=True) - x_ref = np.asarray(x_ref, dtype=np.float64) + # Read original audio (reference) with the same validation as the codec. + x_ref, fs_ref = aac_read_wav_stereo_48k(filename_in) + if int(fs_ref) != 48000: + raise ValueError("Input sampling rate must be 48 kHz.") # Encode / decode aac_seq_1 = aac_coder_1(filename_in) - x_hat = i_aac_coder_1(aac_seq_1, filename_out) - x_hat = np.asarray(x_hat, dtype=np.float64) + x_hat = aac_decoder_1(aac_seq_1, filename_out) - # Ensure 2D stereo shape (N, 2) - if x_hat.ndim == 1: - x_hat = x_hat.reshape(-1, 1) - if x_ref.ndim == 1: - x_ref = x_ref.reshape(-1, 1) + # Optional sanity: ensure output file exists and is readable + x_hat_file, fs_hat = sf.read(str(filename_out), always_2d=True) + _ = x_hat_file + if int(fs_hat) != 48000: + raise ValueError("Decoded output sampling rate must be 48 kHz.") - # Align lengths (use common overlap) - n = min(x_ref.shape[0], x_hat.shape[0]) - x_ref = x_ref[:n, :] - x_hat = x_hat[:n, :] + return _snr_db(x_ref, x_hat) - # Match channel count conservatively (common channels) - c = min(x_ref.shape[1], x_hat.shape[1]) - x_ref = x_ref[:, :c] - x_hat = x_hat[:, :c] - - # Compute overall SNR over all samples and channels - err = x_ref - x_hat - p_signal = float(np.sum(x_ref * x_ref)) - p_noise = float(np.sum(err * err)) - - if p_noise <= 0.0: - return float("inf") - if p_signal <= 0.0: - # Degenerate case: silent input - return -float("inf") - # else: - snr_db = 10.0 * np.log10(p_signal / p_noise) - return float(snr_db) +# ----------------------------------------------------------------------------- +# CLI +# ----------------------------------------------------------------------------- if __name__ == "__main__": - # Example usage: + # Example: # python -m level_1.level_1 input.wav output.wav import sys if len(sys.argv) != 3: raise 
SystemExit("Usage: python -m level_1.level_1 ") - in_wav = sys.argv[1] - out_wav = sys.argv[2] + in_wav = Path(sys.argv[1]) + out_wav = Path(sys.argv[2]) print(f"Encoding/Decoding {in_wav} to {out_wav}") snr = demo_aac_1(in_wav, out_wav) print(f"SNR = {snr:.3f} dB") - diff --git a/source/level_1/tests/test_SSC.py b/source/level_1/tests/test_SSC.py deleted file mode 100644 index 362559e..0000000 --- a/source/level_1/tests/test_SSC.py +++ /dev/null @@ -1,199 +0,0 @@ -import numpy as np -import pytest - -# Adjust the import based on package/module layout. -from level_1.level_1 import SSC - -# Helper "fixtures" for SSC -# ----------------------------------------------------------------------------- - -def _next_frame_no_attack() -> np.ndarray: - """ - Build a next_frame_T that should NOT trigger ESH detection. - - Uses exact zeros so all s2l are zero and the ESH condition (s2l > 1e-3) cannot hold. - """ - return np.zeros((2048, 2), dtype=np.float64) - - -def _next_frame_strong_attack( - *, - attack_left: bool, - attack_right: bool, - segment_l: int = 4, - baseline: float = 1e-6, - burst_amp: float = 1.0, -) -> np.ndarray: - """ - Build a next_frame_T (2048x2) that should trigger ESH detection on selected channels. - - Spec: ESH if exists l in {1..7} with s2l > 1e-3 AND ds2l > 10. - We create: - - small baseline energy in all samples (avoids division by zero in ds2l), - - a strong burst inside one 128-sample segment l in 1..7. - """ - assert 1 <= segment_l <= 7 - x = np.full((2048, 2), baseline, dtype=np.float64) - - a = segment_l * 128 - b = (segment_l + 1) * 128 - - if attack_left: - x[a:b, 0] += burst_amp - if attack_right: - x[a:b, 1] += burst_amp - - return x - - -def _next_frame_below_s2l_threshold( - *, - left: bool, - right: bool, - segment_l: int = 4, - impulse_amp: float = 0.01, -) -> np.ndarray: - """ - Construct a next_frame_T where s2l is below 1e-3, so ESH must NOT be triggered, - even if ds2l could be large. 
- - Put a single impulse of amplitude 'impulse_amp' inside a segment. - Energy in the 128-sample segment: s2l ~= impulse_amp^2. - With impulse_amp=0.01 => s2l ~= 1e-4 < 1e-3. - """ - assert 1 <= segment_l <= 7 - x = np.zeros((2048, 2), dtype=np.float64) - - idx = segment_l * 128 + 10 # inside segment - if left: - x[idx, 0] = impulse_amp - if right: - x[idx, 1] = impulse_amp - - return x - - -# --------------------------------------------------------------------- -# 1) Fixed/mandatory cases (prev frame type forces current type) -# --------------------------------------------------------------------- - -def test_ssc_fixed_cases_prev_lss_and_lps() -> None: - """ - Spec: if prev was: - - LSS => current MUST be ESH - - LPS => current MUST be OLS - independent of next frame check. - """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - - # Even if next frame has a strong attack, LSS must force ESH. - next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True) - out1 = SSC(frame_t, next_attack, "LSS") - assert out1 == "ESH" - - # Even if next frame has a strong attack, LPS must force OLS. - out2 = SSC(frame_t, next_attack, "LPS") - assert out2 == "OLS" - - -# --------------------------------------------------------------------- -# 2) Cases requiring next-frame ESH prediction (energy/attack computation) -# --------------------------------------------------------------------- - -def test_prev_ols_next_not_esh_returns_ols() -> None: - """ - Spec: if prev=OLS, current is OLS or LSS. - Choose LSS iff (i+1) predicted ESH, else OLS. - """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_no_attack() - - out = SSC(frame_t, next_t, "OLS") - assert out == "OLS" - - -def test_prev_ols_next_esh_both_channels_returns_lss() -> None: - """ - prev=OLS, next predicted ESH (both channels) => per-channel decisions are LSS and LSS - and merge table keeps LSS. 
- """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) - - out = SSC(frame_t, next_t, "OLS") - assert out == "LSS" - - -def test_prev_ols_next_esh_one_channel_returns_lss() -> None: - """ - prev=OLS: - - one channel predicts ESH => LSS - - other channel predicts not ESH => OLS - Merge table: OLS + LSS => LSS. - """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - - next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) - out1 = SSC(frame_t, next1_t, "OLS") - assert out1 == "LSS" - - next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True) - out2 = SSC(frame_t, next2_t, "OLS") - assert out2 == "LSS" - - -def test_prev_esh_next_esh_both_channels_returns_esh() -> None: - """ - prev=ESH: - - next predicted ESH => current ESH (per-channel) - Merge table: ESH + ESH => ESH. - """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_strong_attack(attack_left=True, attack_right=True) - - out = SSC(frame_t, next_t, "ESH") - assert out == "ESH" - - -def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None: - """ - prev=ESH: - - next not predicted ESH => current LPS (per-channel) - Merge table: LPS + LPS => LPS. - """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_no_attack() - - out = SSC(frame_t, next_t, "ESH") - assert out == "LPS" - - -def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None: - """ - prev=ESH: - - one channel predicts ESH => ESH - - other channel predicts not ESH => LPS - Merge table: ESH + LPS => ESH. 
- """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - - next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False) - out1 = SSC(frame_t, next1_t, "ESH") - assert out1 == "ESH" - - next2_t = _next_frame_strong_attack(attack_left=True, attack_right=False) - out2 = SSC(frame_t, next2_t, "ESH") - assert out2 == "ESH" - -def test_threshold_s2l_must_exceed_1e_3() -> None: - """ - Spec: next frame is ESH only if s2l > 1e-3 AND ds2l > 10 for some l in 1..7. - This test checks the necessity of the s2l threshold: - - Create a frame with s2l ~= 1e-4 < 1e-3 (single impulse with amp 0.01). - - Expect: not classified as ESH -> for prev=OLS return OLS. - """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - next_t = _next_frame_below_s2l_threshold(left=True, right=True, impulse_amp=0.01) - - out = SSC(frame_t, next_t, "OLS") - assert out == "OLS" diff --git a/source/level_1/tests/test_filterbank.py b/source/level_1/tests/test_filterbank.py deleted file mode 100644 index 5c3f5ef..0000000 --- a/source/level_1/tests/test_filterbank.py +++ /dev/null @@ -1,235 +0,0 @@ -import numpy as np -import pytest - -from level_1.level_1 import FrameType, WinType, filter_bank, i_filter_bank - -# Helper "fixtures" for filterbank -# ----------------------------------------------------------------------------- - -def _ola_reconstruct(x: np.ndarray, frame_types: list[str], win_type: str) -> np.ndarray: - """ - Analyze-synthesize each frame and overlap-add with hop=1024. 
- x: shape (N,2) - frame_types: length K, for frames starting at i*1024 - """ - hop = 1024 - win = 2048 - K = len(frame_types) - - y = np.zeros_like(x, dtype=np.float64) - - for i in range(K): - start = i * hop - frame_t = x[start:start + win, :] - frame_f = filter_bank(frame_t, frame_types[i], win_type) - frame_t_hat = i_filter_bank(frame_f, frame_types[i], win_type) - y[start:start + win, :] += frame_t_hat - - return y - - -def _snr_db(x: np.ndarray, y: np.ndarray) -> float: - err = x - y - ps = float(np.sum(x * x)) - pn = float(np.sum(err * err)) - if pn <= 0.0: - return float("inf") - return 10.0 * np.log10(ps / pn) - -# --------------------------------------------------------------------- -# Forward filterbank tests -# --------------------------------------------------------------------- - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"]) -def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None: - """ - Contract test: - For OLS/LSS/LPS, filter_bank returns shape (1024, 2). - """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - frame_f = filter_bank(frame_t, frame_type, win_type) - assert frame_f.shape == (1024, 2) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_shapes_esh(win_type: WinType) -> None: - """ - Contract test: - For ESH, filter_bank returns shape (128, 16). - """ - frame_t = np.zeros((2048, 2), dtype=np.float64) - frame_f = filter_bank(frame_t, "ESH", win_type) - assert frame_f.shape == (128, 16) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None: - """ - Module behavior test: - For OLS (representative long-sequence), channels are processed independently: - - If right channel is zero and left is random, right spectrum should be near zero. 
- """ - rng = np.random.default_rng(0) - frame_t = np.zeros((2048, 2), dtype=np.float64) - frame_t[:, 0] = rng.normal(size=2048) - - frame_f = filter_bank(frame_t, "OLS", win_type) - - # Right channel output should be (close to) zero - assert np.max(np.abs(frame_f[:, 1])) < 1e-9 - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_channel_isolation_esh(win_type: WinType) -> None: - """ - Module behavior test: - For ESH, channels are processed independently: - - If right channel is zero and left is random, all odd columns (right) should be near zero. - """ - rng = np.random.default_rng(1) - frame_t = np.zeros((2048, 2), dtype=np.float64) - frame_t[:, 0] = rng.normal(size=2048) - - frame_f = filter_bank(frame_t, "ESH", win_type) - - # Right channel appears in columns 1,3,5,...,15 - right_cols = frame_f[:, 1::2] - assert np.max(np.abs(right_cols)) < 1e-9 - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None: - """ - Spec-driven behavior test: - ESH uses only the central 1152 samples (from 448 to 1599), split into 8 overlapping - windows of length 256 with 50% overlap. - - Therefore, changing samples outside [448, 1600) must not affect the output. 
- """ - rng = np.random.default_rng(2) - - frame_a = np.zeros((2048, 2), dtype=np.float64) - frame_b = np.zeros((2048, 2), dtype=np.float64) - - # Same central region for both frames - center = rng.normal(size=(1152, 2)) - frame_a[448:1600, :] = center - frame_b[448:1600, :] = center - - # Modify only the outer regions of frame_b - frame_b[0:448, :] = rng.normal(size=(448, 2)) - frame_b[1600:2048, :] = rng.normal(size=(448, 2)) - - fa = filter_bank(frame_a, "ESH", win_type) - fb = filter_bank(frame_b, "ESH", win_type) - - np.testing.assert_allclose(fa, fb, rtol=0.0, atol=0.0) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_filterbank_output_is_finite(win_type: WinType) -> None: - """ - Sanity test: - Output must not contain NaN or inf for representative cases. - """ - rng = np.random.default_rng(3) - frame_t = rng.normal(size=(2048, 2)).astype(np.float64) - - for frame_type in ("OLS", "LSS", "ESH", "LPS"): - frame_f = filter_bank(frame_t, frame_type, win_type) - assert np.isfinite(frame_f).all() - - -# --------------------------------------------------------------------- -# Reverse i_filterbank tests -# --------------------------------------------------------------------- - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ifilterbank_shapes_long_sequences(win_type: str) -> None: - frame_f = np.zeros((1024, 2), dtype=np.float64) - for frame_type in ("OLS", "LSS", "LPS"): - frame_t = i_filter_bank(frame_f, frame_type, win_type) - assert frame_t.shape == (2048, 2) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ifilterbank_shapes_esh(win_type: str) -> None: - frame_f = np.zeros((128, 16), dtype=np.float64) - frame_t = i_filter_bank(frame_f, "ESH", win_type) - assert frame_t.shape == (2048, 2) - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_roundtrip_per_frame_is_finite(win_type: str) -> None: - rng = np.random.default_rng(0) - frame_t = rng.normal(size=(2048, 2)).astype(np.float64) - - for 
frame_type in ("OLS", "LSS", "ESH", "LPS"): - frame_f = filter_bank(frame_t, frame_type, win_type) - frame_t_hat = i_filter_bank(frame_f, frame_type, win_type) - assert np.isfinite(frame_t_hat).all() - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ola_reconstruction_ols_high_snr(win_type: str) -> None: - """ - Core module-level test: - OLS analysis+synthesis with hop=1024 must reconstruct with high SNR - in the steady-state region. - """ - rng = np.random.default_rng(1) - - K = 6 - N = 1024 * (K + 1) - x = rng.normal(size=(N, 2)).astype(np.float64) - - y = _ola_reconstruct(x, ["OLS"] * K, win_type) - - # Exclude edges (first and last hop) where full overlap is not available - a = 1024 - b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) - assert snr > 50.0 - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ola_reconstruction_esh_high_snr(win_type: str) -> None: - """ - ESH analysis+synthesis with hop=1024 must reconstruct with high SNR - in the steady-state region. 
- """ - rng = np.random.default_rng(2) - - K = 6 - N = 1024 * (K + 1) - x = rng.normal(size=(N, 2)).astype(np.float64) - - y = _ola_reconstruct(x, ["ESH"] * K, win_type) - - a = 1024 - b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) - assert snr > 45.0 - - -@pytest.mark.parametrize("win_type", ["SIN", "KBD"]) -def test_ola_reconstruction_transition_sequence(win_type: str) -> None: - """ - Transition sequence test matching the windowing logic: - OLS -> LSS -> ESH -> LPS -> OLS -> OLS - """ - rng = np.random.default_rng(3) - - frame_types = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"] - K = len(frame_types) - N = 1024 * (K + 1) - x = rng.normal(size=(N, 2)).astype(np.float64) - - y = _ola_reconstruct(x, frame_types, win_type) - - a = 1024 - b = N - 1024 - snr = _snr_db(x[a:b, :], y[a:b, :]) - assert snr > 40.0 diff --git a/source/level_2/level_2.py b/source/level_2/level_2.py new file mode 100644 index 0000000..eb5dc92 --- /dev/null +++ b/source/level_2/level_2.py @@ -0,0 +1,21 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Level 2 Wrappers + Demo +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 2 wrapper module. +# +# This file provides: +# - Thin wrappers for Level 2 API functions (encode/decode) that delegate +# to the corresponding core implementations. +# - A demo function that runs end-to-end and computes SNR. +# - A small CLI entrypoint for convenience. +# ------------------------------------------------------------ +from __future__ import annotations + diff --git a/source/pytest.ini b/source/pytest.ini new file mode 100644 index 0000000..cdcbe3c --- /dev/null +++ b/source/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +pythonpath = . +testpaths = + core/tests \ No newline at end of file