Level_1: File restructure to support centralized development

This commit is contained in:
Christos Choutouridis 2026-02-08 17:22:23 +02:00
parent dde11ddebe
commit 8427d0e721
25 changed files with 3990 additions and 1229 deletions

198
source/core/aac_coder.py Normal file
View File

@ -0,0 +1,198 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 1 AAC encoder orchestration.
# Keeps the same functional behavior as the original level_1 implementation:
# - Reads WAV via soundfile
# - Validates stereo and 48 kHz
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
# - SSC decision uses next-frame attack detection
# - Filterbank analysis (MDCT)
# - Stores per-channel spectra in AACSeq1 schema:
# * ESH: (128, 8)
# * else: (1024, 1)
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from core.aac_configuration import WIN_TYPE
from core.aac_filterbank import aac_filter_bank
from core.aac_ssc import aac_SSC
from core.aac_types import *
# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
    """
    Load a WAV file through soundfile and enforce the Level-1 input format.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Path of the WAV file to read.

    Returns
    -------
    x : StereoSignal (np.ndarray)
        Samples converted to float64, shape (N, 2).
    fs : int
        Sampling rate in Hz (always 48000 when the function returns).

    Raises
    ------
    ValueError
        If the file does not have exactly two channels, or if its sampling
        rate is not 48 kHz. The channel count is checked first.
    """
    wav_path = Path(filename_in)

    # always_2d=True guarantees a (N, channels) array even for mono files,
    # so the channel count can be read from axis 1 unconditionally.
    samples, rate = sf.read(str(wav_path), always_2d=True)
    samples = np.asarray(samples, dtype=np.float64)

    n_channels = samples.shape[1]
    if n_channels != 2:
        raise ValueError("Input must be stereo (2 channels).")

    if int(rate) != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    return samples, int(rate)
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split a stereo FrameF into the per-channel arrays stored in AACSeq1.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS". Only "ESH" selects the short-block
        layout; every other value is handled as a long frame.
    frame_f : FrameF
        Stereo spectrum:
          - ESH:  shape (128, 16), columns interleaved as [L0 R0 L1 R1 ... L7 R7]
          - else: shape (1024, 2), columns [L, R]

    Returns
    -------
    chl_f : FrameChannelF
        Left channel coefficients: (128, 8) for ESH, (1024, 1) otherwise.
    chr_f : FrameChannelF
        Right channel coefficients: (128, 8) for ESH, (1024, 1) otherwise.

    Raises
    ------
    ValueError
        If frame_f does not have the shape required by frame_type.
    """
    if frame_type != "ESH":
        # Long frames: keep each channel as a single column, (1024, 1).
        if frame_f.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")
        left_col = frame_f[:, 0:1].astype(np.float64, copy=False)
        right_col = frame_f[:, 1:2].astype(np.float64, copy=False)
        return left_col, right_col

    if frame_f.shape != (128, 16):
        raise ValueError("For ESH, frame_f must have shape (128, 16).")

    # De-interleave: even columns belong to the left channel, odd to the right.
    left_blocks = np.empty((128, 8), dtype=np.float64)
    right_blocks = np.empty((128, 8), dtype=np.float64)
    left_blocks[:, :] = frame_f[:, 0::2]
    right_blocks[:, :] = frame_f[:, 1::2]
    return left_blocks, right_blocks
# -----------------------------------------------------------------------------
# Level 1 encoder
# -----------------------------------------------------------------------------
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
    """
    Level-1 AAC encoder.

    Pipeline:
      1. Read the WAV file (must be stereo, 48 kHz).
      2. Prepend and append `hop` (1024) zero samples.
      3. Slice the padded stream into frames of 2048 samples with hop 1024.
      4. For each frame, pick the frame type with aac_SSC() using the next
         frame as look-ahead (zero-extended when it runs past the stream).
      5. Run the MDCT filterbank and store the per-channel coefficients.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename. Assumption: stereo audio, sampling rate 48 kHz.

    Returns
    -------
    AACSeq1
        One dict per frame with keys "frame_type", "win_type", "chl", "chr";
        each channel entry is {"frame_F": coefficients}.

    Raises
    ------
    ValueError
        If the input is not stereo / 48 kHz, or on an internal framing error.
    """
    hop = 1024
    win = 2048

    # The reader already rejects anything that is not 48 kHz, so the rate
    # itself is not needed below.
    samples, _fs = aac_read_wav_stereo_48k(filename_in)

    # One hop of silence on each side of the signal.
    head = np.zeros((hop, 2), dtype=np.float64)
    tail = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([head, samples, tail])

    # Count only frames that fit completely inside the padded stream.
    n_frames = int((x_pad.shape[0] - win) // hop + 1)
    if n_frames <= 0:
        raise ValueError("Input too short for framing.")

    win_type: WinType = WIN_TYPE
    prev_frame_type: FrameType = "OLS"  # the sequence starts from a long frame
    aac_seq: AACSeq1 = []

    for start in range(0, n_frames * hop, hop):
        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            # Guard only; n_frames is chosen so that every frame is complete.
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Look-ahead frame for attack detection; the last frames run past the
        # end of the padded stream and are completed with zeros.
        lookahead = x_pad[start + hop:start + hop + win, :]
        missing = win - lookahead.shape[0]
        if missing > 0:
            filler = np.zeros((missing, 2), dtype=np.float64)
            lookahead = np.vstack([lookahead, filler])

        frame_type = aac_SSC(frame_t, lookahead, prev_frame_type)
        frame_f = aac_filter_bank(frame_t, frame_type, win_type)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)

        encoded_frame = {
            "frame_type": frame_type,
            "win_type": win_type,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        }
        aac_seq.append(encoded_frame)

        prev_frame_type = frame_type

    return aac_seq

View File

@ -0,0 +1,22 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Configuration
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module contains the global configurations
#
# ------------------------------------------------------------
from __future__ import annotations
# Imports
from core.aac_types import WinType
# Global window family.
# Options: "SIN" (sinusoid), "KBD" (Kaiser-Bessel-Derived)
# The Level-1 encoder copies this value into every frame it produces.
WIN_TYPE: WinType = "SIN"

166
source/core/aac_decoder.py Normal file
View File

@ -0,0 +1,166 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Inverse AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
# Keeps the same functional behavior as the original level_1 implementation:
# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank()
# - IMDCT synthesis per frame
# - Overlap-add with hop=1024
# - Remove encoder boundary padding: hop at start and hop at end
#
# Note:
# aac_decoder_1() returns the reconstructed samples and also writes them to
# the requested WAV file (48 kHz).
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from core.aac_filterbank import aac_i_filter_bank
from core.aac_types import *
# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Combine the per-channel spectra of one AACSeq1 frame into the stereo
    FrameF layout consumed by aac_i_filter_bank().

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS". Only "ESH" selects the short-block
        layout; every other value is handled as a long frame.
    chl_f : FrameChannelF
        Left channel coefficients: (128, 8) for ESH, (1024, 1) otherwise.
    chr_f : FrameChannelF
        Right channel coefficients: (128, 8) for ESH, (1024, 1) otherwise.

    Returns
    -------
    FrameF
        Stereo coefficients (float64):
          - ESH:  (128, 16), columns interleaved as [L0 R0 L1 R1 ... L7 R7]
          - else: (1024, 2), columns [L, R]

    Raises
    ------
    ValueError
        If either channel array does not have the shape required by frame_type.
    """
    if frame_type == "ESH":
        shapes_ok = chl_f.shape == (128, 8) and chr_f.shape == (128, 8)
        if not shapes_ok:
            raise ValueError("ESH channel frame_F must have shape (128, 8).")

        # Interleave: left blocks on even columns, right blocks on odd columns.
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = chl_f
        packed[:, 1::2] = chr_f
        return packed

    # Long frames are stored as single-column arrays in the Level-1 schema.
    shapes_ok = chl_f.shape == (1024, 1) and chr_f.shape == (1024, 1)
    if not shapes_ok:
        raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")

    stereo = np.empty((1024, 2), dtype=np.float64)
    stereo[:, 0:1] = chl_f
    stereo[:, 1:2] = chr_f
    return stereo
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Remove the boundary padding that the Level-1 encoder adds:
    hop samples at start and hop samples at end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Number of samples to drop from each end (default 1024). Must be >= 0;
        hop == 0 returns the whole stream.

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2). This is a
        slice of y_pad, not a copy.

    Raises
    ------
    ValueError
        If hop is negative, or if y_pad is too short to unpad.
    """
    if hop < 0:
        raise ValueError("hop must be non-negative.")

    n_total = y_pad.shape[0]
    if n_total < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")

    # Use an explicit end index: the shorthand y_pad[hop:-hop] collapses to an
    # empty slice for hop == 0 because -0 == 0.
    return y_pad[hop:n_total - hop, :]
# -----------------------------------------------------------------------------
# Level 1 decoder (core)
# -----------------------------------------------------------------------------
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    Steps:
      - Synthesize every frame with the inverse filterbank.
      - Overlap-add the frames at a hop of 1024 samples into the padded stream.
      - Drop one hop of padding from each end.
      - Write the result as a 48 kHz WAV file and return it.

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded time-domain samples, stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        Propagated from the helpers (wrong coefficient shapes, invalid frame
        type, or a stream too short to unpad).
    """
    out_path = Path(filename_out)
    hop = 1024
    win = 2048

    # The last frame starts at (n_frames - 1) * hop and is `win` samples long,
    # which fixes the length of the padded output stream.
    n_frames = len(aac_seq_1)
    padded_len = (n_frames - 1) * hop + win
    y_pad: StereoSignal = np.zeros((padded_len, 2), dtype=np.float64)

    for index, frame in enumerate(aac_seq_1):
        frame_type: FrameType = frame["frame_type"]
        win_type: WinType = frame["win_type"]

        left_f = np.asarray(frame["chl"]["frame_F"], dtype=np.float64)
        right_f = np.asarray(frame["chr"]["frame_F"], dtype=np.float64)

        stereo_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, left_f, right_f)
        synthesized: FrameT = aac_i_filter_bank(stereo_f, frame_type, win_type)  # (2048, 2)

        # Overlap-add this frame at its hop-aligned position.
        offset = index * hop
        y_pad[offset:offset + win, :] += synthesized

    y: StereoSignal = aac_remove_padding(y_pad, hop=hop)

    # Level 1 assumption: 48 kHz output.
    sf.write(str(out_path), y, 48000)
    return y

View File

@ -0,0 +1,454 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank module
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking
#
# ------------------------------------------------------------
from __future__ import annotations
from core.aac_types import *
from scipy.signal.windows import kaiser
# Private helpers for Filterbank
# ------------------------------------------------------------
def _sin_window(N: int) -> Window:
"""
Build a sinusoidal (SIN) window of length N.
The AAC sinusoid window is:
w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N
Parameters
----------
N : int
Window length in samples.
Returns
-------
Window
1-D array of shape (N, ) with dtype float64.
"""
n = np.arange(N, dtype=np.float64)
return np.sin((np.pi / N) * (n + 0.5))
def _kbd_window(N: int, alpha: float) -> Window:
"""
Build a Kaiser-Bessel-Derived (KBD) window of length N.
This follows the standard KBD construction used in AAC:
1) Build a Kaiser kernel of length (N/2 + 1).
2) Form the left half by cumulative summation, normalization, and sqrt.
3) Mirror the left half to form the right half (symmetric full-length window).
Notes
-----
- N must be even (AAC uses N=2048 for long and N=256 for short).
- The assignment specifies alpha=6 for long windows and alpha=4 for short windows.
- The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context.
Parameters
----------
N : int
Window length in samples (must be even).
alpha : float
KBD alpha parameter.
Returns
-------
Window
1-D array of shape (N,) with dtype float64.
"""
half = N // 2
# Kaiser kernel length: half + 1 samples (0 .. half)
# beta = pi * alpha per the usual correspondence with the ISO definition
kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
csum = np.cumsum(kernel)
denom = csum[-1]
w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1
w_right = w_left[::-1] # mirror for second half
return np.concatenate([w_left, w_right])
def _long_window(win_type: WinType) -> Window:
    """
    Build the long (2048-sample) window of the requested family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (2048,).

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    long_len = 2048

    if win_type == "KBD":
        # alpha = 6 is the value used for long KBD windows here.
        return _kbd_window(long_len, alpha=6.0)
    if win_type == "SIN":
        return _sin_window(long_len)

    raise ValueError(f"Invalid win_type: {win_type!r}")
def _short_window(win_type: WinType) -> Window:
    """
    Build the short (256-sample) window of the requested family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (256,).

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    short_len = 256

    if win_type == "KBD":
        # alpha = 4 is the value used for short KBD windows here.
        return _kbd_window(short_len, alpha=4.0)
    if win_type == "SIN":
        return _sin_window(short_len)

    raise ValueError(f"Invalid win_type: {win_type!r}")
def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window:
    """
    Build the 2048-sample analysis/synthesis window for a long-type frame.

    Both the long window (Wl, 2048) and the short window (Ws, 256) come from
    the same family; mixed KBD/SIN halves are not supported.

    Layout per frame type:
      - "OLS": Wl
      - "LSS": [Wl[0:1024], ones(448), Ws[128:256], zeros(448)]
      - "LPS": [zeros(448), Ws[0:128], ones(448), Wl[1024:2048]]

    Parameters
    ----------
    frame_type : FrameType
        One of "OLS", "LSS", "LPS".
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (2048,).

    Raises
    ------
    ValueError
        If win_type is invalid (raised by the window builders) or if
        frame_type is not one of the three long-type frames.
    """
    long_w = _long_window(win_type)    # (2048,)
    short_w = _short_window(win_type)  # (256,)

    if frame_type == "OLS":
        return long_w

    flat_top = np.ones(448, dtype=np.float64)
    silence = np.zeros(448, dtype=np.float64)

    if frame_type == "LSS":
        # Long rise, flat part, short fall, then zeros up to 2048 samples.
        return np.concatenate([long_w[:1024], flat_top, short_w[128:], silence])

    if frame_type == "LPS":
        # Mirror image of LSS: zeros, short rise, flat part, long fall.
        return np.concatenate([silence, short_w[:128], flat_top, long_w[1024:]])

    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
def _mdct(s: TimeSignal) -> MdctCoeffs:
"""
MDCT (direct form) as specified in the assignment.
Parameters
----------
s : TimeSignal
Windowed time samples, 1-D array of length N (N = 2048 or 256).
Returns
-------
MdctCoeffs
MDCT coefficients, 1-D array of length N/2.
Definition
----------
X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
where n0 = (N/2 + 1)/2.
"""
s = np.asarray(s, dtype=np.float64).reshape(-1)
N = int(s.shape[0])
if N not in (2048, 256):
raise ValueError("MDCT input length must be 2048 or 256.")
n0 = (N / 2.0 + 1.0) / 2.0
n = np.arange(N, dtype=np.float64) + n0
k = np.arange(N // 2, dtype=np.float64) + 0.5
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2)
X = 2.0 * (s @ C) # (N/2,)
return X
def _imdct(X: MdctCoeffs) -> TimeSignal:
"""
IMDCT (direct form) as specified in the assignment.
Parameters
----------
X : MdctCoeffs
MDCT coefficients, 1-D array of length K (K = 1024 or 128).
Returns
-------
TimeSignal
Reconstructed time samples, 1-D array of length N = 2K.
Definition
----------
s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
where n0 = (N/2 + 1)/2.
"""
X = np.asarray(X, dtype=np.float64).reshape(-1)
K = int(X.shape[0])
if K not in (1024, 128):
raise ValueError("IMDCT input length must be 1024 or 128.")
N = 2 * K
n0 = (N / 2.0 + 1.0) / 2.0
n = np.arange(N, dtype=np.float64) + n0
k = np.arange(K, dtype=np.float64) + 0.5
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K)
s = (2.0 / N) * (C @ X) # (N,)
return s
def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF:
    """
    ESH (eight short blocks) analysis for a single channel.

    Eight 256-sample segments are taken from the frame at offsets
    448 + 128*j (j = 0..7), so consecutive segments overlap by 128 samples.
    Each segment is multiplied by the short window and transformed with
    the MDCT.

    Parameters
    ----------
    x_ch : FrameChannelT
        Time-domain channel frame (expected shape: (2048,)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelF
        Array of shape (128, 8); column j holds the 128 MDCT coefficients
        of the j-th short block.
    """
    short_w = _short_window(win_type)  # (256,)

    first_offset = 448
    block_hop = 128
    block_len = 256

    spectra = []
    for offset in range(first_offset, first_offset + 8 * block_hop, block_hop):
        windowed = x_ch[offset:offset + block_len] * short_w  # (256,)
        spectra.append(_mdct(windowed))                       # (128,)

    # One column per short block.
    return np.stack(spectra, axis=1)
def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
"""
Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).
Parameters
----------
frame_F : FrameF
Packed ESH spectrum (expected shape: (128, 16)).
Returns
-------
left : FrameChannelF
Left channel spectrum, shape (128, 8).
right : FrameChannelF
Right channel spectrum, shape (128, 8).
Notes
-----
Inverse mapping of the packing used in aac_filter_bank():
packed[:, 2*j] = left[:, j]
packed[:, 2*j+1] = right[:, j]
"""
if frame_F.shape != (128, 16):
raise ValueError("ESH frame_F must have shape (128, 16).")
left = np.empty((128, 8), dtype=np.float64)
right = np.empty((128, 8), dtype=np.float64)
for j in range(8):
left[:, j] = frame_F[:, 2 * j + 0]
right[:, j] = frame_F[:, 2 * j + 1]
return left, right
def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT:
    """
    ESH (eight short blocks) synthesis for a single channel.

    Each column of X_esh is inverse-transformed to 256 samples, multiplied
    by the short window and added into the output frame at offset
    448 + 128*j, so consecutive blocks overlap by 128 samples.

    Parameters
    ----------
    X_esh : FrameChannelF
        MDCT coefficients of the 8 short blocks (shape: (128, 8)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelT
        Time-domain channel contribution, shape (2048,). The short blocks are
        already overlap-added with each other; the caller still overlap-adds
        this frame with its neighbours.

    Raises
    ------
    ValueError
        If X_esh is not of shape (128, 8).
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")

    short_w = _short_window(win_type)  # (256,)
    frame = np.zeros(2048, dtype=np.float64)

    # Iterating over the transpose yields one short-block spectrum per step.
    for block_index, coeffs in enumerate(X_esh.T):
        offset = 448 + 128 * block_index
        frame[offset:offset + 256] += _imdct(coeffs) * short_w

    return frame
# -----------------------------------------------------------------------------
# Public Function prototypes (Level 1)
# -----------------------------------------------------------------------------
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis) for one stereo frame.

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN") used for the current frame.

    Returns
    -------
    frame_F : FrameF
        MDCT coefficients:
          - "OLS"/"LSS"/"LPS": shape (1024, 2), columns [L, R].
          - "ESH": shape (128, 16); the 8 short blocks are stored as
            interleaved columns [L0 R0 L1 R1 ... L7 R7].

    Raises
    ------
    ValueError
        If frame_T is not (2048, 2) or frame_type is not a known frame type.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    left_t: FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
    right_t: FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type == "ESH":
        left_f = _filter_bank_esh_channel(left_t, win_type)    # (128, 8)
        right_f = _filter_bank_esh_channel(right_t, win_type)  # (128, 8)

        # Interleave the short blocks of both channels column by column.
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = left_f
        packed[:, 1::2] = right_f
        return packed

    if frame_type in ("OLS", "LSS", "LPS"):
        window = _window_sequence(frame_type, win_type)  # (2048,)
        left_f = _mdct(left_t * window)                  # (1024,)
        right_f = _mdct(right_t * window)                # (1024,)
        return np.stack((left_f, right_f), axis=1)       # (1024, 2)

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis) for one stereo frame.

    Parameters
    ----------
    frame_F : FrameF
        MDCT coefficients in the layout produced by aac_filter_bank():
        (1024, 2) for "OLS"/"LSS"/"LPS", (128, 16) for "ESH".
    frame_type : FrameType
        Frame type ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Windowed time-domain frame, stereo, shape (2048, 2).

    Raises
    ------
    ValueError
        If frame_F does not have the shape required by frame_type, or if
        frame_type is not a known frame type.
    """
    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")

        left_f, right_f = _unpack_esh(frame_F)
        left_t = _i_filter_bank_esh_channel(left_f, win_type)
        right_t = _i_filter_bank_esh_channel(right_f, win_type)
        return np.stack((left_t, right_t), axis=1)  # (2048, 2)

    if frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")

        # The same window sequence is applied on synthesis as on analysis.
        window = _window_sequence(frame_type, win_type)
        left_t = _imdct(frame_F[:, 0]) * window
        right_t = _imdct(frame_F[:, 1]) * window
        return np.stack((left_t, right_t), axis=1)  # (2048, 2)

    raise ValueError(f"Invalid frame_type: {frame_type!r}")

217
source/core/aac_ssc.py Normal file
View File

@ -0,0 +1,217 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Sequence Segmentation Control module
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Sequence Segmentation Control module (SSC).
# Selects and returns the frame type based on input parameters.
# ------------------------------------------------------------
from __future__ import annotations
from typing import Dict, Tuple
from core.aac_types import FrameType, FrameT, FrameChannelT
import numpy as np
# -----------------------------------------------------------------------------
# Private helpers for SSC
# -----------------------------------------------------------------------------
# See Table 1 in mm-2025-hw-v0.1.pdf
# Lookup table used by _stereo_merge(): the key is the pair of per-channel
# decisions (left, right) and the value is the single frame type shared by
# both channels. All 16 combinations of the four frame types are listed.
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    # left = OLS
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    # left = LSS
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    # left = ESH (always merges to ESH)
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    # left = LPS
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
"""
Detect whether the *next* frame (single channel) implies an attack, i.e. ESH
according to the assignment's criterion.
Parameters
----------
next_frame_channel : FrameChannelT
One channel of next_frame_T (expected shape: (2048,)).
Returns
-------
bool
True if an attack is detected (=> next frame predicted ESH), else False.
Notes
-----
The criterion is implemented as described in the spec:
1) Apply the high-pass filter:
H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
implemented in the time domain as:
y[n] = x[n] - x[n-1] + 0.5*y[n-1]
2) Split y into 16 segments of length 128 and compute segment energies s[l].
3) Compute the ratio:
ds[l] = s[l] / s[l-1]
4) An attack exists if there exists l in {1..7} such that:
s[l] > 1e-3 and ds[l] > 10
"""
# Local alias; expected to be a 1-D array of length 2048.
x = next_frame_channel
# High-pass filter reference implementation (scalar recurrence).
y = np.zeros_like(x)
prev_x = 0.0
prev_y = 0.0
for n in range(x.shape[0]):
xn = float(x[n])
yn = (xn - prev_x) + 0.5 * prev_y
y[n] = yn
prev_x = xn
prev_y = yn
# Segment energies over 16 blocks of 128 samples.
s = np.empty(16, dtype=np.float64)
for l in range(16):
a = l * 128
b = (l + 1) * 128
seg = y[a:b]
s[l] = float(np.sum(seg * seg))
# ds[l] for l>=1. For l=0 not defined, keep 0.
ds = np.zeros(16, dtype=np.float64)
eps = 1e-12 # Avoid division by zero without materially changing the logic.
for l in range(1, 16):
ds[l] = s[l] / max(s[l - 1], eps)
# Spec: check l in {1..7}.
for l in range(1, 8):
if (s[l] > 1e-3) and (ds[l] > 10.0):
return True
return False
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
"""
Decide the current frame type for a single channel based on the previous
frame type and whether the next frame is predicted to be ESH.
Rules (spec):
- If prev is "LSS" => current is "ESH"
- If prev is "LPS" => current is "OLS"
- If prev is "OLS" => current is "LSS" if attack else "OLS"
- If prev is "ESH" => current is "ESH" if attack else "LPS"
Parameters
----------
prev_frame_type : FrameType
Previous frame type (one of "OLS", "LSS", "ESH", "LPS").
attack : bool
True if the next frame is predicted ESH for this channel.
Returns
-------
FrameType
The per-channel decision for the current frame.
"""
if prev_frame_type == "LSS":
return "ESH"
if prev_frame_type == "LPS":
return "OLS"
if prev_frame_type == "OLS":
return "LSS" if attack else "OLS"
if prev_frame_type == "ESH":
return "ESH" if attack else "LPS"
raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Combine the two per-channel frame type decisions into the single frame
    type used for both channels, via STEREO_MERGE_TABLE.

    Parameters
    ----------
    ft_l : FrameType
        Frame type decision for the left channel.
    ft_r : FrameType
        Frame type decision for the right channel.

    Returns
    -------
    FrameType
        The merged common frame type.

    Raises
    ------
    ValueError
        If the (left, right) pair is not present in the merge table.
    """
    pair = (ft_l, ft_r)
    try:
        merged = STEREO_MERGE_TABLE[pair]
    except KeyError as missing:
        # Report the offending pair; keep the KeyError as the cause.
        raise ValueError(f"Invalid stereo merge pair: {pair}") from missing
    return merged
# -----------------------------------------------------------------------------
# Public Function prototypes (Level 1)
# -----------------------------------------------------------------------------
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).

    Selects the frame type of the current frame (i) from the previous frame
    type and from attack detection on the next frame. The current frame
    itself is only checked for its shape; its samples do not enter the
    decision.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i (shape: (2048, 2)).
    next_frame_T : FrameT
        Next time-domain frame (i+1), used for attack detection
        (shape: (2048, 2)).
    prev_frame_type : FrameType
        Frame type chosen for the previous frame (i-1).

    Returns
    -------
    FrameType
        One of: "OLS", "LSS", "ESH", "LPS".

    Raises
    ------
    ValueError
        If either frame is not (2048, 2), or prev_frame_type is invalid.
    """
    expected_shape = (2048, 2)
    if frame_T.shape != expected_shape:
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != expected_shape:
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Attack detection runs on each channel of the look-ahead frame.
    attacks = tuple(_detect_attack(next_frame_T[:, ch]) for ch in (0, 1))

    # Both channels share the same previous frame type.
    decisions = tuple(_decide_frame_type(prev_frame_type, hit) for hit in attacks)

    # Reduce the (left, right) decisions to one common frame type.
    return _stereo_merge(decisions[0], decisions[1])

193
source/core/aac_types.py Normal file
View File

@ -0,0 +1,193 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Public Type Aliases
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements Public Type aliases
#
# ------------------------------------------------------------
from __future__ import annotations
from typing import List, Literal, TypeAlias, TypedDict
import numpy as np
from numpy.typing import NDArray
# -----------------------------------------------------------------------------
# Code enums (for readability; not intended to enforce shapes/lengths)
# -----------------------------------------------------------------------------
FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"]
"""
Frame type codes (AAC):
- "OLS": ONLY_LONG_SEQUENCE
- "LSS": LONG_START_SEQUENCE
- "ESH": EIGHT_SHORT_SEQUENCE
- "LPS": LONG_STOP_SEQUENCE
"""
WinType: TypeAlias = Literal["KBD", "SIN"]
"""
Window type codes (AAC):
- "KBD": Kaiser-Bessel-Derived
- "SIN": sinusoid
"""
ChannelKey: TypeAlias = Literal["chl", "chr"]
"""Channel dictionary keys used in Level 1 payloads."""
# -----------------------------------------------------------------------------
# Array “semantic” aliases
#
# Goal: communicate meaning (time/frequency/window, stereo/channel) without
# forcing strict shapes in the type system.
# -----------------------------------------------------------------------------
FloatArray: TypeAlias = NDArray[np.float64]
"""
Generic float64 NumPy array.
Note:
- We standardize internal numeric computations to float64 for stability and
reproducibility. External I/O can still be float32, but we convert at the
boundaries.
"""
Window: TypeAlias = FloatArray
"""
Time-domain window (weighting sequence), 1-D.
Typical lengths in this assignment:
- Long: 2048
- Short: 256
- Window sequences for LSS/LPS are also 2048
Expected shape: (N,)
dtype: float64
"""
TimeSignal: TypeAlias = FloatArray
"""
Time-domain signal samples, typically 1-D.
Examples:
- Windowed MDCT input: shape (N,)
- IMDCT output: shape (N,)
dtype: float64
"""
StereoSignal: TypeAlias = FloatArray
"""
Time-domain stereo signal stream.
Expected (typical) shape: (N, 2)
- axis 0: time samples
- axis 1: channels [L, R]
dtype: float64
"""
MdctCoeffs: TypeAlias = FloatArray
"""
MDCT coefficient vector, typically 1-D.
Examples:
- Long: shape (1024,)
- Short: shape (128,)
dtype: float64
"""
FrameT: TypeAlias = FloatArray
"""
Time-domain frame (stereo), as used by the filterbank input/output.
Expected (typical) shape for stereo: (2048, 2)
- axis 0: time samples
- axis 1: channels [L, R]
dtype: float64
"""
FrameChannelT: TypeAlias = FloatArray
"""
Time-domain single-channel frame.
Expected (typical) shape: (2048,)
dtype: float64
"""
FrameF: TypeAlias = FloatArray
"""
Frequency-domain frame (MDCT coefficients), stereo container.
Typical shapes (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: (1024, 2)
- If frame_type == "ESH": (128, 16)
Rationale for ESH (128, 16):
- 8 short subframes per channel => 8 * 2 = 16 columns total
- Each short subframe per stereo is (128, 2), flattened into columns
in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R]
dtype: float64
"""
FrameChannelF: TypeAlias = FloatArray
"""
Frequency-domain single-channel frame (MDCT coefficients).
Typical shapes (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) (single column, as stored in AACSeq1)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
dtype: float64
"""
# -----------------------------------------------------------------------------
# Level 1 AAC sequence payload types
# -----------------------------------------------------------------------------
class AACChannelFrameF(TypedDict):
    """
    Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).

    Keys
    ----
    frame_F:
        The MDCT coefficients for ONE channel.
        Shapes as stored by core.aac_coder.aac_pack_frame_f_to_seq_channels()
        and required by core.aac_decoder.aac_unpack_seq_channels_to_frame_f():
            - ESH:  (128, 8)   (8 short subframes, one per column)
            - else: (1024, 1)  (single column vector)
    """
    frame_F: FrameChannelF
class AACSeq1Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_1 (Level 1).

    Keys
    ----
    frame_type:
        Frame type chosen for this frame.
    win_type:
        Window type used for this frame.
    chl:
        Left-channel payload (holds the "frame_F" coefficients).
    chr:
        Right-channel payload (holds the "frame_F" coefficients).
    """
    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF
    chr: AACChannelFrameF
AACSeq1: TypeAlias = List[AACSeq1Frame]
"""
AAC sequence for Level 1:
List of length K (K = number of frames).
Each element is a dict with keys:
- "frame_type", "win_type", "chl", "chr"
"""

View File

@ -0,0 +1,234 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Sequence Segmentation Control Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Sequence Segmentation Control module (SSC).
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
from core.aac_ssc import aac_SSC
from core.aac_types import FrameT
# -----------------------------------------------------------------------------
# Helper fixtures for SSC
# -----------------------------------------------------------------------------
def _next_frame_no_attack() -> FrameT:
    """
    Return a silent next_frame_T that must NOT be classified as an attack.

    The frame is exactly zero everywhere, so every segment energy is zero and
    the criterion s[l] > 1e-3 cannot be satisfied for any segment l.

    Returns
    -------
    FrameT
        All-zero stereo frame, shape (2048, 2), dtype float64.
    """
    silent_shape = (2048, 2)
    silent_frame = np.zeros(silent_shape, dtype=np.float64)
    return silent_frame
def _next_frame_strong_attack(
    *,
    attack_left: bool,
    attack_right: bool,
    segment_l: int = 4,
    baseline: float = 1e-6,
    burst_amp: float = 1.0,
) -> FrameT:
    """
    Build a next_frame_T (2048x2) meant to trigger ESH detection on the
    selected channel(s).

    Attack criterion (spec):
        an attack exists if, for some l in {1..7},
            s[l] > 1e-3 and ds[l] > 10,
        where s[l] is the energy of the 128-sample segment l after high-pass
        filtering and ds[l] = s[l] / s[l-1].

    Construction:
        - Every sample starts at a small constant `baseline`, so the test does
          not depend on the epsilon guard in ds.
        - `burst_amp` is added over the whole of segment `segment_l` on each
          channel whose flag is set.

    Parameters
    ----------
    attack_left, attack_right : bool
        Whether to place the burst on the left / right channel.
    segment_l : int
        Index of the 128-sample segment receiving the burst; must be in 1..7.
    baseline : float
        Constant level of the whole frame before the burst is added.
    burst_amp : float
        Amplitude added on top of the baseline inside the chosen segment.

    Returns
    -------
    FrameT
        Stereo frame, shape (2048, 2), dtype float64.

    Raises
    ------
    ValueError
        If segment_l is outside [1, 7].
    """
    if not 1 <= segment_l <= 7:
        raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")

    seg_len = 128
    burst_span = slice(segment_l * seg_len, (segment_l + 1) * seg_len)

    frame = np.full((2048, 2), baseline, dtype=np.float64)
    # Column 0 is the left channel, column 1 the right channel.
    for channel, enabled in enumerate((attack_left, attack_right)):
        if enabled:
            frame[burst_span, channel] += burst_amp
    return frame
def _next_frame_below_s_threshold(
    *,
    left: bool,
    right: bool,
    segment_l: int = 4,
    impulse_amp: float = 0.01,
) -> FrameT:
    """
    Build a next_frame_T whose segment energy stays below 1e-3, so ESH must
    NOT be triggered even though the ratio ds[l] may be large.

    A single impulse of amplitude `impulse_amp` is written 10 samples into
    segment `segment_l`; everything else is zero.
    Approximate segment energy: s[l] ~= impulse_amp^2
    (e.g. impulse_amp = 0.01 => s[l] ~= 1e-4 < 1e-3).

    Parameters
    ----------
    left, right : bool
        Whether to place the impulse on the left / right channel.
    segment_l : int
        Index of the 128-sample segment holding the impulse; must be in 1..7.
    impulse_amp : float
        Amplitude of the impulse.

    Returns
    -------
    FrameT
        Stereo frame, shape (2048, 2), dtype float64.

    Raises
    ------
    ValueError
        If segment_l is outside [1, 7].
    """
    if not 1 <= segment_l <= 7:
        raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")

    frame = np.zeros((2048, 2), dtype=np.float64)
    impulse_pos = 10 + 128 * segment_l  # 10 samples into segment l
    # Column 0 is the left channel, column 1 the right channel.
    for channel, enabled in enumerate((left, right)):
        if enabled:
            frame[impulse_pos, channel] = impulse_amp
    return frame
# -----------------------------------------------------------------------------
# 1) Fixed/mandatory cases (prev frame type forces current type)
# -----------------------------------------------------------------------------
def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
    """
    Mandatory transitions (spec), independent of attack detection on (i+1):
        - previous LSS => current MUST be ESH
        - previous LPS => current MUST be OLS

    A strongly attacking next frame is used on purpose to show that the
    look-ahead does not influence these two cases.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacking_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    forced_transitions = (("LSS", "ESH"), ("LPS", "OLS"))
    for prev_type, expected_type in forced_transitions:
        decided = aac_SSC(current_frame, attacking_next, prev_type)
        assert decided == expected_type
# -----------------------------------------------------------------------------
# 2) Cases requiring next-frame ESH prediction (attack computation)
# -----------------------------------------------------------------------------
def test_prev_ols_next_not_esh_returns_ols() -> None:
    """
    With prev=OLS the current frame is LSS iff frame (i+1) is predicted ESH,
    otherwise OLS. The next frame here carries no attack => expect OLS.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    quiet_next = _next_frame_no_attack()

    decided = aac_SSC(current_frame, quiet_next, "OLS")

    assert decided == "OLS"
def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
    """
    prev=OLS with an attack predicted on both channels of the next frame:
        per-channel decisions: LSS, LSS
        merged decision:       LSS
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacking_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    decided = aac_SSC(current_frame, attacking_next, "OLS")

    assert decided == "LSS"
def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
    """
    prev=OLS with an attack on exactly one channel of the next frame:
        - attacked channel     => LSS
        - non-attacked channel => OLS
    Merge table: OLS + LSS => LSS, whichever side carries the attack.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # Left-only attack first, then right-only attack.
    for on_left, on_right in ((True, False), (False, True)):
        one_sided_next = _next_frame_strong_attack(attack_left=on_left, attack_right=on_right)
        decided = aac_SSC(current_frame, one_sided_next, "OLS")
        assert decided == "LSS"
def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
    """
    prev=ESH with an attack predicted on both channels of the next frame:
        per-channel decisions: ESH, ESH
        merged decision:       ESH
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacking_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    decided = aac_SSC(current_frame, attacking_next, "ESH")

    assert decided == "ESH"
def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
    """
    prev=ESH with no attack predicted on either channel of the next frame:
        per-channel decisions: LPS, LPS
        merged decision:       LPS
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    quiet_next = _next_frame_no_attack()

    decided = aac_SSC(current_frame, quiet_next, "ESH")

    assert decided == "LPS"
def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
    """
    prev=ESH with an attack on exactly one channel of the next frame:
        - attacked channel     => ESH
        - non-attacked channel => LPS
    Merge table: ESH + LPS => ESH, whichever side carries the attack.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # Left-only attack first, then right-only attack.
    for on_left, on_right in ((True, False), (False, True)):
        one_sided_next = _next_frame_strong_attack(attack_left=on_left, attack_right=on_right)
        decided = aac_SSC(current_frame, one_sided_next, "ESH")
        assert decided == "ESH"
def test_threshold_s_must_exceed_1e_3() -> None:
    """
    Spec: the next frame is predicted ESH only if, for some l in 1..7,
        s[l] > 1e-3 AND ds[l] > 10.

    This test covers the necessity of the energy threshold:
        - the next frame holds one impulse of amplitude 0.01 per channel,
          i.e. s[l] ~= 1e-4 < 1e-3;
        - it must therefore not count as ESH, so prev=OLS yields OLS.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    weak_impulse_next = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01)

    decided = aac_SSC(current_frame, weak_impulse_next, "OLS")

    assert decided == "OLS"

View File

@ -1,3 +1,16 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Coder/Decoder Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for AAC Coder/Decoder module.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
@ -6,18 +19,36 @@ import numpy as np
import pytest
import soundfile as sf
from level_1.level_1 import aac_coder_1, i_aac_coder_1
from core.aac_coder import aac_coder_1
from core.aac_decoder import aac_decoder_1
from core.aac_types import *
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
# -----------------------------------------------------------------------------
def _snr_db(x_ref: np.ndarray, x_hat: np.ndarray) -> float:
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
"""
Compute overall SNR (dB) over all samples and channels after aligning lengths.
Parameters
----------
x_ref : StereoSignal
Reference signal, shape (N, 2) typical.
x_hat : StereoSignal
Reconstructed signal, shape (M, 2) typical.
Returns
-------
float
SNR in dB.
- Returns +inf if noise power is zero.
- Returns -inf if signal power is zero.
"""
x_ref = np.asarray(x_ref, dtype=np.float64)
x_hat = np.asarray(x_hat, dtype=np.float64)
# Be conservative: align lengths and common channels.
if x_ref.ndim == 1:
x_ref = x_ref.reshape(-1, 1)
if x_hat.ndim == 1:
@ -36,7 +67,7 @@ def _snr_db(x_ref: np.ndarray, x_hat: np.ndarray) -> float:
if pn <= 0.0:
return float("inf")
if ps <= 0.0:
return -float("inf")
return float("-inf")
return float(10.0 * np.log10(ps / pn))
@ -49,9 +80,9 @@ def tmp_stereo_wav(tmp_path: Path) -> Path:
rng = np.random.default_rng(123)
fs = 48000
# ~1 second of audio, keep small for test speed
# ~1 second of audio (kept small for test speed).
n = fs
x = rng.normal(size=(n, 2)).astype(np.float64)
x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64)
wav_path = tmp_path / "in.wav"
sf.write(str(wav_path), x, fs)
@ -63,7 +94,7 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
Module-level contract test:
Ensure aac_seq_1 follows the expected schema and per-frame shapes.
"""
aac_seq = aac_coder_1(tmp_stereo_wav)
aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav)
assert isinstance(aac_seq, list)
assert len(aac_seq) > 0
@ -88,8 +119,8 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
assert "frame_F" in fr["chl"]
assert "frame_F" in fr["chr"]
chl_f = np.asarray(fr["chl"]["frame_F"])
chr_f = np.asarray(fr["chr"]["frame_F"])
chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
if frame_type == "ESH":
assert chl_f.shape == (128, 8)
@ -101,23 +132,25 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
"""
End-to-end module test:
End-to-end test:
Encode + decode and check SNR is very high (numerical-noise only).
Threshold is intentionally loose to avoid fragility.
The threshold is intentionally loose to avoid fragility across platforms/BLAS.
"""
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
assert fs == 48000
x_ref = np.asarray(x_ref, dtype=np.float64)
assert int(fs) == 48000
out_wav = tmp_path / "out.wav"
aac_seq = aac_coder_1(tmp_stereo_wav)
x_hat = i_aac_coder_1(aac_seq, out_wav)
x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav)
# Basic sanity: output file exists and is readable
assert out_wav.exists()
x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True)
assert fs_hat == 48000
assert int(fs_hat) == 48000
# SNR computed against the array returned by i_aac_coder_1 (should match file, but not required)
# SNR against returned array (file should match closely, but we do not require it here).
snr = _snr_db(x_ref, x_hat)
assert snr > 80.0

View File

@ -0,0 +1,269 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Filterbank module.
# ------------------------------------------------------------
from __future__ import annotations
from typing import Sequence
import pytest
from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank
from core.aac_types import *
# Helper fixtures for filterbank
# -----------------------------------------------------------------------------
def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal:
    """
    Run analysis + synthesis on every frame and overlap-add the results.

    Frame i covers samples [i*1024, i*1024 + 2048) of `x`; its synthesized
    output is accumulated into the same span of the result.

    Parameters
    ----------
    x : StereoSignal
        Input stereo stream, expected shape (N, 2).
    frame_types : Sequence[FrameType]
        One frame type per frame (length K); entry i belongs to the frame
        starting at i*1024.
    win_type : WinType
        Window type ("SIN" or "KBD").

    Returns
    -------
    StereoSignal
        Overlap-added reconstruction with the same shape as x, dtype float64.
    """
    frame_len = 2048
    hop_len = 1024

    rebuilt: StereoSignal = np.zeros_like(x, dtype=np.float64)
    for index, kind in enumerate(frame_types):
        span = slice(index * hop_len, index * hop_len + frame_len)
        segment: FrameT = x[span, :]
        spectrum: FrameF = aac_filter_bank(segment, kind, win_type)
        synthesized: FrameT = aac_i_filter_bank(spectrum, kind, win_type)
        rebuilt[span, :] += synthesized
    return rebuilt
def _snr_db(x: StereoSignal, y: StereoSignal) -> float:
    """
    Signal-to-noise ratio in dB, accumulated over all samples and channels.

    Parameters
    ----------
    x : StereoSignal
        Reference signal.
    y : StereoSignal
        Signal compared against the reference (same shape as x).

    Returns
    -------
    float
        10 * log10(sum(x^2) / sum((x - y)^2)).
        +inf when the noise power is zero, -inf when the noise power is
        non-zero but the signal power is zero.
    """
    residual = x - y
    noise_power = float(np.sum(residual * residual))
    if noise_power <= 0.0:
        # Exact reconstruction: report infinite SNR instead of dividing by 0.
        return float("inf")

    signal_power = float(np.sum(x * x))
    if signal_power <= 0.0:
        return float("-inf")

    ratio = signal_power / noise_power
    return 10.0 * float(np.log10(ratio))
# -----------------------------------------------------------------------------
# Forward filterbank tests
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None:
    """
    Contract test: every long frame type (OLS/LSS/LPS) makes aac_filter_bank
    return a (1024, 2) array.
    """
    silent_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)

    spectrum = aac_filter_bank(silent_frame, frame_type, win_type)

    expected_shape = (1024, 2)
    assert spectrum.shape == expected_shape
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: an ESH frame makes aac_filter_bank return a (128, 16) array.
    """
    silent_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)

    spectrum = aac_filter_bank(silent_frame, "ESH", win_type)

    expected_shape = (128, 16)
    assert spectrum.shape == expected_shape
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None:
    """
    Behavior test (OLS as the representative long sequence): the two channels
    are processed independently. With a random left channel and a silent
    right channel, the right spectrum must stay (numerically) zero.
    """
    rng = np.random.default_rng(0)
    left_only: FrameT = np.zeros((2048, 2), dtype=np.float64)
    left_only[:, 0] = rng.normal(size=2048)

    spectrum = aac_filter_bank(left_only, "OLS", win_type)

    right_peak = np.max(np.abs(spectrum[:, 1]))
    assert right_peak < 1e-9
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_esh(win_type: WinType) -> None:
    """
    Behavior test: for ESH the two channels are processed independently.
    With a random left channel and a silent right channel, every odd column
    (the right-channel subframes) must stay (numerically) zero.
    """
    rng = np.random.default_rng(1)
    left_only: FrameT = np.zeros((2048, 2), dtype=np.float64)
    left_only[:, 0] = rng.normal(size=2048)

    spectrum = aac_filter_bank(left_only, "ESH", win_type)

    # Odd columns 1, 3, ..., 15 hold the right-channel subframes.
    right_subframes = spectrum[:, 1::2]
    right_peak = np.max(np.abs(right_subframes))
    assert right_peak < 1e-9
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None:
    """
    Spec-driven behavior test:
        ESH only looks at the central region [448, 1600), which is split into
        8 windows of length 256 with 50% overlap.
    Consequently, samples outside [448, 1600) must have no effect on the
    ESH spectrum.
    """
    rng = np.random.default_rng(2)
    inner_start, inner_stop = 448, 1600

    # Draw order matters for reproducibility: centre first, then the margins.
    shared_center = rng.normal(size=(1152, 2))

    clean_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    noisy_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    clean_frame[inner_start:inner_stop, :] = shared_center
    noisy_frame[inner_start:inner_stop, :] = shared_center

    # Only the second frame gets content in the ignored outer regions.
    noisy_frame[0:inner_start, :] = rng.normal(size=(448, 2))
    noisy_frame[inner_stop:2048, :] = rng.normal(size=(448, 2))

    clean_spectrum = aac_filter_bank(clean_frame, "ESH", win_type)
    noisy_spectrum = aac_filter_bank(noisy_frame, "ESH", win_type)

    # Tiny absolute tolerance guards against floating-point minutiae.
    np.testing.assert_allclose(clean_spectrum, noisy_spectrum, rtol=0.0, atol=1e-12)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_output_is_finite(win_type: WinType) -> None:
    """
    Sanity test: for random input, the spectrum of every frame type must be
    free of NaN and inf.
    """
    rng = np.random.default_rng(3)
    noise_frame: FrameT = rng.normal(size=(2048, 2)).astype(np.float64)

    all_frame_types = ("OLS", "LSS", "ESH", "LPS")
    for kind in all_frame_types:
        spectrum = aac_filter_bank(noise_frame, kind, win_type)
        assert np.isfinite(spectrum).all()
# -----------------------------------------------------------------------------
# Reverse i_filterbank tests
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None:
    """
    Contract test: every long frame type (OLS/LSS/LPS) makes aac_i_filter_bank
    return a (2048, 2) array.
    """
    zero_spectrum: FrameF = np.zeros((1024, 2), dtype=np.float64)

    long_frame_types = ("OLS", "LSS", "LPS")
    for kind in long_frame_types:
        synthesized = aac_i_filter_bank(zero_spectrum, kind, win_type)
        assert synthesized.shape == (2048, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: an ESH spectrum of shape (128, 16) makes aac_i_filter_bank
    return a (2048, 2) array.
    """
    zero_spectrum: FrameF = np.zeros((128, 16), dtype=np.float64)

    synthesized = aac_i_filter_bank(zero_spectrum, "ESH", win_type)

    expected_shape = (2048, 2)
    assert synthesized.shape == expected_shape
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None:
    """
    Sanity test: analysis followed by synthesis of a single random frame must
    give finite samples for every frame type.
    """
    rng = np.random.default_rng(0)
    noise_frame: FrameT = rng.normal(size=(2048, 2)).astype(np.float64)

    all_frame_types = ("OLS", "LSS", "ESH", "LPS")
    for kind in all_frame_types:
        spectrum = aac_filter_bank(noise_frame, kind, win_type)
        resynthesized = aac_i_filter_bank(spectrum, kind, win_type)
        assert np.isfinite(resynthesized).all()
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None:
    """
    Module-level test:
        an all-OLS analysis/synthesis chain with hop=1024 must reconstruct
        the input with high SNR in the steady-state region.
    """
    rng = np.random.default_rng(1)
    n_frames = 6
    n_samples = 1024 * (n_frames + 1)
    signal: StereoSignal = rng.normal(size=(n_samples, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, ["OLS"] * n_frames, win_type)

    # Skip the first and last hop: those samples are covered by one frame only.
    steady = slice(1024, n_samples - 1024)
    snr = _snr_db(signal[steady, :], rebuilt[steady, :])
    assert snr > 50.0
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None:
    """
    Module-level test:
        an all-ESH analysis/synthesis chain with hop=1024 must reconstruct
        the input with high SNR in the steady-state region.
    """
    rng = np.random.default_rng(2)
    n_frames = 6
    n_samples = 1024 * (n_frames + 1)
    signal: StereoSignal = rng.normal(size=(n_samples, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, ["ESH"] * n_frames, win_type)

    # Skip the first and last hop: those samples are covered by one frame only.
    steady = slice(1024, n_samples - 1024)
    snr = _snr_db(signal[steady, :], rebuilt[steady, :])
    assert snr > 45.0
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None:
    """
    Transition test following the windowing logic:
        OLS -> LSS -> ESH -> LPS -> OLS -> OLS
    The overlap-added output must still match the input with high SNR in the
    steady-state region.
    """
    rng = np.random.default_rng(3)
    sequence: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"]
    n_samples = 1024 * (len(sequence) + 1)
    signal: StereoSignal = rng.normal(size=(n_samples, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, sequence, win_type)

    # Skip the first and last hop: those samples are covered by one frame only.
    steady = slice(1024, n_samples - 1024)
    snr = _snr_db(signal[steady, :], rebuilt[steady, :])
    assert snr > 40.0

View File

@ -1,16 +1,33 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank internal (mdct) Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Filterbank internal MDCT/IMDCT functionality.
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
import pytest
from level_1.level_1 import _imdct, _mdct
from core.aac_filterbank import _imdct, _mdct
from core.aac_types import FloatArray, TimeSignal, MdctCoeffs
# Helper "fixtures" for filterbank internals (MDCT/IMDCT)
# -----------------------------------------------------------------------------
def _assert_allclose(a: np.ndarray, b: np.ndarray, *, rtol: float, atol: float) -> None:
# Helper for consistent tolerances across tests.
def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None:
"""
Helper for consistent tolerances across tests.
"""
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
def _estimate_gain(y: np.ndarray, x: np.ndarray) -> float:
def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float:
"""
Estimate scalar gain g such that y ~= g*x in least-squares sense.
"""
@ -28,18 +45,18 @@ def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None:
Consistency test in coefficient domain:
mdct(imdct(X)) ~= g * X
For our chosen (non-orthonormal) scaling, g is expected to be close to 2.
For the chosen (non-orthonormal) scaling, g is expected to be close to 2.
"""
rng = np.random.default_rng(0)
K = N // 2
X = rng.normal(size=K).astype(np.float64)
x = _imdct(X)
X_hat = _mdct(x)
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
x: TimeSignal = _imdct(X)
X_hat: MdctCoeffs = _mdct(x)
g = _estimate_gain(X_hat, X)
_assert_allclose(X_hat, g * X, rtol=tolerance, atol=tolerance)
_assert_allclose(np.array([g]), np.array([2.0]), rtol=tolerance, atol=tolerance)
_assert_allclose(np.array([g], dtype=np.float64), np.array([2.0], dtype=np.float64), rtol=tolerance, atol=tolerance)
@pytest.mark.parametrize("N", [256, 2048])
@ -47,18 +64,16 @@ def test_mdct_linearity(N: int) -> None:
"""
Linearity test:
mdct(a*x + b*y) == a*mdct(x) + b*mdct(y)
This should hold up to numerical error.
"""
rng = np.random.default_rng(1)
x = rng.normal(size=N).astype(np.float64)
y = rng.normal(size=N).astype(np.float64)
x: TimeSignal = rng.normal(size=N).astype(np.float64)
y: TimeSignal = rng.normal(size=N).astype(np.float64)
a = 0.37
b = -1.12
left = _mdct(a * x + b * y)
right = a * _mdct(x) + b * _mdct(y)
left: MdctCoeffs = _mdct(a * x + b * y)
right: MdctCoeffs = a * _mdct(x) + b * _mdct(y)
_assert_allclose(left, right, rtol=tolerance, atol=tolerance)
@ -72,14 +87,14 @@ def test_imdct_linearity(N: int) -> None:
rng = np.random.default_rng(2)
K = N // 2
X = rng.normal(size=K).astype(np.float64)
Y = rng.normal(size=K).astype(np.float64)
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
Y: MdctCoeffs = rng.normal(size=K).astype(np.float64)
a = -0.5
b = 2.0
left = _imdct(a * X + b * Y)
right = a * _imdct(X) + b * _imdct(Y)
left: TimeSignal = _imdct(a * X + b * Y)
right: TimeSignal = a * _imdct(X) + b * _imdct(Y)
_assert_allclose(left, right, rtol=tolerance, atol=tolerance)
@ -92,8 +107,8 @@ def test_mdct_imdct_outputs_are_finite(N: int) -> None:
rng = np.random.default_rng(3)
K = N // 2
x = rng.normal(size=N).astype(np.float64)
X = rng.normal(size=K).astype(np.float64)
x: TimeSignal = rng.normal(size=N).astype(np.float64)
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
X1 = _mdct(x)
x1 = _imdct(X)

View File

@ -0,0 +1,198 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 1 AAC encoder orchestration.
# Keeps the same functional behavior as the original level_1 implementation:
# - Reads WAV via soundfile
# - Validates stereo and 48 kHz
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
# - SSC decision uses next-frame attack detection
# - Filterbank analysis (MDCT)
# - Stores per-channel spectra in AACSeq1 schema:
# * ESH: (128, 8)
# * else: (1024, 1)
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from core.aac_configuration import WIN_TYPE
from core.aac_filterbank import aac_filter_bank
from core.aac_ssc import aac_SSC
from core.aac_types import *
# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
    """
    Load a WAV file with soundfile and enforce the Level-1 input assumptions.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.

    Returns
    -------
    x : StereoSignal (np.ndarray)
        Stereo samples converted to float64, shape (N, 2).
    fs : int
        Sampling rate in Hz (always 48000 on success).

    Raises
    ------
    ValueError
        If the file does not have exactly 2 channels, or if its sampling
        rate is not 48 kHz.
    """
    wav_path = Path(filename_in)

    # always_2d=True yields a 2-D array even for a mono file, so the channel
    # count can be read from axis 1 unconditionally.
    samples, rate = sf.read(str(wav_path), always_2d=True)
    samples = np.asarray(samples, dtype=np.float64)

    n_channels = samples.shape[1]
    if n_channels != 2:
        raise ValueError("Input must be stereo (2 channels).")

    rate_hz = int(rate)
    if rate_hz != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    return samples, rate_hz
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split the stereo FrameF produced by aac_filter_bank() into the two
    per-channel arrays stored in the Level-1 AACSeq1 schema.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        Output of aac_filter_bank():
        - frame_type != "ESH": shape (1024, 2)
        - frame_type == "ESH": shape (128, 16), columns interleaved as
          [L0 R0 L1 R1 ... L7 R7]

    Returns
    -------
    chl_f : FrameChannelF
        Left channel coefficients: (128, 8) for ESH, otherwise (1024, 1).
    chr_f : FrameChannelF
        Right channel coefficients: (128, 8) for ESH, otherwise (1024, 1).

    Raises
    ------
    ValueError
        If frame_f does not have the shape required by frame_type.
    """
    if frame_type != "ESH":
        # Long frames are kept as (1024, 1) columns, as the original Level-1
        # schema requires.
        if frame_f.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")
        # copy=False: no copy is made when frame_f is already float64.
        left_col = frame_f[:, :1].astype(np.float64, copy=False)
        right_col = frame_f[:, 1:].astype(np.float64, copy=False)
        return left_col, right_col

    if frame_f.shape != (128, 16):
        raise ValueError("For ESH, frame_f must have shape (128, 16).")

    # De-interleave: even columns are left subframes, odd columns are right
    # subframes. np.array() makes independent float64 copies.
    left_sub = np.array(frame_f[:, 0::2], dtype=np.float64)
    right_sub = np.array(frame_f[:, 1::2], dtype=np.float64)
    return left_sub, right_sub
# -----------------------------------------------------------------------------
# Level 1 encoder
# -----------------------------------------------------------------------------
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
    """
    Level-1 AAC encoder.

    Processing steps (same behavior as the original level_1 implementation):
        1. Read the stereo 48 kHz WAV file.
        2. Zero-pad one hop (1024 samples) before and after the signal.
        3. Cut frames of 2048 samples with a hop of 1024.
        4. Pick each frame type with SSC, using the next frame as look-ahead
           and the previous frame's type (the first frame assumes "OLS").
        5. Run the analysis filterbank with the configured WIN_TYPE.
        6. Store the per-channel coefficients following the AACSeq1 schema.

    With N input samples the frame count is N // 1024 + 1. When N is not a
    multiple of 1024, the last N % 1024 input samples therefore fall only in
    the second half of the final frame.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename. Assumption: stereo audio, sampling rate 48 kHz.

    Returns
    -------
    AACSeq1
        List of encoded frames (Level 1 schema).

    Raises
    ------
    ValueError
        If the input is not stereo / 48 kHz (raised by the reader) or if the
        framing sanity checks fail.
    """
    # The reader already validates the rate; Level 1 assumes 48 kHz throughout.
    samples, _fs = aac_read_wav_stereo_48k(filename_in)

    frame_len = 2048
    hop_len = 1024

    # One hop of zeros on each side: the leading pad supports the first
    # overlap region, the trailing pad is kept minimal (the look-ahead frame
    # is extended on the fly below when it runs past the end).
    padded = np.pad(samples, ((hop_len, hop_len), (0, 0)))

    # Count only the frames that fit completely inside the padded stream.
    n_frames = int((padded.shape[0] - frame_len) // hop_len + 1)
    if n_frames <= 0:
        raise ValueError("Input too short for framing.")

    window_kind: WinType = WIN_TYPE
    last_type: FrameType = "OLS"
    encoded: AACSeq1 = []

    for offset in range(0, n_frames * hop_len, hop_len):
        current: FrameT = padded[offset:offset + frame_len, :]
        if current.shape != (frame_len, 2):
            # Cannot happen given how n_frames is derived; kept as an explicit guard.
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Look-ahead frame for SSC, zero-extended to (2048, 2) at the tail.
        lookahead = padded[offset + hop_len:offset + hop_len + frame_len, :]
        missing_rows = frame_len - lookahead.shape[0]
        if missing_rows > 0:
            filler = np.zeros((missing_rows, 2), dtype=np.float64)
            lookahead = np.concatenate((lookahead, filler), axis=0)

        chosen_type = aac_SSC(current, lookahead, last_type)
        spectrum = aac_filter_bank(current, chosen_type, window_kind)
        left_coeffs, right_coeffs = aac_pack_frame_f_to_seq_channels(chosen_type, spectrum)

        left_payload = {"frame_F": left_coeffs}
        right_payload = {"frame_F": right_coeffs}
        encoded.append({
            "frame_type": chosen_type,
            "win_type": window_kind,
            "chl": left_payload,
            "chr": right_payload,
        })

        # The decision for the next frame depends on this frame's type.
        last_type = chosen_type

    return encoded

View File

@ -0,0 +1,22 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Configuration
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module contains the global configurations
#
# ------------------------------------------------------------
from __future__ import annotations
# Imports
from core.aac_types import WinType
# Window type
# Options: "SIN", "KBD"
WIN_TYPE: WinType = "SIN"

View File

@ -0,0 +1,166 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Inverse AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
# Keeps the same functional behavior as the original level_1 implementation:
# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank()
# - IMDCT synthesis per frame
# - Overlap-add with hop=1024
# - Remove encoder boundary padding: hop at start and hop at end
#
# Note:
# aac_decoder_1() returns the reconstructed samples and also writes them to
# filename_out as a 48 kHz WAV file (via soundfile).
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from core.aac_filterbank import aac_i_filter_bank
from core.aac_types import *
# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Merge the per-channel spectra of the Level-1 AACSeq1 schema back into the
    stereo FrameF layout that aac_i_filter_bank() consumes.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    chl_f : FrameChannelF
        Left channel coefficients: (128, 8) for ESH, otherwise (1024, 1).
    chr_f : FrameChannelF
        Right channel coefficients: (128, 8) for ESH, otherwise (1024, 1).

    Returns
    -------
    FrameF
        Stereo coefficients, dtype float64:
        - ESH: (128, 16) interleaved as [L0 R0 L1 R1 ... L7 R7]
        - else: (1024, 2)

    Raises
    ------
    ValueError
        If either channel array does not have the shape required by
        frame_type.
    """
    if frame_type != "ESH":
        # Long frames: each channel is a (1024, 1) column in the Level-1 schema.
        long_shape = (1024, 1)
        if chl_f.shape != long_shape or chr_f.shape != long_shape:
            raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")
        stereo = np.empty((1024, 2), dtype=np.float64)
        stereo[:, 0:1] = chl_f
        stereo[:, 1:2] = chr_f
        return stereo

    short_shape = (128, 8)
    if chl_f.shape != short_shape or chr_f.shape != short_shape:
        raise ValueError("ESH channel frame_F must have shape (128, 8).")

    # Interleave the subframes: even columns <- left, odd columns <- right.
    interleaved = np.empty((128, 16), dtype=np.float64)
    interleaved[:, 0::2] = chl_f
    interleaved[:, 1::2] = chr_f
    return interleaved
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Remove the boundary padding that the Level-1 encoder adds:
    hop samples at the start and hop samples at the end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024). Must be non-negative; hop == 0
        returns the whole stream.

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).
        This is a slice (view) of y_pad, not a copy.

    Raises
    ------
    ValueError
        If hop is negative, or if y_pad is too short to unpad.
    """
    if hop < 0:
        raise ValueError("hop must be non-negative.")

    n_total = y_pad.shape[0]
    if n_total < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")

    # Use an explicit end index: y_pad[hop:-hop] would be empty for hop == 0,
    # because -0 == 0 makes the slice [0:0].
    return y_pad[hop:n_total - hop, :]
# -----------------------------------------------------------------------------
# Level 1 decoder (core)
# -----------------------------------------------------------------------------
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    Steps:
        1. Synthesize every frame with aac_i_filter_bank() and overlap-add it
           into a padded stereo buffer at offset i * 1024.
        2. Strip 1024 samples of padding from each end of the buffer.
        3. Write the result to filename_out as a 48 kHz WAV via soundfile.

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        Propagated from the helpers, e.g. when a frame payload has an
        unexpected shape or the padded stream is too short to unpad.
    """
    out_path = Path(filename_out)
    hop_len = 1024
    frame_len = 2048

    # The i-th frame covers [i*hop_len, i*hop_len + frame_len), so K frames
    # span (K - 1) * hop_len + frame_len samples including the encoder padding.
    n_frames = len(aac_seq_1)
    padded: StereoSignal = np.zeros(((n_frames - 1) * hop_len + frame_len, 2), dtype=np.float64)

    for idx, frame in enumerate(aac_seq_1):
        frame_type: FrameType = frame["frame_type"]
        win_type: WinType = frame["win_type"]
        left_f = np.asarray(frame["chl"]["frame_F"], dtype=np.float64)
        right_f = np.asarray(frame["chr"]["frame_F"], dtype=np.float64)

        stereo_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, left_f, right_f)
        synthesized: FrameT = aac_i_filter_bank(stereo_f, frame_type, win_type)  # (2048, 2)

        # Overlap-add: consecutive frames share half of their samples.
        offset = idx * hop_len
        padded[offset:offset + frame_len, :] += synthesized

    decoded: StereoSignal = aac_remove_padding(padded, hop=hop_len)

    # Level 1 assumption: 48 kHz output.
    sf.write(str(out_path), decoded, 48000)
    return decoded

View File

@ -0,0 +1,454 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank module
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking
#
# ------------------------------------------------------------
from __future__ import annotations
from core.aac_types import *
from scipy.signal.windows import kaiser
# Private helpers for Filterbank
# ------------------------------------------------------------
def _sin_window(N: int) -> Window:
    """
    Build a sinusoidal (SIN) window of length N.

    The window is sampled at half-integer positions:
        w[n] = sin(pi/N * (n + 0.5)),   0 <= n < N

    Parameters
    ----------
    N : int
        Window length in samples.

    Returns
    -------
    Window
        1-D array of shape (N,) with dtype float64.
    """
    half_sample_positions = np.arange(N, dtype=np.float64) + 0.5
    return np.sin((np.pi / N) * half_sample_positions)
def _kbd_window(N: int, alpha: float) -> Window:
    """
    Build a Kaiser-Bessel-Derived (KBD) window of length N.

    Construction:
        1) Build a Kaiser kernel of length (N/2 + 1) with beta = pi * alpha.
        2) Form the rising half as sqrt(cumsum(kernel)[:-1] / sum(kernel)).
        3) Mirror the rising half to obtain the falling half.

    Notes
    -----
    - N must be even (AAC uses N=2048 for long and N=256 for short).
    - The assignment specifies alpha=6 for long windows and alpha=4 for short windows.
    - Because the Kaiser kernel is symmetric, the result satisfies
      w[n]^2 + w[n + N/2]^2 == 1 for 0 <= n < N/2.

    Parameters
    ----------
    N : int
        Window length in samples (must be a positive even integer).
    alpha : float
        KBD alpha parameter.

    Returns
    -------
    Window
        1-D array of shape (N,) with dtype float64.

    Raises
    ------
    ValueError
        If N is not a positive even integer. Without this check an odd N
        would silently yield a window of length N - 1.
    """
    if N < 2 or N % 2 != 0:
        raise ValueError(f"KBD window length must be a positive even integer, got {N!r}.")

    half = N // 2

    # Kaiser kernel over indices 0 .. half (half + 1 samples).
    kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
    cumulative = np.cumsum(kernel)

    # Drop the last partial sum (it equals the normaliser) to get `half` samples.
    rising = np.sqrt(cumulative[:-1] / cumulative[-1])
    return np.concatenate([rising, rising[::-1]])
def _long_window(win_type: WinType) -> Window:
    """
    Return the 2048-sample long window of the requested family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D array of shape (2048,) with dtype float64.

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    if win_type not in ("SIN", "KBD"):
        raise ValueError(f"Invalid win_type: {win_type!r}")

    if win_type == "KBD":
        # Assignment-specific alpha for long windows.
        return _kbd_window(2048, alpha=6.0)
    return _sin_window(2048)
def _short_window(win_type: WinType) -> Window:
    """
    Return the 256-sample short window of the requested family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D array of shape (256,) with dtype float64.

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    if win_type not in ("SIN", "KBD"):
        raise ValueError(f"Invalid win_type: {win_type!r}")

    if win_type == "KBD":
        # Assignment-specific alpha for short windows.
        return _kbd_window(256, alpha=4.0)
    return _sin_window(256)
def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window:
    """
    Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS frames.

    A single window family is used for both the long and the short parts
    (no mixed KBD/SIN halves).

    Layout per frame_type (sample counts in parentheses):
        - "OLS": the long window Wl (2048).
        - "LSS": [Wl left half (1024), ones (448), Ws right half (128), zeros (448)]
        - "LPS": [zeros (448), Ws left half (128), ones (448), Wl right half (1024)]

    Parameters
    ----------
    frame_type : FrameType
        One of "OLS", "LSS", "LPS".
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D array of shape (2048,) with dtype float64.

    Raises
    ------
    ValueError
        If win_type is invalid (raised by the window builders) or frame_type
        is not one of "OLS", "LSS", "LPS".
    """
    long_win = _long_window(win_type)    # (2048,)
    short_win = _short_window(win_type)  # (256,)

    if frame_type == "OLS":
        return long_win

    flat_top = np.ones(448, dtype=np.float64)
    silence = np.zeros(448, dtype=np.float64)

    if frame_type == "LSS":
        # Long rise, flat top, short fall, then silence.
        return np.concatenate([long_win[:1024], flat_top, short_win[128:], silence])

    if frame_type == "LPS":
        # Silence, short rise, flat top, then long fall.
        return np.concatenate([silence, short_win[:128], flat_top, long_win[1024:]])

    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
def _mdct(s: TimeSignal) -> MdctCoeffs:
    """
    Direct-form MDCT of one windowed block.

    Definition
    ----------
        X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
        with n0 = (N/2 + 1)/2 and k = 0 .. N/2 - 1.

    Parameters
    ----------
    s : TimeSignal
        Windowed time samples; flattened to 1-D, length N must be 2048 or 256.

    Returns
    -------
    MdctCoeffs
        MDCT coefficients, 1-D array of length N/2.

    Raises
    ------
    ValueError
        If the flattened input length is neither 2048 nor 256.
    """
    samples = np.asarray(s, dtype=np.float64).reshape(-1)
    N = int(samples.shape[0])
    if N != 2048 and N != 256:
        raise ValueError("MDCT input length must be 2048 or 256.")

    n0 = (N / 2.0 + 1.0) / 2.0
    shifted_time = np.arange(N, dtype=np.float64) + n0
    half_bins = np.arange(N // 2, dtype=np.float64) + 0.5

    # Full (N, N/2) cosine basis; each column is one MDCT basis function.
    phase = (2.0 * np.pi / N) * (shifted_time[:, np.newaxis] * half_bins[np.newaxis, :])
    return 2.0 * (samples @ np.cos(phase))
def _imdct(X: MdctCoeffs) -> TimeSignal:
    """
    Direct-form IMDCT of one coefficient block.

    Definition
    ----------
        s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
        with N = 2K, n0 = (N/2 + 1)/2 and n = 0 .. N - 1.

    Parameters
    ----------
    X : MdctCoeffs
        MDCT coefficients; flattened to 1-D, length K must be 1024 or 128.

    Returns
    -------
    TimeSignal
        Time samples (still time-aliased, not yet windowed), 1-D array of
        length N = 2K.

    Raises
    ------
    ValueError
        If the flattened input length is neither 1024 nor 128.
    """
    coeffs = np.asarray(X, dtype=np.float64).reshape(-1)
    K = int(coeffs.shape[0])
    if K != 1024 and K != 128:
        raise ValueError("IMDCT input length must be 1024 or 128.")

    N = 2 * K
    n0 = (N / 2.0 + 1.0) / 2.0
    shifted_time = np.arange(N, dtype=np.float64) + n0
    half_bins = np.arange(K, dtype=np.float64) + 0.5

    # Same (N, K) cosine basis as the forward transform.
    phase = (2.0 * np.pi / N) * (shifted_time[:, np.newaxis] * half_bins[np.newaxis, :])
    return (2.0 / N) * (np.cos(phase) @ coeffs)
def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF:
    """
    ESH (eight short windows) analysis for one channel.

    Eight 256-sample blocks are taken from the frame at start offsets
    448 + 128*j (j = 0..7), so consecutive blocks overlap by 128 samples.
    Each block is multiplied by the short window and transformed with the MDCT.

    Parameters
    ----------
    x_ch : FrameChannelT
        Time-domain channel frame (expected shape: (2048,)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelF
        Array of shape (128, 8). Column j contains the 128 MDCT coefficients
        of the j-th short window.
    """
    short_win = _short_window(win_type)  # (256,)

    block_starts = range(448, 448 + 8 * 128, 128)
    spectra = [_mdct(x_ch[begin:begin + 256] * short_win) for begin in block_starts]

    # One (128,) spectrum per short block, stacked as columns.
    return np.stack(spectra, axis=1)
def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split a packed ESH spectrum (128, 16) into per-channel arrays (128, 8).

    This undoes the interleaving performed by aac_filter_bank():
        packed[:, 2*j]   = left[:, j]
        packed[:, 2*j+1] = right[:, j]

    Parameters
    ----------
    frame_F : FrameF
        Packed ESH spectrum (expected shape: (128, 16)).

    Returns
    -------
    left : FrameChannelF
        Left channel spectrum, shape (128, 8), float64 copy.
    right : FrameChannelF
        Right channel spectrum, shape (128, 8), float64 copy.

    Raises
    ------
    ValueError
        If frame_F does not have shape (128, 16).
    """
    if frame_F.shape != (128, 16):
        raise ValueError("ESH frame_F must have shape (128, 16).")

    # Even columns belong to the left channel, odd columns to the right.
    # np.array(...) copies, so the results never alias the packed input.
    left_spec = np.array(frame_F[:, 0::2], dtype=np.float64)
    right_spec = np.array(frame_F[:, 1::2], dtype=np.float64)
    return left_spec, right_spec
def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT:
    """
    ESH (eight short windows) synthesis for one channel.

    Each column of X_esh is inverse-transformed, multiplied by the short
    window and overlap-added into a 2048-sample frame at offset 448 + 128*j.

    Parameters
    ----------
    X_esh : FrameChannelF
        MDCT coefficients for 8 short windows (expected shape: (128, 8)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelT
        Time-domain channel contribution, shape (2048,). The eight short
        blocks are already overlap-added with each other; the caller still
        has to overlap-add this frame with its neighbours.

    Raises
    ------
    ValueError
        If X_esh does not have shape (128, 8).
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")

    short_win = _short_window(win_type)  # (256,)
    frame = np.zeros(2048, dtype=np.float64)

    # Iterating over the transpose yields one short-block spectrum per step.
    for j, coeffs in enumerate(X_esh.T):
        begin = 448 + 128 * j
        frame[begin:begin + 256] += _imdct(coeffs) * short_win

    return frame
# -----------------------------------------------------------------------------
# Public Function prototypes (Level 1)
# -----------------------------------------------------------------------------
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis) for one stereo frame.

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN") used for the current frame.

    Returns
    -------
    frame_F : FrameF
        Frequency-domain MDCT coefficients:
            - "OLS"/"LSS"/"LPS": shape (1024, 2), one column per channel.
            - "ESH": shape (128, 16); the 8 short-block spectra of both
              channels interleaved as [L0 R0 L1 R1 ... L7 R7].

    Raises
    ------
    ValueError
        If frame_T does not have shape (2048, 2) or frame_type is invalid.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    left_t: FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
    right_t: FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type == "ESH":
        left_f = _filter_bank_esh_channel(left_t, win_type)    # (128, 8)
        right_f = _filter_bank_esh_channel(right_t, win_type)  # (128, 8)

        # Interleave: even columns = left, odd columns = right.
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = left_f
        packed[:, 1::2] = right_f
        return packed

    if frame_type in ("OLS", "LSS", "LPS"):
        window = _window_sequence(frame_type, win_type)  # (2048,)
        left_f = _mdct(left_t * window)                  # (1024,)
        right_f = _mdct(right_t * window)                # (1024,)
        return np.stack([left_f, right_f], axis=1)

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis) for one stereo frame.

    Parameters
    ----------
    frame_F : FrameF
        Frequency-domain MDCT coefficients as produced by aac_filter_bank():
        (1024, 2) for "OLS"/"LSS"/"LPS", (128, 16) for "ESH".
    frame_type : FrameType
        Frame type ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Windowed time-domain frame, stereo, shape (2048, 2). It still has to
        be overlap-added with the neighbouring frames by the caller.

    Raises
    ------
    ValueError
        If frame_F has the wrong shape for frame_type, or frame_type is invalid.
    """
    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")

        left_f, right_f = _unpack_esh(frame_F)
        left_t = _i_filter_bank_esh_channel(left_f, win_type)
        right_t = _i_filter_bank_esh_channel(right_f, win_type)
        return np.stack([left_t, right_t], axis=1)

    if frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")

        # The same window sequence is applied on synthesis as on analysis.
        window = _window_sequence(frame_type, win_type)
        left_t = _imdct(frame_F[:, 0]) * window
        right_t = _imdct(frame_F[:, 1]) * window
        return np.stack([left_t, right_t], axis=1)

    raise ValueError(f"Invalid frame_type: {frame_type!r}")

View File

@ -0,0 +1,217 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Sequence Segmentation Control module
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Sequence Segmentation Control module (SSC).
# Selects and returns the frame type based on input parameters.
# ------------------------------------------------------------
from __future__ import annotations
from typing import Dict, Tuple
from core.aac_types import FrameType, FrameT, FrameChannelT
import numpy as np
# -----------------------------------------------------------------------------
# Private helpers for SSC
# -----------------------------------------------------------------------------
# See Table 1 in mm-2025-hw-v0.1.pdf
#
# Maps the pair (left-channel decision, right-channel decision) to the single
# frame type used for both channels. Properties visible in the entries below:
#   - the table is symmetric: swapping left and right gives the same result;
#   - identical decisions are kept unchanged;
#   - "OLS" yields to whatever the other channel chose;
#   - any pair containing "ESH", and the mixed pair LSS/LPS, merge to "ESH".
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
    """
    Detect whether the *next* frame (single channel) implies an attack, i.e. ESH
    according to the assignment's criterion.

    Parameters
    ----------
    next_frame_channel : FrameChannelT
        One channel of next_frame_T (expected shape: (2048,)). The input is
        converted to a flat float64 array before filtering, so integer sample
        arrays are handled without truncation.

    Returns
    -------
    bool
        True if an attack is detected (=> next frame predicted ESH), else False.

    Notes
    -----
    The criterion is implemented as follows:
        1) Apply the high-pass filter
               H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
           in the time domain as
               y[n] = x[n] - x[n-1] + 0.5*y[n-1]
           with zero initial state.
        2) Compute the energy s[l] of the 128-sample segments l = 0..7 of y.
        3) Compute the ratio ds[l] = s[l] / max(s[l-1], 1e-12).
        4) An attack exists if there is an l in {1..7} such that
               s[l] > 1e-3 and ds[l] > 10
    Only the first 8 segments (samples 0..1023) take part in the decision.
    """
    # Force float64: allocating the filter output with the caller's dtype would
    # truncate every filtered sample for integer input and distort the energies.
    x = np.asarray(next_frame_channel, dtype=np.float64).reshape(-1)

    # High-pass filter, scalar recurrence with zero initial conditions.
    y = np.empty_like(x)
    prev_x = 0.0
    prev_y = 0.0
    for n, xn in enumerate(x.tolist()):
        yn = (xn - prev_x) + 0.5 * prev_y
        y[n] = yn
        prev_x = xn
        prev_y = yn

    # Energies of segments 0..7; later segments are never inspected by the rule.
    n_segments = 8
    energy = np.empty(n_segments, dtype=np.float64)
    for l in range(n_segments):
        seg = y[l * 128:(l + 1) * 128]
        energy[l] = float(np.sum(seg * seg))

    eps = 1e-12  # Avoid division by zero without materially changing the logic.
    for l in range(1, n_segments):
        ratio = energy[l] / max(energy[l - 1], eps)
        if energy[l] > 1e-3 and ratio > 10.0:
            return True
    return False
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
    """
    Choose the current frame type of one channel from the previous frame type
    and the attack prediction for the next frame.

    Transition rules:
        prev "LSS" -> "ESH"                     (forced)
        prev "LPS" -> "OLS"                     (forced)
        prev "OLS" -> "LSS" if attack else "OLS"
        prev "ESH" -> "ESH" if attack else "LPS"

    Parameters
    ----------
    prev_frame_type : FrameType
        Previous frame type (one of "OLS", "LSS", "ESH", "LPS").
    attack : bool
        True if the next frame is predicted ESH for this channel.

    Returns
    -------
    FrameType
        The per-channel decision for the current frame.

    Raises
    ------
    ValueError
        If prev_frame_type is not one of the four frame type codes.
    """
    # Transitional frame types have exactly one legal successor.
    forced_successors = (("LSS", "ESH"), ("LPS", "OLS"))
    for previous, successor in forced_successors:
        if prev_frame_type == previous:
            return successor

    if prev_frame_type == "OLS":
        if attack:
            return "LSS"
        return "OLS"

    if prev_frame_type == "ESH":
        if attack:
            return "ESH"
        return "LPS"

    raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Reduce the two per-channel frame type decisions to the single frame type
    shared by both channels, by lookup in STEREO_MERGE_TABLE.

    Parameters
    ----------
    ft_l : FrameType
        Frame type decision for the left channel.
    ft_r : FrameType
        Frame type decision for the right channel.

    Returns
    -------
    FrameType
        The merged common frame type.

    Raises
    ------
    ValueError
        If the (ft_l, ft_r) pair is not a key of the merge table.
    """
    pair = (ft_l, ft_r)
    try:
        merged = STEREO_MERGE_TABLE[pair]
    except KeyError as err:
        raise ValueError(f"Invalid stereo merge pair: {pair}") from err
    return merged
# -----------------------------------------------------------------------------
# Public Function prototypes (Level 1)
# -----------------------------------------------------------------------------
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).

    Select the frame type of the current frame (i) from the attack prediction
    on the next frame and the previous frame type. The decision is taken per
    channel and then merged into one common type.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i (expected shape: (2048, 2)). Only its
        shape is validated here; its samples do not influence the decision.
    next_frame_T : FrameT
        Next time-domain frame (i+1), used for attack detection
        (expected shape: (2048, 2)).
    prev_frame_type : FrameType
        Frame type chosen for the previous frame (i-1).

    Returns
    -------
    FrameType
        One of: "OLS", "LSS", "ESH", "LPS".

    Raises
    ------
    ValueError
        If either frame has the wrong shape or prev_frame_type is invalid.
    """
    expected_shape = (2048, 2)
    if frame_T.shape != expected_shape:
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != expected_shape:
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Both channels start from the same previous type; only the attack flag
    # (computed on the next frame) can make their decisions differ.
    per_channel = [
        _decide_frame_type(prev_frame_type, _detect_attack(next_frame_T[:, channel]))
        for channel in (0, 1)
    ]

    # Stereo merge as per the spec table.
    return _stereo_merge(per_channel[0], per_channel[1])

View File

@ -0,0 +1,193 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Public Type Aliases
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements Public Type aliases
#
# ------------------------------------------------------------
from __future__ import annotations
from typing import List, Literal, TypeAlias, TypedDict
import numpy as np
from numpy.typing import NDArray
# -----------------------------------------------------------------------------
# Code enums (for readability; not intended to enforce shapes/lengths)
# -----------------------------------------------------------------------------
# Closed set of frame type codes. The Literal is only a static-typing aid;
# at runtime the filterbank functions reject other values with ValueError.
FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"]
"""
Frame type codes (AAC):
- "OLS": ONLY_LONG_SEQUENCE
- "LSS": LONG_START_SEQUENCE
- "ESH": EIGHT_SHORT_SEQUENCE
- "LPS": LONG_STOP_SEQUENCE
"""
# Closed set of window family codes; the window builders raise ValueError
# for any other value.
WinType: TypeAlias = Literal["KBD", "SIN"]
"""
Window type codes (AAC):
- "KBD": Kaiser-Bessel-Derived
- "SIN": sinusoid
"""
# Keys under which each AACSeq1 frame dict stores its per-channel payload.
ChannelKey: TypeAlias = Literal["chl", "chr"]
"""Channel dictionary keys used in Level 1 payloads."""
# -----------------------------------------------------------------------------
# Array “semantic” aliases
#
# Goal: communicate meaning (time/frequency/window, stereo/channel) without
# forcing strict shapes in the type system.
# -----------------------------------------------------------------------------
FloatArray: TypeAlias = NDArray[np.float64]
"""
Generic float64 NumPy array.
Note:
- We standardize internal numeric computations to float64 for stability and
reproducibility. External I/O can still be float32, but we convert at the
boundaries.
"""
# Every alias below is the same FloatArray type under a different name: the
# names document intent for readers, but a static type checker treats them
# as interchangeable and none of them enforces a shape.
Window: TypeAlias = FloatArray
"""
Time-domain window (weighting sequence), 1-D.
Typical lengths in this assignment:
- Long: 2048
- Short: 256
- Window sequences for LSS/LPS are also 2048
Expected shape: (N,)
dtype: float64
"""
TimeSignal: TypeAlias = FloatArray
"""
Time-domain signal samples, typically 1-D.
Examples:
- Windowed MDCT input: shape (N,)
- IMDCT output: shape (N,)
dtype: float64
"""
StereoSignal: TypeAlias = FloatArray
"""
Time-domain stereo signal stream.
Expected (typical) shape: (N, 2)
- axis 0: time samples
- axis 1: channels [L, R]
dtype: float64
"""
MdctCoeffs: TypeAlias = FloatArray
"""
MDCT coefficient vector, typically 1-D.
Examples:
- Long: shape (1024,)
- Short: shape (128,)
dtype: float64
"""
FrameT: TypeAlias = FloatArray
"""
Time-domain frame (stereo), as used by the filterbank input/output.
Expected (typical) shape for stereo: (2048, 2)
- axis 0: time samples
- axis 1: channels [L, R]
dtype: float64
"""
FrameChannelT: TypeAlias = FloatArray
"""
Time-domain single-channel frame.
Expected (typical) shape: (2048,)
dtype: float64
"""
FrameF: TypeAlias = FloatArray
"""
Frequency-domain frame (MDCT coefficients), stereo container.
Typical shapes (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: (1024, 2)
- If frame_type == "ESH": (128, 16)
Rationale for ESH (128, 16):
- 8 short subframes per channel => 8 * 2 = 16 columns total
- Each short subframe per stereo is (128, 2), flattened into columns
in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R]
dtype: float64
"""
# NOTE: when stored in an AACSeq1 payload, a non-ESH per-channel spectrum is a
# column vector of shape (1024, 1), not (1024,): the Level-1 decoder rejects
# any other non-ESH shape and the coder schema test asserts (1024, 1).
FrameChannelF: TypeAlias = FloatArray
"""
Frequency-domain single-channel frame (MDCT coefficients).
Typical shapes (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: (1024,)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
dtype: float64
"""
# -----------------------------------------------------------------------------
# Level 1 AAC sequence payload types
# -----------------------------------------------------------------------------
class AACChannelFrameF(TypedDict):
    """
    Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).

    Keys
    ----
    frame_F:
        The MDCT coefficients for ONE channel.
        Shapes accepted by the Level-1 decoder:
            - ESH:  (128, 8)   (8 short subframes)
            - else: (1024, 1)  (column vector)
    """
    # MDCT coefficients of a single channel for this frame.
    frame_F: FrameChannelF
class AACSeq1Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_1 (Level 1).

    The decoder reads all four keys of every frame and applies the same
    frame_type and win_type to both channels.
    """
    # Frame type shared by the left and right channel of this frame.
    frame_type: FrameType
    # Window family used to synthesize this frame.
    win_type: WinType
    # Left-channel payload.
    chl: AACChannelFrameF
    # Right-channel payload.
    chr: AACChannelFrameF
# Frames are stored in time order: the Level-1 decoder overlap-adds element i
# at sample offset i * 1024, so the list index is the frame index.
AACSeq1: TypeAlias = List[AACSeq1Frame]
"""
AAC sequence for Level 1:
List of length K (K = number of frames).
Each element is a dict with keys:
- "frame_type", "win_type", "chl", "chr"
"""

View File

@ -0,0 +1,234 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Sequence Segmentation Control Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Sequence Segmentation Control module (SSC).
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
from core.aac_ssc import aac_SSC
from core.aac_types import FrameT
# -----------------------------------------------------------------------------
# Helper fixtures for SSC
# -----------------------------------------------------------------------------
def _next_frame_no_attack() -> FrameT:
    """
    Return an all-zero next_frame_T, which can never be classified as ESH.

    With exact zeros every segment energy is zero, so the detector condition
    s[l] > 1e-3 fails for every l.
    """
    silent_frame = np.zeros((2048, 2), dtype=np.float64)
    return silent_frame
def _next_frame_strong_attack(
    *,
    attack_left: bool,
    attack_right: bool,
    segment_l: int = 4,
    baseline: float = 1e-6,
    burst_amp: float = 1.0,
) -> FrameT:
    """
    Build a next_frame_T (2048x2) with a strong burst on the selected channels.

    Attack criterion (spec):
        An attack exists if there is an l in {1..7} such that
            s[l] > 1e-3 and ds[l] > 10,
        where s[l] is the energy of segment l (length 128) after high-pass
        filtering and ds[l] = s[l] / s[l-1].

    Construction:
        - Every sample of both channels is initialised to `baseline`.
        - `burst_amp` is added to all 128 samples of segment `segment_l` on
          each channel whose attack flag is set.

    NOTE(review): the baseline is a constant (DC) offset, and the detector's
    high-pass filter has a (1 - z^-1) numerator, so the filtered energy of the
    segment preceding the burst is presumably still below the detector's
    epsilon guard -- confirm whether a non-constant baseline was intended.

    Raises
    ------
    ValueError
        If segment_l is outside [1, 7].
    """
    if not (1 <= segment_l <= 7):
        raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")

    frame = np.full((2048, 2), baseline, dtype=np.float64)
    burst_rows = slice(segment_l * 128, (segment_l + 1) * 128)

    for channel, enabled in enumerate((attack_left, attack_right)):
        if enabled:
            frame[burst_rows, channel] += burst_amp

    return frame
def _next_frame_below_s_threshold(
    *,
    left: bool,
    right: bool,
    segment_l: int = 4,
    impulse_amp: float = 0.01,
) -> FrameT:
    """
    Build a next_frame_T whose segment energy stays below 1e-3, so ESH must
    NOT be triggered even though the ratio ds[l] may be large.

    A single impulse of amplitude `impulse_amp` is placed 10 samples into
    segment `segment_l` on each selected channel; all other samples are zero.
    Approximate segment energy: s[l] ~= impulse_amp^2, e.g.
        impulse_amp = 0.01  =>  s[l] ~= 1e-4 < 1e-3

    Raises
    ------
    ValueError
        If segment_l is outside [1, 7].
    """
    if not (1 <= segment_l <= 7):
        raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")

    frame = np.zeros((2048, 2), dtype=np.float64)
    impulse_row = segment_l * 128 + 10  # inside segment l

    for channel, enabled in enumerate((left, right)):
        if enabled:
            frame[impulse_row, channel] = impulse_amp

    return frame
# -----------------------------------------------------------------------------
# 1) Fixed/mandatory cases (prev frame type forces current type)
# -----------------------------------------------------------------------------
def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
    """
    Forced transitions (spec):
        - prev LSS => current MUST be ESH
        - prev LPS => current MUST be OLS
    regardless of the attack prediction for frame (i+1). A strong attack on
    both channels is supplied to show that it has no influence here.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_strong_attack(attack_left=True, attack_right=True)

    forced_transitions = (("LSS", "ESH"), ("LPS", "OLS"))
    for prev_type, expected in forced_transitions:
        assert aac_SSC(current, upcoming, prev_type) == expected
# -----------------------------------------------------------------------------
# 2) Cases requiring next-frame ESH prediction (attack computation)
# -----------------------------------------------------------------------------
def test_prev_ols_next_not_esh_returns_ols() -> None:
    """
    With prev=OLS the current frame is LSS only if frame (i+1) is predicted
    ESH; otherwise it stays OLS. Here the next frame is silent => OLS.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_no_attack()

    decision = aac_SSC(current, upcoming, "OLS")

    assert decision == "OLS"
def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
    """
    prev=OLS with an attack predicted on both channels:
        per-channel decisions: LSS, LSS
        merged decision:       LSS
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_strong_attack(attack_left=True, attack_right=True)

    decision = aac_SSC(current, upcoming, "OLS")

    assert decision == "LSS"
def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
    """
    prev=OLS with an attack on exactly one channel:
        - attacked channel decides LSS
        - quiet channel decides OLS
    The merge table maps OLS + LSS to LSS whichever side is attacked.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # (attack_left, attack_right): first left only, then right only.
    for left_hit, right_hit in ((True, False), (False, True)):
        upcoming = _next_frame_strong_attack(attack_left=left_hit, attack_right=right_hit)
        assert aac_SSC(current, upcoming, "OLS") == "LSS"
def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
    """
    prev=ESH with an attack predicted on both channels:
        per-channel decisions: ESH, ESH
        merged decision:       ESH
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_strong_attack(attack_left=True, attack_right=True)

    decision = aac_SSC(current, upcoming, "ESH")

    assert decision == "ESH"
def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
    """
    prev=ESH with no attack predicted on either channel:
        per-channel decisions: LPS, LPS
        merged decision:       LPS
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_no_attack()

    decision = aac_SSC(current, upcoming, "ESH")

    assert decision == "LPS"
def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
    """
    prev=ESH with an attack on exactly one channel:
        - attacked channel decides ESH
        - quiet channel decides LPS
    The merge table maps ESH + LPS to ESH whichever side is attacked.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # (attack_left, attack_right): first left only, then right only.
    for left_hit, right_hit in ((True, False), (False, True)):
        upcoming = _next_frame_strong_attack(attack_left=left_hit, attack_right=right_hit)
        assert aac_SSC(current, upcoming, "ESH") == "ESH"
def test_threshold_s_must_exceed_1e_3() -> None:
    """
    Spec: the next frame is predicted ESH only if, for some l in 1..7,
        s[l] > 1e-3 AND ds[l] > 10.

    This test covers the energy half of the condition: a single impulse of
    amplitude 0.01 gives s[l] ~= 1e-4 < 1e-3, so the frame must not be
    classified as ESH and prev=OLS must yield OLS.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01)

    decision = aac_SSC(current, upcoming, "OLS")

    assert decision == "OLS"

View File

@ -0,0 +1,156 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Coder/Decoder Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for AAC Coder/Decoder module.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
import numpy as np
import pytest
import soundfile as sf
from core.aac_coder import aac_coder_1
from core.aac_decoder import aac_decoder_1
from core.aac_types import *
# Helper "fixtures" for aac_coder_1 / aac_decoder_1
# -----------------------------------------------------------------------------
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
    """
    Overall SNR in dB between a reference and a reconstruction.

    Both inputs are converted to float64, 1-D inputs are treated as a single
    column, and the comparison is restricted to the rows and columns the two
    arrays have in common. Power is summed over all remaining samples.

    Parameters
    ----------
    x_ref : StereoSignal
        Reference signal, shape (N, 2) typical.
    x_hat : StereoSignal
        Reconstructed signal, shape (M, 2) typical.

    Returns
    -------
    float
        10 * log10(signal power / noise power).
            - +inf if the noise power is zero (checked first).
            - -inf if the signal power is zero.
    """
    ref = np.asarray(x_ref, dtype=np.float64)
    hat = np.asarray(x_hat, dtype=np.float64)

    # Promote mono vectors to a single-column 2-D layout.
    if ref.ndim == 1:
        ref = ref.reshape(-1, 1)
    if hat.ndim == 1:
        hat = hat.reshape(-1, 1)

    # Compare only the overlapping region.
    rows = min(ref.shape[0], hat.shape[0])
    cols = min(ref.shape[1], hat.shape[1])
    ref = ref[:rows, :cols]
    hat = hat[:rows, :cols]

    residual = ref - hat
    signal_power = float(np.sum(ref * ref))
    noise_power = float(np.sum(residual * residual))

    if noise_power <= 0.0:
        return float("inf")
    if signal_power <= 0.0:
        return float("-inf")
    return float(10.0 * np.log10(signal_power / noise_power))
@pytest.fixture()
def tmp_stereo_wav(tmp_path: Path) -> Path:
    """
    Write a temporary 48 kHz stereo WAV filled with seeded Gaussian noise and
    return its path.

    The file holds 48000 frames (about one second), kept small for test speed.

    NOTE(review): the samples are unit-variance Gaussian, so many lie outside
    [-1, 1]; presumably soundfile's default WAV subtype clips/quantizes them
    on write -- confirm. Tests that need the reference signal should therefore
    read it back from the file rather than regenerate it.
    """
    sample_rate = 48000
    generator = np.random.default_rng(123)

    samples: StereoSignal = generator.normal(size=(sample_rate, 2)).astype(np.float64)

    target = tmp_path / "in.wav"
    sf.write(str(target), samples, sample_rate)
    return target
def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
    """
    Module-level contract test:
    every frame returned by aac_coder_1 is a dict with the keys
    "frame_type", "win_type", "chl", "chr", and each channel carries a
    "frame_F" array of shape (128, 8) for ESH or (1024, 1) otherwise.
    """
    aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav)

    assert isinstance(aac_seq, list)
    assert len(aac_seq) > 0

    for frame in aac_seq:
        assert isinstance(frame, dict)

        # Required keys
        for key in ("frame_type", "win_type", "chl", "chr"):
            assert key in frame

        frame_type = frame["frame_type"]
        assert frame_type in ("OLS", "LSS", "ESH", "LPS")
        assert frame["win_type"] in ("SIN", "KBD")

        expected_shape = (128, 8) if frame_type == "ESH" else (1024, 1)
        for channel_key in ("chl", "chr"):
            channel = frame[channel_key]
            assert isinstance(channel, dict)
            assert "frame_F" in channel
            spectrum = np.asarray(channel["frame_F"], dtype=np.float64)
            assert spectrum.shape == expected_shape
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
    """
    End-to-end test:
    encode the fixture WAV, decode it again, and require the SNR between the
    input samples and the array returned by the decoder to exceed 80 dB.
    The threshold is intentionally loose to avoid fragility across platforms/BLAS.
    """
    reference, fs_in = sf.read(str(tmp_stereo_wav), always_2d=True)
    reference = np.asarray(reference, dtype=np.float64)
    assert int(fs_in) == 48000

    decoded_path = tmp_path / "out.wav"
    encoded = aac_coder_1(tmp_stereo_wav)
    decoded: StereoSignal = aac_decoder_1(encoded, decoded_path)

    # Basic sanity: the decoder wrote a readable file at the expected rate.
    assert decoded_path.exists()
    _, fs_out = sf.read(str(decoded_path), always_2d=True)
    assert int(fs_out) == 48000

    # SNR is measured against the returned array; the file content itself is
    # deliberately not compared here.
    assert _snr_db(reference, decoded) > 80.0

View File

@ -0,0 +1,269 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Filterbank module.
# ------------------------------------------------------------
from __future__ import annotations
from typing import Sequence
import pytest
from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank
from core.aac_types import *
# Helper fixtures for filterbank
# -----------------------------------------------------------------------------
def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal:
    """
    Run analysis + synthesis on consecutive frames and overlap-add the results.

    Frame i covers samples [i*1024, i*1024 + 2048) of x and is processed with
    frame_types[i].

    Parameters
    ----------
    x : StereoSignal
        Input stereo stream, expected shape (N, 2).
    frame_types : Sequence[FrameType]
        One frame type per frame, in stream order.
    win_type : WinType
        Window type ("SIN" or "KBD").

    Returns
    -------
    StereoSignal
        Overlap-added reconstruction, float64, same shape as x.
    """
    frame_len = 2048
    hop = 1024

    out: StereoSignal = np.zeros_like(x, dtype=np.float64)
    for index, frame_type in enumerate(frame_types):
        begin = index * hop
        end = begin + frame_len
        spectrum: FrameF = aac_filter_bank(x[begin:end, :], frame_type, win_type)
        rebuilt: FrameT = aac_i_filter_bank(spectrum, frame_type, win_type)
        out[begin:end, :] += rebuilt
    return out
def _snr_db(x: StereoSignal, y: StereoSignal) -> float:
"""
Compute SNR in dB over all samples/channels.
"""
err = x - y
ps = float(np.sum(x * x))
pn = float(np.sum(err * err))
if pn <= 0.0:
return float("inf")
if ps <= 0.0:
return float("-inf")
return 10.0 * float(np.log10(ps / pn))
# -----------------------------------------------------------------------------
# Forward filterbank tests
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None:
    """
    Contract test: a (2048, 2) frame analysed as OLS/LSS/LPS yields a
    (1024, 2) spectrum.
    """
    silent_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    spectrum = aac_filter_bank(silent_frame, frame_type, win_type)
    assert spectrum.shape == (1024, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: a (2048, 2) frame analysed as ESH yields a (128, 16) spectrum.
    """
    silent_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    spectrum = aac_filter_bank(silent_frame, "ESH", win_type)
    assert spectrum.shape == (128, 16)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None:
    """
    Behavior test (OLS as the representative long sequence):
    with noise on the left channel and silence on the right, the right-channel
    spectrum (column 1) must stay below 1e-9 in magnitude.
    """
    generator = np.random.default_rng(0)
    left = generator.normal(size=2048)
    right = np.zeros(2048, dtype=np.float64)
    frame: FrameT = np.column_stack((left, right))

    spectrum = aac_filter_bank(frame, "OLS", win_type)

    assert np.abs(spectrum[:, 1]).max() < 1e-9
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_esh(win_type: WinType) -> None:
    """
    Behavior test (ESH):
    with noise on the left channel and silence on the right, every odd column
    of the (128, 16) output (the right-channel columns) must stay below 1e-9.
    """
    generator = np.random.default_rng(1)
    left = generator.normal(size=2048)
    right = np.zeros(2048, dtype=np.float64)
    frame: FrameT = np.column_stack((left, right))

    spectrum = aac_filter_bank(frame, "ESH", win_type)

    # Columns 1, 3, ..., 15 are checked as the right channel.
    odd_columns = spectrum[:, 1::2]
    assert np.abs(odd_columns).max() < 1e-9
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None:
    """
    Spec-driven behavior test:
    ESH analysis only looks at samples [448, 1600) (eight 256-sample windows
    with 50% overlap), so two frames that agree on that region must produce
    the same spectrum regardless of what lies outside it.
    """
    generator = np.random.default_rng(2)

    # Frame A: shared centre, silent borders.
    centre = generator.normal(size=(1152, 2))
    frame_a: FrameT = np.zeros((2048, 2), dtype=np.float64)
    frame_a[448:1600, :] = centre

    # Frame B: same centre, noisy borders.
    frame_b: FrameT = frame_a.copy()
    frame_b[:448, :] = generator.normal(size=(448, 2))
    frame_b[1600:, :] = generator.normal(size=(448, 2))

    spectrum_a = aac_filter_bank(frame_a, "ESH", win_type)
    spectrum_b = aac_filter_bank(frame_b, "ESH", win_type)

    # A tiny absolute tolerance guards against floating-point noise only.
    np.testing.assert_allclose(spectrum_a, spectrum_b, rtol=0.0, atol=1e-12)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_output_is_finite(win_type: WinType) -> None:
    """
    Sanity test: analysing a random frame with every frame type must not
    produce NaN or inf.
    """
    generator = np.random.default_rng(3)
    noise_frame: FrameT = generator.normal(size=(2048, 2)).astype(np.float64)

    for frame_type in ("OLS", "LSS", "ESH", "LPS"):
        spectrum = aac_filter_bank(noise_frame, frame_type, win_type)
        assert np.all(np.isfinite(spectrum))
# -----------------------------------------------------------------------------
# Reverse i_filterbank tests
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None:
    """
    Contract test: a (1024, 2) spectrum synthesised as OLS/LSS/LPS yields a
    (2048, 2) time-domain frame.
    """
    zero_spectrum: FrameF = np.zeros((1024, 2), dtype=np.float64)

    for frame_type in ("OLS", "LSS", "LPS"):
        rebuilt = aac_i_filter_bank(zero_spectrum, frame_type, win_type)
        assert rebuilt.shape == (2048, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: a (128, 16) ESH spectrum synthesises to a (2048, 2) frame.
    """
    zero_spectrum: FrameF = np.zeros((128, 16), dtype=np.float64)
    rebuilt = aac_i_filter_bank(zero_spectrum, "ESH", win_type)
    assert rebuilt.shape == (2048, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None:
    """
    Sanity test: analysis followed by synthesis of a random frame must give
    finite samples for every frame type.
    """
    generator = np.random.default_rng(0)
    noise_frame: FrameT = generator.normal(size=(2048, 2)).astype(np.float64)

    for frame_type in ("OLS", "LSS", "ESH", "LPS"):
        spectrum = aac_filter_bank(noise_frame, frame_type, win_type)
        rebuilt = aac_i_filter_bank(spectrum, frame_type, win_type)
        assert np.all(np.isfinite(rebuilt))
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None:
    """
    Module-level test:
    overlap-adding six OLS frames (hop 1024) must reproduce the input with
    more than 50 dB SNR once the first and last 1024 samples are excluded.
    """
    num_frames = 6
    total = 1024 * (num_frames + 1)

    generator = np.random.default_rng(1)
    signal: StereoSignal = generator.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, ["OLS"] * num_frames, win_type)

    # Only the interior is covered by two overlapping frames.
    interior = slice(1024, total - 1024)
    assert _snr_db(signal[interior, :], rebuilt[interior, :]) > 50.0
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None:
    """
    Module-level test:
    overlap-adding six ESH frames (hop 1024) must reproduce the input with
    more than 45 dB SNR once the first and last 1024 samples are excluded.
    """
    num_frames = 6
    total = 1024 * (num_frames + 1)

    generator = np.random.default_rng(2)
    signal: StereoSignal = generator.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, ["ESH"] * num_frames, win_type)

    # Only the interior is covered by two overlapping frames.
    interior = slice(1024, total - 1024)
    assert _snr_db(signal[interior, :], rebuilt[interior, :]) > 45.0
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None:
    """
    Transition sequence test matching the windowing logic:
    OLS -> LSS -> ESH -> LPS -> OLS -> OLS
    must overlap-add back to the input with more than 40 dB SNR in the
    interior (first and last 1024 samples excluded).
    """
    sequence: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"]
    total = 1024 * (len(sequence) + 1)

    generator = np.random.default_rng(3)
    signal: StereoSignal = generator.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, sequence, win_type)

    interior = slice(1024, total - 1024)
    assert _snr_db(signal[interior, :], rebuilt[interior, :]) > 40.0

View File

@ -0,0 +1,117 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank internal (mdct) Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Filterbank internal MDCT/IMDCT functionality.
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
import pytest
from core.aac_filterbank import _imdct, _mdct
from core.aac_types import FloatArray, TimeSignal, MdctCoeffs
def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None:
"""
Helper for consistent tolerances across tests.
"""
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float:
"""
Estimate scalar gain g such that y ~= g*x in least-squares sense.
"""
denom = float(np.dot(x, x))
if denom == 0.0:
return 0.0
return float(np.dot(y, x) / denom)
# Shared tolerance, passed as both rtol and atol by the tests in this module.
tolerance = 1e-10
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None:
    """
    Consistency test in coefficient domain:
    mdct(imdct(X)) ~= g * X
    where g is the least-squares gain and, for the (non-orthonormal) scaling
    in use, is expected to equal 2 within tolerance.
    """
    generator = np.random.default_rng(0)
    coeffs: MdctCoeffs = generator.normal(size=N // 2).astype(np.float64)

    samples: TimeSignal = _imdct(coeffs)
    coeffs_back: MdctCoeffs = _mdct(samples)
    gain = _estimate_gain(coeffs_back, coeffs)

    # The round trip is a pure scaling ...
    _assert_allclose(coeffs_back, gain * coeffs, rtol=tolerance, atol=tolerance)
    # ... and the scale factor is 2.
    _assert_allclose(
        np.array([gain], dtype=np.float64),
        np.array([2.0], dtype=np.float64),
        rtol=tolerance,
        atol=tolerance,
    )
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_linearity(N: int) -> None:
    """
    Linearity test:
    mdct(a*x + b*y) == a*mdct(x) + b*mdct(y)
    """
    generator = np.random.default_rng(1)
    first: TimeSignal = generator.normal(size=N).astype(np.float64)
    second: TimeSignal = generator.normal(size=N).astype(np.float64)
    weight_first, weight_second = 0.37, -1.12

    transform_of_mix: MdctCoeffs = _mdct(weight_first * first + weight_second * second)
    mix_of_transforms: MdctCoeffs = weight_first * _mdct(first) + weight_second * _mdct(second)

    _assert_allclose(transform_of_mix, mix_of_transforms, rtol=tolerance, atol=tolerance)
@pytest.mark.parametrize("N", [256, 2048])
def test_imdct_linearity(N: int) -> None:
    """
    Linearity test for IMDCT:
    imdct(a*X + b*Y) == a*imdct(X) + b*imdct(Y)
    """
    generator = np.random.default_rng(2)
    num_coeffs = N // 2
    first: MdctCoeffs = generator.normal(size=num_coeffs).astype(np.float64)
    second: MdctCoeffs = generator.normal(size=num_coeffs).astype(np.float64)
    weight_first, weight_second = -0.5, 2.0

    transform_of_mix: TimeSignal = _imdct(weight_first * first + weight_second * second)
    mix_of_transforms: TimeSignal = weight_first * _imdct(first) + weight_second * _imdct(second)

    _assert_allclose(transform_of_mix, mix_of_transforms, rtol=tolerance, atol=tolerance)
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_imdct_outputs_are_finite(N: int) -> None:
    """
    Sanity test: neither transform produces NaN/inf on random inputs.
    """
    generator = np.random.default_rng(3)
    samples: TimeSignal = generator.normal(size=N).astype(np.float64)
    coeffs: MdctCoeffs = generator.normal(size=N // 2).astype(np.float64)

    forward = _mdct(samples)
    inverse = _imdct(coeffs)

    assert np.all(np.isfinite(forward))
    assert np.all(np.isfinite(inverse))

View File

@ -1,843 +1,186 @@
#! /usr/bin/env python
# ------------------------------------------------------------
# AAC Coder/Decoder - Level 1 Wrappers + Demo
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 1 wrapper module.
#
# This file provides:
# - Thin wrappers for Level 1 API functions (encode/decode) that delegate
# to the corresponding core implementations.
# - A demo function that runs end-to-end and computes SNR.
# - A small CLI entrypoint for convenience.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Dict, Tuple, List, Literal, TypedDict, Union
from typing import Union
import numpy as np
import soundfile as sf
from scipy.signal.windows import kaiser
# --------------------------------
# Public Type aliases (Level 1)
# --------------------------------
from core.aac_types import AACSeq1, StereoSignal
from core.aac_coder import aac_coder_1 as core_aac_coder_1
from core.aac_coder import aac_read_wav_stereo_48k
from core.aac_decoder import aac_decoder_1 as core_aac_decoder_1
# Closed set of frame-type codes accepted by the Level 1 functions below.
FrameType = Literal["OLS", "LSS", "ESH", "LPS"]
"""
Frame type codes:
- "OLS": ONLY_LONG_SEQUENCE
- "LSS": LONG_START_SEQUENCE
- "ESH": EIGHT_SHORT_SEQUENCE
- "LPS": LONG_STOP_SEQUENCE
"""
# Closed set of window-shape codes.
WinType = Literal["KBD", "SIN"]
"""
Window type codes:
- "KBD": Kaiser-Bessel-Derived
- "SIN": sinusoid
"""
# The array aliases below are plain np.ndarray; the shapes in the notes are
# conventions, not enforced by the type itself.
FrameT = np.ndarray
"""
Time-domain frame.
Expected shape: (2048, 2) for stereo (two channels).
dtype: float (e.g., float32/float64).
"""
FrameChannelT = np.ndarray
"""
Time-domain single channel frame.
Expected shape: (2048,).
dtype: float (e.g., float32/float64).
"""
FrameF = np.ndarray
"""
Frequency-domain frame (MDCT coefficients).
As per spec (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: shape (1024, 2)
- If frame_type == "ESH": shape (128, 16) where 8 subframes x 2 channels
are placed in columns according to the subframe order (i.e., each subframe is (128,2)).
"""
# Keys under which the per-channel payloads are stored in a sequence frame.
ChannelKey = Literal["chl", "chr"]
class AACChannelFrameF(TypedDict):
    """Channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1)."""
    # MDCT coefficients of a single channel for one frame.
    frame_F: np.ndarray
    # frame_F for one channel:
    # - ESH: shape (128, 8)
    # - else: shape (1024, 1)
class AACSeq1Frame(TypedDict):
    """One frame dictionary of aac_seq_1 (Level 1)."""
    # Frame type shared by both channels of this frame.
    frame_type: FrameType
    # Window shape recorded for this frame.
    win_type: WinType
    # Left-channel payload.
    chl: AACChannelFrameF
    # Right-channel payload.
    chr: AACChannelFrameF
# Whole encoded stream: one AACSeq1Frame per frame.
AACSeq1 = List[AACSeq1Frame]
"""AAC sequence for Level 1:
List of length K (K = number of frames).
Each element is a dict with keys:
- "frame_type", "win_type", "chl", "chr"
"""
# Global Options
# -----------------------------------------------------------------------------
# Window type used by the encoder for every frame and stored in each frame's
# "win_type" entry.
# Options: "SIN", "KBD"
WIN_TYPE: WinType = "SIN"
# Private helpers for SSC
# -----------------------------------------------------------------------------
# See Table 1 in mm-2025-hw-v0.1.pdf
# Lookup from the pair of per-channel frame-type decisions to the single frame
# type used for both channels. The table is symmetric in its two entries.
# Any pair containing "ESH", and the mixed "LSS"/"LPS" pairs, resolve to "ESH".
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
"""
Detect if next frame (single channel) implies ESH according to the spec's attack criterion.
Parameters
----------
next_frame_channel : FrameChannelT
One channel of next_frame_T (shape: (2048,), dtype float).
Returns
-------
attack : bool
True if an attack is detected (=> next frame predicted ESH), else False.
Notes
-----
The spec describes:
- High-pass filter applied to next_frame_channel
- Split into 16 segments of length 128
- Compute segment energies s(l)
- Compute ds(l) = s(l) / s(l-1)
- Attack exists if there exists l in {1..7} such that:
s(l) > 1e-3 and ds(l) > 10
"""
x = next_frame_channel # local alias, x assumed to be a 1-D array of length 2048
# High-pass filter H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
# Implemented as: y[n] = x[n] - x[n-1] + 0.5*y[n-1]
y = np.zeros_like(x)
prev_x = 0.0
prev_y = 0.0
for n in range(x.shape[0]):
xn = float(x[n])
yn = (xn - prev_x) + 0.5 * prev_y
y[n] = yn
prev_x = xn
prev_y = yn
# Segment energies over 16 blocks of 128 samples.
s = np.empty(16, dtype=np.float64)
for l in range(16):
a = l * 128
b = (l + 1) * 128
seg = y[a:b]
s[l] = float(np.sum(seg * seg))
# ds(l) for l>=1. For l=0 not defined, keep 0.
ds = np.zeros(16, dtype=np.float64)
eps = 1e-12 # avoid division by zero without changing logic materially
for l in range(1, 16):
ds[l] = s[l] / max(s[l - 1], eps)
# Spec: check l in {1..7}
for l in range(1, 8):
if (s[l] > 1e-3) and (ds[l] > 10.0):
return True
return False
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
"""
Decide current frame type for a single channel based on prev_frame_type and next-frame attack.
Parameters
----------
prev_frame_type : FrameType
Previous frame type (one of "OLS","LSS","ESH","LPS").
attack : bool
Whether next frame is predicted ESH for this channel.
Returns
-------
frame_type : FrameType
The per-channel decision for the current frame.
Rules (spec)
------------
- If prev is "LSS" => current is "ESH" (fixed)
- If prev is "LPS" => current is "OLS" (fixed)
- If prev is "OLS" => current is "LSS" if attack else "OLS"
- If prev is "ESH" => current is "ESH" if attack else "LPS"
"""
if prev_frame_type == "LSS":
return "ESH"
if prev_frame_type == "LPS":
return "OLS"
if prev_frame_type == "OLS":
return "LSS" if attack else "OLS"
if prev_frame_type == "ESH":
return "ESH" if attack else "LPS"
raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Combine the two per-channel decisions into the frame type used for both
    channels, by lookup in STEREO_MERGE_TABLE.

    Parameters
    ----------
    ft_l : FrameType
        Decision for channel 0 (left).
    ft_r : FrameType
        Decision for channel 1 (right).

    Returns
    -------
    common : FrameType
        The merged frame type.

    Raises
    ------
    ValueError
        If the pair is not present in the table.
    """
    pair = (ft_l, ft_r)
    try:
        common = STEREO_MERGE_TABLE[pair]
    except KeyError as exc:
        raise ValueError(f"Invalid stereo merge pair: {pair}") from exc
    return common
# Private helpers for Filterbank
# -----------------------------------------------------------------------------
def _sin_window(N: int) -> np.ndarray:
"""
Sine window (full length N).
w[n] = sin(pi/N * (n + 0.5)), 0 <= n < N
"""
n = np.arange(N, dtype=np.float64)
return np.sin((np.pi / N) * (n + 0.5))
def _kbd_window(N: int, alpha: float) -> np.ndarray:
"""
Kaiser-Bessel-Derived (KBD) window (full length N).
This follows the standard KBD construction:
- Build Kaiser kernel of length N/2 + 1
- Use cumulative sum and sqrt normalization to form left and right halves
"""
half = N // 2
# Kaiser kernel length: half + 1 samples (0 .. half)
# beta = pi * alpha per the usual correspondence with the ISO definition
kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
csum = np.cumsum(kernel)
denom = csum[-1]
w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1
w_right = w_left[::-1] # mirror for second half
return np.concatenate([w_left, w_right])
def _long_window(win_type: WinType) -> np.ndarray:
"""
Long window (length 2048) for the selected win_type.
"""
if win_type == "SIN":
return _sin_window(2048)
if win_type == "KBD":
# Assignment-specific alpha values
return _kbd_window(2048, alpha=6.0)
raise ValueError(f"Invalid win_type: {win_type!r}")
def _short_window(win_type: WinType) -> np.ndarray:
"""
Short window (length 256) for the selected win_type.
"""
if win_type == "SIN":
return _sin_window(256)
if win_type == "KBD":
# Assignment-specific alpha values
return _kbd_window(256, alpha=4.0)
raise ValueError(f"Invalid win_type: {win_type!r}")
def _window_sequence(frame_type: FrameType, win_type: WinType) -> np.ndarray:
    """
    Assemble the 2048-sample analysis/synthesis window for OLS, LSS or LPS.

    A single window family (win_type) is used for both halves.

    Layout:
    - OLS: the long window as is.
    - LSS: long rising half (1024) | ones (448) | short falling half (128) | zeros (448)
    - LPS: zeros (448) | short rising half (128) | ones (448) | long falling half (1024)

    Raises ValueError for any other frame_type (including "ESH").
    """
    long_win = _long_window(win_type)    # 2048 samples
    short_win = _short_window(win_type)  # 256 samples

    if frame_type == "OLS":
        return long_win

    flat = np.ones(448, dtype=np.float64)
    silent = np.zeros(448, dtype=np.float64)

    if frame_type == "LSS":
        return np.concatenate((long_win[0:1024], flat, short_win[128:256], silent))

    if frame_type == "LPS":
        return np.concatenate((silent, short_win[0:128], flat, long_win[1024:2048]))

    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
def _mdct(s: np.ndarray) -> np.ndarray:
"""
MDCT (direct form) as given in the assignment.
Input:
s: windowed time samples of length N (N = 2048 or 256)
Output:
X: MDCT coefficients of length N/2
Definition:
X[k] = 2 * sum_{n=0 .. N-1} s[n] * cos(2*pi/N * (n + n0) * (k + 1/2))
where n0 = (N/2 + 1)/2
"""
s = np.asarray(s, dtype=np.float64)
N = int(s.shape[0])
if N not in (2048, 256):
raise ValueError("MDCT input length must be 2048 or 256.")
n0 = (N / 2.0 + 1.0) / 2.0
n = np.arange(N, dtype=np.float64) + n0
k = np.arange(N // 2, dtype=np.float64) + 0.5
# Cosine matrix: shape (N, N/2)
C = np.cos((2.0 * np.pi / N) * np.outer(n, k))
X = 2.0 * (s @ C)
return X
def _imdct(X: np.ndarray) -> np.ndarray:
"""
IMDCT (direct form) as given in the assignment.
Input:
X: MDCT coefficients of length N/2 (N = 2048 or 256)
Output:
s: time samples of length N
Definition:
s[n] = (2/N) * sum_{k=0 .. N/2-1} X[k] * cos(2*pi/N * (n + n0) * (k + 1/2))
where n0 = (N/2 + 1)/2
"""
X = np.asarray(X, dtype=np.float64).reshape(-1)
K = int(X.shape[0])
if K not in (1024, 128):
raise ValueError("IMDCT input length must be 1024 or 128.")
N = 2 * K
n0 = (N / 2.0 + 1.0) / 2.0
n = np.arange(N, dtype=np.float64) + n0
k = np.arange(K, dtype=np.float64) + 0.5
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K)
s = (2.0 / N) * (C @ X)
return s
def _filter_bank_esh_channel(x_ch: np.ndarray, win_type: WinType) -> np.ndarray:
    """
    ESH analysis of a single channel.

    Eight 256-sample segments starting at 448 + 128*j (j = 0..7) are
    multiplied by the short window and transformed with the MDCT.

    Returns:
        X_esh: shape (128, 8); column j holds the 128 coefficients of segment j.
    """
    short_win = _short_window(win_type)

    columns = []
    for j in range(8):
        begin = 448 + 128 * j
        windowed = x_ch[begin:begin + 256] * short_win
        columns.append(_mdct(windowed))

    return np.stack(columns, axis=1)
def _unpack_esh(frame_F: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
"""
Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).
Mapping is the inverse of the packing used in filter_bank():
out[:, 2*j] = left[:, j]
out[:, 2*j+1] = right[:, j]
"""
if frame_F.shape != (128, 16):
raise ValueError("ESH frame_F must have shape (128, 16).")
left = np.empty((128, 8), dtype=np.float64)
right = np.empty((128, 8), dtype=np.float64)
for j in range(8):
left[:, j] = frame_F[:, 2 * j + 0]
right[:, j] = frame_F[:, 2 * j + 1]
return left, right
def _i_filter_bank_esh_channel(X_esh: np.ndarray, win_type: WinType) -> np.ndarray:
"""
ESH synthesis for one channel.
Input:
X_esh: (128, 8) MDCT coeffs for 8 short windows
Output:
x_ch: (2048, ) time-domain frame contribution (windowed),
ready for OLA at the caller level.
"""
if X_esh.shape != (128, 8):
raise ValueError("X_esh must have shape (128, 8).")
wS = _short_window(win_type)
out = np.zeros(2048, dtype=np.float64)
# Each short IMDCT returns 256 samples. Place them at:
# start = 448 + 128*j, j=0..7 (50% overlap)
for j in range(8):
seg = _imdct(X_esh[:, j]) * wS # (256,)
start = 448 + 128 * j
out[start:start + 256] += seg
return out
# -----------------------------------------------------------------------------
# Public Function prototypes (Level 1)
# Public Level 1 API (wrappers)
# -----------------------------------------------------------------------------
def SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).

    Chooses the frame type of the current frame (i) from the previous frame
    type and an attack detection run on each channel of the next frame.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i, stereo, shape (2048, 2).
        Only its shape is checked.
    next_frame_T : FrameT
        Next time-domain frame (i+1), stereo, shape (2048, 2).
    prev_frame_type : FrameType
        Frame type chosen for the previous frame (i-1).

    Returns
    -------
    frame_type : FrameType
        - "OLS" (ONLY_LONG_SEQUENCE)
        - "LSS" (LONG_START_SEQUENCE)
        - "ESH" (EIGHT_SHORT_SEQUENCE)
        - "LPS" (LONG_STOP_SEQUENCE)

    Raises
    ------
    ValueError
        If either frame does not have shape (2048, 2).
    """
    expected_shape = (2048, 2)
    if frame_T.shape != expected_shape:
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != expected_shape:
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Each channel decides on its own, starting from the shared previous type.
    per_channel = [
        _decide_frame_type(prev_frame_type, _detect_attack(next_frame_T[:, channel]))
        for channel in (0, 1)
    ]

    # Merge left/right decisions as per Table 1.
    return _stereo_merge(per_channel[0], per_channel[1])
def filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis).

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN") used for the current frame.

    Returns
    -------
    frame_F : FrameF
        MDCT coefficients:
        - "OLS"/"LSS"/"LPS": shape (1024, 2), one column per channel.
        - "ESH": shape (128, 16); the eight short-window spectra are
          interleaved per channel, column 2*j holding the left and column
          2*j + 1 the right channel of subframe j.

    Raises
    ------
    ValueError
        If frame_T is not (2048, 2) or frame_type is not a known code.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    left = frame_T[:, 0].astype(np.float64, copy=False)
    right = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type == "ESH":
        spec_left = _filter_bank_esh_channel(left, win_type)    # (128, 8)
        spec_right = _filter_bank_esh_channel(right, win_type)  # (128, 8)

        # Interleave: even columns <- left, odd columns <- right.
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = spec_left
        packed[:, 1::2] = spec_right
        return packed

    if frame_type in ("OLS", "LSS", "LPS"):
        window = _window_sequence(frame_type, win_type)  # length 2048
        spec_left = _mdct(left * window)                 # length 1024
        spec_right = _mdct(right * window)               # length 1024
        return np.column_stack((spec_left, spec_right))

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
def i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis).

    Parameters
    ----------
    frame_F : FrameF
        MDCT coefficients as produced by filter_bank():
        (1024, 2) for "OLS"/"LSS"/"LPS", (128, 16) for "ESH".
    frame_type : FrameType
        Frame type ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Windowed time-domain frame, stereo, shape (2048, 2).

    Raises
    ------
    ValueError
        If frame_F has the wrong shape for frame_type, or frame_type is not
        a known code.
    """
    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")
        spec_left, spec_right = _unpack_esh(frame_F)
        left = _i_filter_bank_esh_channel(spec_left, win_type)
        right = _i_filter_bank_esh_channel(spec_right, win_type)
        return np.column_stack((left, right))

    if frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")
        window = _window_sequence(frame_type, win_type)
        left = _imdct(frame_F[:, 0]) * window
        right = _imdct(frame_F[:, 1]) * window
        return np.column_stack((left, right))

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
    """
    Level-1 AAC encoder.

    Reads a WAV file, pads it with 1024 zero samples at both ends, cuts it
    into 2048-sample frames with a hop of 1024, chooses a frame type for each
    frame with SSC() and analyses it with filter_bank() using the module-level
    WIN_TYPE.

    NOTE(review): this block appears to interleave the pre- and post-refactor
    versions of the function. As written, the final
    `return core_aac_coder_1(filename_in)` is unreachable because
    `return aac_seq` precedes it -- confirm which variant is intended.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.
        Assumption: stereo audio, sampling rate 48 kHz.

    Returns
    -------
    aac_seq_1 : AACSeq1
        List of K encoded frames.
        For each i:
        - aac_seq_1[i]["frame_type"]: FrameType
        - aac_seq_1[i]["win_type"]: WinType
        - aac_seq_1[i]["chl"]["frame_F"]:
            - ESH: shape (128, 8)
            - else: shape (1024, 1)
        - aac_seq_1[i]["chr"]["frame_F"]:
            - ESH: shape (128, 8)
            - else: shape (1024, 1)

    Raises
    ------
    ValueError
        If the input does not have 2 channels, is not sampled at 48 kHz, or
        the framing yields no frame.
    """
    filename_in = Path(filename_in)
    # always_2d=True guarantees a (N, channels) array even for mono files.
    x, fs = sf.read(str(filename_in), always_2d=True)
    x = np.asarray(x, dtype=np.float64)
    if x.shape[1] != 2:
        raise ValueError("Input must be stereo (2 channels).")
    if fs != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")
    hop = 1024
    win = 2048
    # One hop of zeros is added before and after the signal, so the first and
    # last input samples are each covered by two overlapping frames.
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])
    # Number of frames such that current frame fits; next frame will be padded if needed.
    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")
    aac_seq: AACSeq1 = []
    # The frame preceding the first one is treated as ONLY_LONG_SEQUENCE.
    prev_frame_type: FrameType = "OLS"
    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            # This should not happen due to K definition, but we keep it explicit.
            raise ValueError("Internal framing error: frame_t has wrong shape.")
        next_t = x_pad[start + hop:start + hop + win, :]
        # Ensure next_t is always (2048,2) by zero-padding at the tail.
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])
        frame_type = SSC(frame_t, next_t, prev_frame_type)
        frame_f = filter_bank(frame_t, frame_type, WIN_TYPE)
        # Store per-channel as required by AACSeq1 schema
        if frame_type == "ESH":
            # frame_f: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
            chl_f = np.empty((128, 8), dtype=np.float64)
            chr_f = np.empty((128, 8), dtype=np.float64)
            for j in range(8):
                chl_f[:, j] = frame_f[:, 2 * j + 0]
                chr_f[:, j] = frame_f[:, 2 * j + 1]
        else:
            # frame_f: (1024, 2); the 0:1 / 1:2 slices keep a trailing axis of
            # length 1, giving the (1024, 1) layout of the schema.
            chl_f = frame_f[:, 0:1].astype(np.float64, copy=False)
            chr_f = frame_f[:, 1:2].astype(np.float64, copy=False)
        aac_seq.append({
            "frame_type": frame_type,
            "win_type": WIN_TYPE,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        })
        # The decision just made drives the state machine for the next frame.
        prev_frame_type = frame_type
    return aac_seq
    return core_aac_coder_1(filename_in)
def i_aac_coder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> np.ndarray:
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
"""
Level-1 AAC decoder (inverse of aac_coder_1()).
Level-1 AAC decoder (wrapper).
Delegates to core implementation.
Parameters
----------
aac_seq_1 : AACSeq1
Encoded sequence as produced by aac_coder_1().
filename_out : str | Path
Output WAV filename.
Assumption: stereo audio, sampling rate 48 kHz.
filename_out : Union[str, Path]
Output WAV filename. Assumption: 48 kHz, stereo.
Returns
-------
x : np.ndarray
Decoded audio samples (time-domain).
Expected shape: (N, 2) for stereo (N depends on input length).
StereoSignal
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
"""
filename_out = Path(filename_out)
return core_aac_decoder_1(aac_seq_1, filename_out)
hop = 1024
win = 2048
K = len(aac_seq_1)
# Output includes the encoder padding region, so we reconstruct
# full padded stream. For K frames: last frame starts at (K-1)*hop and spans win,
# so total length = (K-1)*hop + win
n_pad = (K - 1) * hop + win
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
# -----------------------------------------------------------------------------
# Demo (Level 1)
# -----------------------------------------------------------------------------
for i, fr in enumerate(aac_seq_1):
frame_type = fr["frame_type"]
win_type = fr["win_type"]
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
"""
Compute overall SNR (dB) over all samples and channels after aligning lengths.
chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
Parameters
----------
x_ref : StereoSignal
Reference stereo stream.
x_hat : StereoSignal
Reconstructed stereo stream.
# Re-pack into the format expected by i_filter_bank()
if frame_type == "ESH":
if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
raise ValueError("ESH channel frame_F must have shape (128, 8).")
Returns
-------
float
SNR in dB.
- Returns +inf if noise power is zero.
- Returns -inf if signal power is zero.
"""
x_ref = np.asarray(x_ref, dtype=np.float64)
x_hat = np.asarray(x_hat, dtype=np.float64)
frame_f = np.empty((128, 16), dtype=np.float64)
for j in range(8):
frame_f[:, 2 * j + 0] = chl_f[:, j]
frame_f[:, 2 * j + 1] = chr_f[:, j]
else:
if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")
if x_ref.ndim == 1:
x_ref = x_ref.reshape(-1, 1)
if x_hat.ndim == 1:
x_hat = x_hat.reshape(-1, 1)
frame_f = np.empty((1024, 2), dtype=np.float64)
frame_f[:, 0] = chl_f[:, 0]
frame_f[:, 1] = chr_f[:, 0]
n = min(x_ref.shape[0], x_hat.shape[0])
c = min(x_ref.shape[1], x_hat.shape[1])
frame_t_hat = i_filter_bank(frame_f, frame_type, win_type) # (2048, 2)
x_ref = x_ref[:n, :c]
x_hat = x_hat[:n, :c]
start = i * hop
y_pad[start:start + win, :] += frame_t_hat
err = x_ref - x_hat
ps = float(np.sum(x_ref * x_ref))
pn = float(np.sum(err * err))
# Remove boundary padding that encoder adds: hop samples at start and hop at end.
if y_pad.shape[0] < 2 * hop:
raise ValueError("Decoded stream too short to unpad.")
if pn <= 0.0:
return float("inf")
if ps <= 0.0:
return float("-inf")
y = y_pad[hop:-hop, :]
sf.write(str(filename_out), y, 48000)
return y
return float(10.0 * np.log10(ps / pn))
def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float:
"""
Demonstration for Level-1 codec.
Demonstration for the Level-1 codec.
Runs:
- aac_coder_1(filename_in)
- i_aac_coder_1(aac_seq_1, filename_out)
- aac_decoder_1(aac_seq_1, filename_out)
and computes total SNR between original and decoded audio.
Parameters
----------
filename_in : str | Path
filename_in : Union[str, Path]
Input WAV filename (stereo, 48 kHz).
filename_out : str | Path
filename_out : Union[str, Path]
Output WAV filename (stereo, 48 kHz).
Returns
-------
SNR : float
Overall Signal-to-Noise Ratio in dB.
float
Overall SNR in dB.
"""
filename_in = Path(filename_in)
filename_out = Path(filename_out)
# Read original audio (reference)
x_ref, fs_ref = sf.read(str(filename_in), always_2d=True)
x_ref = np.asarray(x_ref, dtype=np.float64)
# Read original audio (reference) with the same validation as the codec.
x_ref, fs_ref = aac_read_wav_stereo_48k(filename_in)
if int(fs_ref) != 48000:
raise ValueError("Input sampling rate must be 48 kHz.")
# Encode / decode
aac_seq_1 = aac_coder_1(filename_in)
x_hat = i_aac_coder_1(aac_seq_1, filename_out)
x_hat = np.asarray(x_hat, dtype=np.float64)
x_hat = aac_decoder_1(aac_seq_1, filename_out)
# Ensure 2D stereo shape (N, 2)
if x_hat.ndim == 1:
x_hat = x_hat.reshape(-1, 1)
if x_ref.ndim == 1:
x_ref = x_ref.reshape(-1, 1)
# Optional sanity: ensure output file exists and is readable
x_hat_file, fs_hat = sf.read(str(filename_out), always_2d=True)
_ = x_hat_file
if int(fs_hat) != 48000:
raise ValueError("Decoded output sampling rate must be 48 kHz.")
# Align lengths (use common overlap)
n = min(x_ref.shape[0], x_hat.shape[0])
x_ref = x_ref[:n, :]
x_hat = x_hat[:n, :]
return _snr_db(x_ref, x_hat)
# Match channel count conservatively (common channels)
c = min(x_ref.shape[1], x_hat.shape[1])
x_ref = x_ref[:, :c]
x_hat = x_hat[:, :c]
# Compute overall SNR over all samples and channels
err = x_ref - x_hat
p_signal = float(np.sum(x_ref * x_ref))
p_noise = float(np.sum(err * err))
if p_noise <= 0.0:
return float("inf")
if p_signal <= 0.0:
# Degenerate case: silent input
return -float("inf")
# else:
snr_db = 10.0 * np.log10(p_signal / p_noise)
return float(snr_db)
# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Example usage:
# Example:
# python -m level_1.level_1 input.wav output.wav
import sys
if len(sys.argv) != 3:
raise SystemExit("Usage: python -m level_1.level_1 <input.wav> <output.wav>")
in_wav = sys.argv[1]
out_wav = sys.argv[2]
in_wav = Path(sys.argv[1])
out_wav = Path(sys.argv[2])
print(f"Encoding/Decoding {in_wav} to {out_wav}")
snr = demo_aac_1(in_wav, out_wav)
print(f"SNR = {snr:.3f} dB")

View File

@ -1,199 +0,0 @@
import numpy as np
import pytest
# Adjust the import based on package/module layout.
from level_1.level_1 import SSC
# Helper "fixtures" for SSC
# -----------------------------------------------------------------------------
def _next_frame_no_attack() -> np.ndarray:
    """
    Return a silent next frame that can never be classified as ESH.

    Every sample is exactly 0.0, so every segment energy s2l is zero and the
    ESH condition (s2l > 1e-3) cannot hold.

    Returns
    -------
    np.ndarray
        All-zero stereo frame, shape (2048, 2), dtype float64.
    """
    silent_shape = (2048, 2)
    return np.zeros(silent_shape, dtype=np.float64)
def _next_frame_strong_attack(
    *,
    attack_left: bool,
    attack_right: bool,
    segment_l: int = 4,
    baseline: float = 1e-6,
    burst_amp: float = 1.0,
) -> np.ndarray:
    """
    Build a next_frame_T (2048x2) meant to trigger ESH detection on selected channels.

    Spec: ESH if there exists l in {1..7} with s2l > 1e-3 AND ds2l > 10.

    The frame is made of:
      - a small constant baseline on every sample (avoids division by zero in ds2l),
      - a strong burst added over one 128-sample segment l in 1..7, only on the
        channels that were requested.

    Parameters
    ----------
    attack_left : bool
        Add the burst to channel 0.
    attack_right : bool
        Add the burst to channel 1.
    segment_l : int
        Index of the 128-sample segment receiving the burst (1..7).
    baseline : float
        Value assigned to every sample before the burst is added.
    burst_amp : float
        Amplitude added on top of the baseline inside the burst segment.

    Returns
    -------
    np.ndarray
        Stereo frame, shape (2048, 2), dtype float64.
    """
    assert 1 <= segment_l <= 7

    frame = np.full((2048, 2), baseline, dtype=np.float64)
    burst_rows = slice(segment_l * 128, (segment_l + 1) * 128)

    # Channel 0 is left, channel 1 is right.
    for channel, enabled in enumerate((attack_left, attack_right)):
        if enabled:
            frame[burst_rows, channel] += burst_amp

    return frame
def _next_frame_below_s2l_threshold(
    *,
    left: bool,
    right: bool,
    segment_l: int = 4,
    impulse_amp: float = 0.01,
) -> np.ndarray:
    """
    Build a next_frame_T whose s2l stays below 1e-3, so ESH must NOT be triggered
    even if ds2l could be large.

    A single impulse of amplitude 'impulse_amp' is placed inside one segment.
    Energy in that 128-sample segment: s2l ~= impulse_amp^2.
    With impulse_amp=0.01 => s2l ~= 1e-4 < 1e-3.

    Parameters
    ----------
    left : bool
        Place the impulse on channel 0.
    right : bool
        Place the impulse on channel 1.
    segment_l : int
        Index of the 128-sample segment holding the impulse (1..7).
    impulse_amp : float
        Amplitude of the single impulse.

    Returns
    -------
    np.ndarray
        Stereo frame, shape (2048, 2), dtype float64, zero everywhere except
        the impulse sample(s).
    """
    assert 1 <= segment_l <= 7

    frame = np.zeros((2048, 2), dtype=np.float64)
    impulse_row = 128 * segment_l + 10  # 10 samples into the chosen segment

    for channel, enabled in enumerate((left, right)):
        if enabled:
            frame[impulse_row, channel] = impulse_amp

    return frame
# ---------------------------------------------------------------------
# 1) Fixed/mandatory cases (prev frame type forces current type)
# ---------------------------------------------------------------------
def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
    """
    Spec: the previous frame type alone fixes the current one when it was
      - LSS => current MUST be ESH
      - LPS => current MUST be OLS
    regardless of what the next-frame check would say.
    """
    silent_frame = np.zeros((2048, 2), dtype=np.float64)

    # A strong attack in the next frame must not override the forced decision.
    attack_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    forced = {"LSS": "ESH", "LPS": "OLS"}
    for prev_type, expected in forced.items():
        assert SSC(silent_frame, attack_next, prev_type) == expected
# ---------------------------------------------------------------------
# 2) Cases requiring next-frame ESH prediction (energy/attack computation)
# ---------------------------------------------------------------------
def test_prev_ols_next_not_esh_returns_ols() -> None:
    """
    Spec: with prev=OLS the current frame is either OLS or LSS.
    LSS is chosen iff frame (i+1) is predicted ESH; otherwise OLS.
    Here the next frame is silent, so OLS is expected.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    decision = SSC(current, _next_frame_no_attack(), "OLS")
    assert decision == "OLS"
def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
    """
    prev=OLS and the next frame is predicted ESH on both channels:
    each channel decides LSS, and the merge table keeps LSS.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_strong_attack(attack_left=True, attack_right=True)

    decision = SSC(current, upcoming, "OLS")
    assert decision == "LSS"
def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
    """
    prev=OLS with an attack on exactly one channel:
      - the attacked channel predicts ESH => LSS
      - the other channel predicts not ESH => OLS
    Merge table: OLS + LSS => LSS, whichever channel carries the attack.
    """
    current = np.zeros((2048, 2), dtype=np.float64)

    # (attack_left, attack_right): left-only first, then right-only.
    for on_left, on_right in ((True, False), (False, True)):
        upcoming = _next_frame_strong_attack(attack_left=on_left, attack_right=on_right)
        assert SSC(current, upcoming, "OLS") == "LSS"
def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
    """
    prev=ESH and the next frame is predicted ESH on both channels:
    each channel stays ESH, and the merge table gives ESH + ESH => ESH.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_strong_attack(attack_left=True, attack_right=True)

    decision = SSC(current, upcoming, "ESH")
    assert decision == "ESH"
def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
    """
    prev=ESH and the next frame is not predicted ESH on either channel:
    each channel decides LPS, and the merge table gives LPS + LPS => LPS.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    decision = SSC(current, _next_frame_no_attack(), "ESH")
    assert decision == "LPS"
def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
    """
    prev=ESH with an attack on exactly one channel:
      - the attacked channel predicts ESH => ESH
      - the other channel predicts not ESH => LPS
    Merge table: ESH + LPS => ESH.

    Both orderings are exercised (attack on left only, then on right only) so
    that the merge is checked regardless of which channel carries the attack.
    """
    frame_t = np.zeros((2048, 2), dtype=np.float64)

    # Attack on the left channel only.
    next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
    out1 = SSC(frame_t, next1_t, "ESH")
    assert out1 == "ESH"

    # Attack on the right channel only (mirror of the case above).
    next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
    out2 = SSC(frame_t, next2_t, "ESH")
    assert out2 == "ESH"
def test_threshold_s2l_must_exceed_1e_3() -> None:
    """
    Spec: the next frame is ESH only if s2l > 1e-3 AND ds2l > 10 for some l in 1..7.

    This test checks that the s2l threshold is necessary:
      - the next frame holds a single impulse of amplitude 0.01, so
        s2l ~= 1e-4 < 1e-3;
      - it must therefore not be classified as ESH, and prev=OLS yields OLS.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    weak_next = _next_frame_below_s2l_threshold(left=True, right=True, impulse_amp=0.01)

    decision = SSC(current, weak_next, "OLS")
    assert decision == "OLS"

View File

@ -1,235 +0,0 @@
import numpy as np
import pytest
from level_1.level_1 import FrameType, WinType, filter_bank, i_filter_bank
# Helper "fixtures" for filterbank
# -----------------------------------------------------------------------------
def _ola_reconstruct(x: np.ndarray, frame_types: list[str], win_type: str) -> np.ndarray:
    """
    Run analysis + synthesis on every frame and overlap-add the results (hop=1024).

    Parameters
    ----------
    x : np.ndarray
        Input signal, shape (N, 2).
    frame_types : list[str]
        One frame type per frame; frame i starts at sample i*1024.
    win_type : str
        Window type forwarded to filter_bank() and i_filter_bank().

    Returns
    -------
    np.ndarray
        Overlap-added reconstruction with the same shape as x, dtype float64.
    """
    hop, win = 1024, 2048
    out = np.zeros_like(x, dtype=np.float64)

    for idx, ftype in enumerate(frame_types):
        span = slice(idx * hop, idx * hop + win)
        spectrum = filter_bank(x[span, :], ftype, win_type)
        out[span, :] += i_filter_bank(spectrum, ftype, win_type)

    return out
def _snr_db(x: np.ndarray, y: np.ndarray) -> float:
    """
    Compute the overall SNR (dB) of y measured against the reference x.

    Power is summed over every element of the arrays.

    Parameters
    ----------
    x : np.ndarray
        Reference signal.
    y : np.ndarray
        Reconstructed signal, broadcast-compatible with x.

    Returns
    -------
    float
        SNR in dB.
        - Returns +inf if the noise power is zero.
        - Returns -inf if the signal power is zero (and the noise power is not).
    """
    err = x - y
    ps = float(np.sum(x * x))
    pn = float(np.sum(err * err))

    if pn <= 0.0:
        return float("inf")
    if ps <= 0.0:
        # log10(0) would emit a NumPy divide-by-zero RuntimeWarning; return the
        # limiting value explicitly instead.
        return float("-inf")

    # Wrap in float() so the declared return type holds (np.log10 yields np.float64).
    return float(10.0 * np.log10(ps / pn))
# ---------------------------------------------------------------------
# Forward filterbank tests
# ---------------------------------------------------------------------
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None:
    """
    Contract test:
    filter_bank returns shape (1024, 2) for each long-sequence type (OLS/LSS/LPS).
    """
    silent = np.zeros((2048, 2), dtype=np.float64)
    spectrum = filter_bank(silent, frame_type, win_type)
    assert spectrum.shape == (1024, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test:
    filter_bank returns shape (128, 16) for ESH frames.
    """
    silent = np.zeros((2048, 2), dtype=np.float64)
    spectrum = filter_bank(silent, "ESH", win_type)
    assert spectrum.shape == (128, 16)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None:
    """
    Module behavior test:
    For OLS (representative long sequence) the channels are processed independently:
      - with a random left channel and a zero right channel, the right
        spectrum must stay near zero.
    """
    rng = np.random.default_rng(0)
    left_only = np.zeros((2048, 2), dtype=np.float64)
    left_only[:, 0] = rng.normal(size=2048)

    spectrum = filter_bank(left_only, "OLS", win_type)

    # Column 1 is the right channel; it must be (close to) zero.
    right_peak = np.max(np.abs(spectrum[:, 1]))
    assert right_peak < 1e-9
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_esh(win_type: WinType) -> None:
    """
    Module behavior test:
    For ESH the channels are processed independently:
      - with a random left channel and a zero right channel, every odd
        column (right channel) must stay near zero.
    """
    rng = np.random.default_rng(1)
    left_only = np.zeros((2048, 2), dtype=np.float64)
    left_only[:, 0] = rng.normal(size=2048)

    spectrum = filter_bank(left_only, "ESH", win_type)

    # The right channel occupies columns 1, 3, 5, ..., 15.
    right_peak = np.max(np.abs(spectrum[:, 1::2]))
    assert right_peak < 1e-9
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None:
    """
    Spec-driven behavior test:
    ESH uses only the central 1152 samples (448 to 1599), split into 8 overlapping
    windows of length 256 with 50% overlap.

    Changing samples outside [448, 1600) must therefore leave the output untouched.
    """
    rng = np.random.default_rng(2)

    # Reference frame: random central region, silent outer regions.
    inner = rng.normal(size=(1152, 2))
    quiet_edges = np.zeros((2048, 2), dtype=np.float64)
    quiet_edges[448:1600, :] = inner

    # Same central region, but with random content in both outer regions.
    noisy_edges = quiet_edges.copy()
    noisy_edges[0:448, :] = rng.normal(size=(448, 2))
    noisy_edges[1600:2048, :] = rng.normal(size=(448, 2))

    spec_quiet = filter_bank(quiet_edges, "ESH", win_type)
    spec_noisy = filter_bank(noisy_edges, "ESH", win_type)

    np.testing.assert_allclose(spec_quiet, spec_noisy, rtol=0.0, atol=0.0)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_output_is_finite(win_type: WinType) -> None:
    """
    Sanity test:
    The spectrum must not contain NaN or inf for any frame type on random input.
    """
    rng = np.random.default_rng(3)
    noise = rng.normal(size=(2048, 2)).astype(np.float64)

    all_types = ("OLS", "LSS", "ESH", "LPS")
    for ftype in all_types:
        spectrum = filter_bank(noise, ftype, win_type)
        assert np.isfinite(spectrum).all()
# ---------------------------------------------------------------------
# Reverse i_filterbank tests
# ---------------------------------------------------------------------
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_long_sequences(win_type: str) -> None:
    """
    Contract test:
    i_filter_bank maps a (1024, 2) spectrum to a (2048, 2) frame for OLS/LSS/LPS.
    """
    zero_spectrum = np.zeros((1024, 2), dtype=np.float64)
    long_types = ("OLS", "LSS", "LPS")
    for ftype in long_types:
        rebuilt = i_filter_bank(zero_spectrum, ftype, win_type)
        assert rebuilt.shape == (2048, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_esh(win_type: str) -> None:
    """
    Contract test:
    i_filter_bank maps a (128, 16) ESH spectrum to a (2048, 2) frame.
    """
    zero_spectrum = np.zeros((128, 16), dtype=np.float64)
    rebuilt = i_filter_bank(zero_spectrum, "ESH", win_type)
    assert rebuilt.shape == (2048, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_roundtrip_per_frame_is_finite(win_type: str) -> None:
    """
    Sanity test:
    A single-frame analysis + synthesis round trip must not produce NaN or inf
    for any frame type on random input.
    """
    rng = np.random.default_rng(0)
    noise = rng.normal(size=(2048, 2)).astype(np.float64)

    all_types = ("OLS", "LSS", "ESH", "LPS")
    for ftype in all_types:
        spectrum = filter_bank(noise, ftype, win_type)
        rebuilt = i_filter_bank(spectrum, ftype, win_type)
        assert np.isfinite(rebuilt).all()
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_ols_high_snr(win_type: str) -> None:
    """
    Core module-level test:
    OLS analysis + synthesis with hop=1024 must reconstruct with high SNR
    in the steady-state region.
    """
    rng = np.random.default_rng(1)

    hop = 1024
    n_frames = 6
    total = hop * (n_frames + 1)

    signal = rng.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, ["OLS"] * n_frames, win_type)

    # Skip the first and last hop, where full overlap is not available.
    steady = slice(hop, total - hop)
    snr = _snr_db(signal[steady, :], rebuilt[steady, :])
    assert snr > 50.0
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_esh_high_snr(win_type: str) -> None:
    """
    ESH analysis + synthesis with hop=1024 must reconstruct with high SNR
    in the steady-state region.
    """
    rng = np.random.default_rng(2)

    hop = 1024
    n_frames = 6
    total = hop * (n_frames + 1)

    signal = rng.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, ["ESH"] * n_frames, win_type)

    # Compare only the interior; the first and last hop are excluded.
    steady = slice(hop, total - hop)
    snr = _snr_db(signal[steady, :], rebuilt[steady, :])
    assert snr > 45.0
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_transition_sequence(win_type: str) -> None:
    """
    Transition sequence test matching the windowing logic:
      OLS -> LSS -> ESH -> LPS -> OLS -> OLS
    """
    rng = np.random.default_rng(3)

    sequence = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"]
    hop = 1024
    total = hop * (len(sequence) + 1)

    signal = rng.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, sequence, win_type)

    # Compare only the interior; the first and last hop are excluded.
    steady = slice(hop, total - hop)
    snr = _snr_db(signal[steady, :], rebuilt[steady, :])
    assert snr > 40.0

21
source/level_2/level_2.py Normal file
View File

@ -0,0 +1,21 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Level 2 Wrappers + Demo
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 2 wrapper module.
#
# This file provides:
# - Thin wrappers for Level 2 API functions (encode/decode) that delegate
# to the corresponding core implementations.
# - A demo function that runs end-to-end and computes SNR.
# - A small CLI entrypoint for convenience.
# ------------------------------------------------------------
from __future__ import annotations

4
source/pytest.ini Normal file
View File

@ -0,0 +1,4 @@
[pytest]
pythonpath = .
testpaths =
core/tests