Level_1: File restructure to support centralized development
This commit is contained in:
parent
dde11ddebe
commit
8427d0e721
198
source/core/aac_coder.py
Normal file
198
source/core/aac_coder.py
Normal file
@ -0,0 +1,198 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - AAC Coder (Core)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Level 1 AAC encoder orchestration.
|
||||||
|
# Keeps the same functional behavior as the original level_1 implementation:
|
||||||
|
# - Reads WAV via soundfile
|
||||||
|
# - Validates stereo and 48 kHz
|
||||||
|
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
|
||||||
|
# - SSC decision uses next-frame attack detection
|
||||||
|
# - Filterbank analysis (MDCT)
|
||||||
|
# - Stores per-channel spectra in AACSeq1 schema:
|
||||||
|
# * ESH: (128, 8)
|
||||||
|
# * else: (1024, 1)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from core.aac_configuration import WIN_TYPE
|
||||||
|
from core.aac_filterbank import aac_filter_bank
|
||||||
|
from core.aac_ssc import aac_SSC
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public helpers (useful for level_x demo wrappers)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
    """
    Load a WAV file with soundfile and enforce the Level-1 input constraints.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Path of the WAV file to read.

    Returns
    -------
    x : StereoSignal (np.ndarray)
        Samples converted to float64, shape (N, 2).
    fs : int
        Sampling rate in Hz (48000 whenever the function returns).

    Raises
    ------
    ValueError
        If the file does not have exactly two channels, or if its sampling
        rate is not 48 kHz.
    """
    # always_2d=True is requested so the channel count can be read from
    # shape[1] regardless of how many channels the file has.
    samples, rate = sf.read(str(Path(filename_in)), always_2d=True)
    samples = np.asarray(samples, dtype=np.float64)

    n_channels = samples.shape[1]
    if n_channels != 2:
        raise ValueError("Input must be stereo (2 channels).")

    if int(rate) != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    return samples, int(rate)
|
||||||
|
|
||||||
|
|
||||||
|
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split the stereo spectrum produced by aac_filter_bank() into the two
    per-channel arrays stored in the Level-1 AACSeq1 schema.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        Stereo spectrum:
        - frame_type == "ESH": shape (128, 16), columns interleaved as
          [L0 R0 L1 R1 ... L7 R7]
        - otherwise: shape (1024, 2), columns [L R]

    Returns
    -------
    chl_f : FrameChannelF
        Left channel coefficients, (128, 8) for ESH, (1024, 1) otherwise.
    chr_f : FrameChannelF
        Right channel coefficients, (128, 8) for ESH, (1024, 1) otherwise.

    Raises
    ------
    ValueError
        If frame_f does not have the shape required by frame_type.
    """
    if frame_type != "ESH":
        # Every non-ESH type is handled as a long frame: keep a trailing
        # singleton axis so each channel is stored as (1024, 1).
        if frame_f.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")
        left = frame_f[:, 0:1].astype(np.float64, copy=False)
        right = frame_f[:, 1:2].astype(np.float64, copy=False)
        return left, right

    if frame_f.shape != (128, 16):
        raise ValueError("For ESH, frame_f must have shape (128, 16).")

    # De-interleave: even columns belong to the left channel, odd columns to
    # the right one. np.array() makes fresh float64 copies, so the results
    # do not alias frame_f.
    left = np.array(frame_f[:, 0::2], dtype=np.float64, order="C")
    right = np.array(frame_f[:, 1::2], dtype=np.float64, order="C")
    return left, right
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 encoder
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
    """
    Level-1 AAC encoder.

    Processing steps:
    - Read a stereo 48 kHz WAV file
    - Prepend and append one hop (1024 samples) of zeros
    - Slice the padded signal into frames of 2048 samples every 1024 samples
    - Choose each frame type with aac_SSC(), which also receives the next frame
    - Run the analysis filterbank on the frame
    - Store the per-channel coefficients in the AACSeq1 schema

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.
        Assumption: stereo audio, sampling rate 48 kHz.

    Returns
    -------
    AACSeq1
        List of encoded frames (Level 1 schema). Each entry is a dict with
        the keys "frame_type", "win_type", "chl" and "chr"; the channel
        entries hold the coefficients under "frame_F".

    Raises
    ------
    ValueError
        If the computed frame count is not positive, or if a frame slice
        does not have shape (2048, 2).
    """
    # The sampling rate is validated by the reader and is not needed below.
    x, _fs = aac_read_wav_stereo_48k(filename_in)

    hop = 1024
    win = 2048

    # One hop of zeros is placed on each side of the signal.
    silence = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([silence, x, silence])

    # Count only frames that fit entirely inside the padded signal; the
    # look-ahead frame of the last one is zero-extended inside the loop.
    n_frames = int((x_pad.shape[0] - win) // hop + 1)
    if n_frames <= 0:
        raise ValueError("Input too short for framing.")

    win_type: WinType = WIN_TYPE
    prev_frame_type: FrameType = "OLS"
    aac_seq: AACSeq1 = []

    for start in range(0, n_frames * hop, hop):
        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            # Not expected given how n_frames is computed; kept as a guard.
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Look-ahead frame for the SSC decision; zero-extend it to a full
        # window when it runs past the end of the padded signal.
        next_t = x_pad[start + hop:start + hop + win, :]
        missing = win - next_t.shape[0]
        if missing > 0:
            next_t = np.vstack([next_t, np.zeros((missing, 2), dtype=np.float64)])

        frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
        frame_f = aac_filter_bank(frame_t, frame_type, win_type)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)

        encoded_frame = {
            "frame_type": frame_type,
            "win_type": win_type,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        }
        aac_seq.append(encoded_frame)

        prev_frame_type = frame_type

    return aac_seq
|
||||||
22
source/core/aac_configuration.py
Normal file
22
source/core/aac_configuration.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Configuration
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# This module contains the global configurations
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Imports
|
||||||
|
from core.aac_types import WinType
|
||||||
|
|
||||||
|
# Window type
|
||||||
|
# Options: "SIN", "KBD"
|
||||||
|
# Global window-family selection. aac_coder_1() reads this value and stores
# it in the "win_type" field of every encoded frame.
WIN_TYPE: WinType = "SIN"
|
||||||
166
source/core/aac_decoder.py
Normal file
166
source/core/aac_decoder.py
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Inverse AAC Coder (Core)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
|
||||||
|
# Keeps the same functional behavior as the original level_1 implementation:
|
||||||
|
# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank()
|
||||||
|
# - IMDCT synthesis per frame
|
||||||
|
# - Overlap-add with hop=1024
|
||||||
|
# - Remove encoder boundary padding: hop at start and hop at end
|
||||||
|
#
|
||||||
|
# Note:
|
||||||
|
# aac_decoder_1() returns the reconstructed samples and also writes them
|
||||||
|
# to the WAV file given by filename_out.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from core.aac_filterbank import aac_i_filter_bank
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public helpers (useful for level_x demo wrappers)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Interleave the per-channel spectra of the Level-1 AACSeq1 schema back
    into the stereo FrameF layout consumed by aac_i_filter_bank().

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    chl_f : FrameChannelF
        Left channel coefficients, (128, 8) for ESH, (1024, 1) otherwise.
    chr_f : FrameChannelF
        Right channel coefficients, (128, 8) for ESH, (1024, 1) otherwise.

    Returns
    -------
    FrameF
        Stereo coefficients as float64:
        - ESH: (128, 16) interleaved as [L0 R0 L1 R1 ... L7 R7]
        - else: (1024, 2) as [L R]

    Raises
    ------
    ValueError
        If either channel array does not have the shape required by
        frame_type.
    """
    if frame_type == "ESH":
        expected = (128, 8)
        message = "ESH channel frame_F must have shape (128, 8)."
    else:
        # Every non-ESH type uses the long-frame layout of the Level-1 schema.
        expected = (1024, 1)
        message = "Non-ESH channel frame_F must have shape (1024, 1)."

    if not (chl_f.shape == expected and chr_f.shape == expected):
        raise ValueError(message)

    # Both layouts interleave the channels column-wise: left on the even
    # columns, right on the odd ones.
    n_rows, n_cols = expected
    frame_f = np.empty((n_rows, 2 * n_cols), dtype=np.float64)
    frame_f[:, 0::2] = chl_f
    frame_f[:, 1::2] = chr_f
    return frame_f
|
||||||
|
|
||||||
|
|
||||||
|
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Remove the boundary padding that the Level-1 encoder adds:
    hop samples at start and hop samples at end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024). Must be non-negative.

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).
        With hop == 0 the whole stream is returned.

    Raises
    ------
    ValueError
        If hop is negative, or if y_pad is too short to unpad.
    """
    if hop < 0:
        raise ValueError("hop must be non-negative.")

    n_total = y_pad.shape[0]
    if n_total < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")

    # Use an explicit end index: y_pad[hop:-hop] would collapse to an empty
    # slice for hop == 0, because -0 is just 0.
    return y_pad[hop:n_total - hop, :]
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 decoder (core)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    Processing steps:
    - Re-pack each frame's per-channel spectra and run the inverse filterbank
    - Overlap-add the K synthesized frames (hop 1024) into the padded stream
    - Remove the hop of padding at the beginning and at the end
    - Write the reconstructed stereo signal to filename_out at 48 kHz
    - Return the reconstructed samples

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1(). Must contain at
        least one frame.
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If aac_seq_1 is empty, or if a frame's coefficient arrays do not
        have the shape required by its frame type.
    """
    filename_out = Path(filename_out)

    hop = 1024
    win = 2048
    K = len(aac_seq_1)

    # Reject an empty sequence up front. Without this check the failure
    # surfaces later as "Decoded stream too short to unpad.", which hides
    # the actual cause.
    if K == 0:
        raise ValueError("aac_seq_1 is empty: nothing to decode.")

    # The output includes the encoder padding region, so the full padded
    # stream is reconstructed first. The last frame starts at (K-1)*hop and
    # spans win samples, hence total length = (K-1)*hop + win.
    n_pad = (K - 1) * hop + win
    y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)

    for i, fr in enumerate(aac_seq_1):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]

        chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)

        frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f)
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)  # (2048, 2)

        # Overlap-add: consecutive frames overlap by win - hop samples.
        start = i * hop
        y_pad[start:start + win, :] += frame_t_hat

    y: StereoSignal = aac_remove_padding(y_pad, hop=hop)

    # Level 1 assumption: 48 kHz output.
    sf.write(str(filename_out), y, 48000)

    return y
|
||||||
454
source/core/aac_filterbank.py
Normal file
454
source/core/aac_filterbank.py
Normal file
@ -0,0 +1,454 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Filterbank module
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
from scipy.signal.windows import kaiser
|
||||||
|
|
||||||
|
# Private helpers for Filterbank
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
def _sin_window(N: int) -> Window:
    """
    Return the sinusoidal (SIN) window of length N.

    The window is defined as:
        w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N

    Parameters
    ----------
    N : int
        Window length in samples.

    Returns
    -------
    Window
        1-D array of shape (N,) with dtype float64.
    """
    # Sample positions shifted by half a sample, as in the definition above.
    shifted = np.arange(N, dtype=np.float64) + 0.5
    step = np.pi / N
    return np.sin(step * shifted)
|
||||||
|
|
||||||
|
|
||||||
|
def _kbd_window(N: int, alpha: float) -> Window:
    """
    Build a Kaiser-Bessel-Derived (KBD) window of length N.

    Construction:
    1) Build a Kaiser kernel of length (N/2 + 1).
    2) Form the left half by cumulative summation, normalization, and sqrt.
    3) Mirror the left half to form the right half (symmetric full-length window).

    Notes
    -----
    - N must be even (AAC uses N=2048 for long and N=256 for short).
    - The assignment specifies alpha=6 for long windows and alpha=4 for short windows.
    - The Kaiser beta parameter is taken as beta = pi * alpha.

    Parameters
    ----------
    N : int
        Window length in samples (must be positive and even).
    alpha : float
        KBD alpha parameter.

    Returns
    -------
    Window
        1-D array of shape (N,) with dtype float64.

    Raises
    ------
    ValueError
        If N is not a positive even integer.
    """
    # The window is assembled from two mirrored halves of length N // 2, so
    # an odd N would silently yield N - 1 samples. Reject it explicitly.
    if N <= 0 or N % 2 != 0:
        raise ValueError(f"KBD window length must be a positive even integer, got {N!r}.")

    half = N // 2

    # Kaiser kernel length: half + 1 samples (0 .. half)
    kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)

    csum = np.cumsum(kernel)
    denom = csum[-1]  # sum of the whole kernel

    w_left = np.sqrt(csum[:-1] / denom)  # length half, n = 0 .. half-1
    w_right = w_left[::-1]               # mirror for second half

    return np.concatenate([w_left, w_right])
|
||||||
|
|
||||||
|
|
||||||
|
def _long_window(win_type: WinType) -> Window:
    """
    Build the long window (2048 samples) of the requested window family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D array of shape (2048,) with dtype float64.

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    if win_type == "KBD":
        # Assignment-specific alpha for the long KBD window.
        return _kbd_window(2048, alpha=6.0)
    if win_type != "SIN":
        raise ValueError(f"Invalid win_type: {win_type!r}")
    return _sin_window(2048)
|
||||||
|
|
||||||
|
|
||||||
|
def _short_window(win_type: WinType) -> Window:
    """
    Build the short window (256 samples) of the requested window family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D array of shape (256,) with dtype float64.

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    if win_type == "KBD":
        # Assignment-specific alpha for the short KBD window.
        return _kbd_window(256, alpha=4.0)
    if win_type != "SIN":
        raise ValueError(f"Invalid win_type: {win_type!r}")
    return _sin_window(256)
|
||||||
|
|
||||||
|
|
||||||
|
def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window:
    """
    Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS frames.

    A single window family is used for both the long and the short parts
    (no mixed KBD/SIN halves).

    Layout per frame_type:
    - "OLS": the long window Wl (2048).
    - "LSS": [Wl_left(1024), ones(448), Ws_right(128), zeros(448)].
    - "LPS": [zeros(448), Ws_left(128), ones(448), Wl_right(1024)].

    Parameters
    ----------
    frame_type : FrameType
        One of "OLS", "LSS", "LPS".
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D array of shape (2048,) with dtype float64.

    Raises
    ------
    ValueError
        If win_type is invalid (raised by the window builders), or if
        frame_type is not one of "OLS", "LSS", "LPS".
    """
    # Both windows are built before frame_type is inspected, so an invalid
    # win_type is reported ahead of an invalid frame_type.
    long_win = _long_window(win_type)    # length 2048
    short_win = _short_window(win_type)  # length 256

    if frame_type == "OLS":
        return long_win

    if frame_type == "LSS":
        # Rising long half, flat top, falling short half, then silence.
        return np.concatenate([
            long_win[:1024],
            np.ones(448, dtype=np.float64),
            short_win[128:],
            np.zeros(448, dtype=np.float64),
        ])

    if frame_type == "LPS":
        # Mirror image of LSS: silence, rising short half, flat top,
        # falling long half.
        return np.concatenate([
            np.zeros(448, dtype=np.float64),
            short_win[:128],
            np.ones(448, dtype=np.float64),
            long_win[1024:],
        ])

    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _mdct(s: TimeSignal) -> MdctCoeffs:
    """
    Direct-form MDCT.

    Parameters
    ----------
    s : TimeSignal
        Windowed time samples; flattened to 1-D, length N must be 2048 or 256.

    Returns
    -------
    MdctCoeffs
        MDCT coefficients, 1-D float64 array of length N/2.

    Raises
    ------
    ValueError
        If the input length is neither 2048 nor 256.

    Definition
    ----------
    X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
    where n0 = (N/2 + 1)/2.
    """
    samples = np.asarray(s, dtype=np.float64).reshape(-1)
    N = int(samples.shape[0])
    if N != 2048 and N != 256:
        raise ValueError("MDCT input length must be 2048 or 256.")

    n0 = (N / 2.0 + 1.0) / 2.0
    time_idx = np.arange(N, dtype=np.float64) + n0         # n + n0
    freq_idx = np.arange(N // 2, dtype=np.float64) + 0.5   # k + 1/2

    # Cosine basis, shape (N, N/2): one row per time sample, one column per bin.
    phase = (2.0 * np.pi / N) * (time_idx[:, np.newaxis] * freq_idx[np.newaxis, :])
    basis = np.cos(phase)

    return 2.0 * np.dot(samples, basis)  # (N/2,)
|
||||||
|
|
||||||
|
|
||||||
|
def _imdct(X: MdctCoeffs) -> TimeSignal:
    """
    Direct-form IMDCT.

    Parameters
    ----------
    X : MdctCoeffs
        MDCT coefficients; flattened to 1-D, length K must be 1024 or 128.

    Returns
    -------
    TimeSignal
        Time samples (before synthesis windowing and overlap-add),
        1-D float64 array of length N = 2K.

    Raises
    ------
    ValueError
        If the input length is neither 1024 nor 128.

    Definition
    ----------
    s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
    where n0 = (N/2 + 1)/2.
    """
    coeffs = np.asarray(X, dtype=np.float64).reshape(-1)
    K = int(coeffs.shape[0])
    if K != 1024 and K != 128:
        raise ValueError("IMDCT input length must be 1024 or 128.")

    N = 2 * K
    n0 = (N / 2.0 + 1.0) / 2.0
    time_idx = np.arange(N, dtype=np.float64) + n0   # n + n0
    freq_idx = np.arange(K, dtype=np.float64) + 0.5  # k + 1/2

    # Cosine basis, shape (N, K): one row per time sample, one column per bin.
    phase = (2.0 * np.pi / N) * (time_idx[:, np.newaxis] * freq_idx[np.newaxis, :])
    basis = np.cos(phase)

    return (2.0 / N) * np.dot(basis, coeffs)  # (N,)
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF:
    """
    ESH analysis for one channel: eight short MDCTs over the frame centre.

    Parameters
    ----------
    x_ch : FrameChannelT
        Time-domain channel frame (expected shape: (2048,)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelF
        Array of shape (128, 8). Column j contains the 128 MDCT coefficients
        of the j-th short window.
    """
    short_win = _short_window(win_type)  # (256,)

    # The eight 256-sample segments start at 448 + 128*j (j = 0..7), so
    # consecutive segments overlap by half their length.
    spectra = [
        _mdct(x_ch[offset:offset + 256] * short_win)  # (128,)
        for offset in range(448, 448 + 8 * 128, 128)
    ]
    return np.column_stack(spectra)  # (128, 8)
|
||||||
|
|
||||||
|
|
||||||
|
def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split a packed ESH spectrum of shape (128, 16) into per-channel arrays.

    Parameters
    ----------
    frame_F : FrameF
        Packed ESH spectrum (expected shape: (128, 16)).

    Returns
    -------
    left : FrameChannelF
        Left channel spectrum, shape (128, 8).
    right : FrameChannelF
        Right channel spectrum, shape (128, 8).

    Raises
    ------
    ValueError
        If frame_F does not have shape (128, 16).

    Notes
    -----
    Inverse of the packing done in aac_filter_bank():
        packed[:, 2*j]   = left[:, j]
        packed[:, 2*j+1] = right[:, j]
    """
    if frame_F.shape != (128, 16):
        raise ValueError("ESH frame_F must have shape (128, 16).")

    # Even columns hold the left channel, odd columns the right one.
    # np.array() returns independent float64 copies.
    left = np.array(frame_F[:, 0::2], dtype=np.float64, order="C")
    right = np.array(frame_F[:, 1::2], dtype=np.float64, order="C")
    return left, right
|
||||||
|
|
||||||
|
|
||||||
|
def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT:
    """
    ESH synthesis for one channel.

    Parameters
    ----------
    X_esh : FrameChannelF
        MDCT coefficients for 8 short windows (expected shape: (128, 8)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelT
        Time-domain channel contribution, shape (2048,). The 8 short blocks
        are already overlap-added with each other; the caller only has to
        overlap-add the result with the neighbouring frames.

    Raises
    ------
    ValueError
        If X_esh does not have shape (128, 8).
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")

    short_win = _short_window(win_type)  # (256,)
    frame = np.zeros(2048, dtype=np.float64)

    # Each short IMDCT yields 256 samples placed at 448 + 128*j (j = 0..7),
    # i.e. with 50% overlap. Iterating over X_esh.T walks the columns of
    # X_esh in order.
    for offset, coeffs in zip(range(448, 448 + 8 * 128, 128), X_esh.T):
        frame[offset:offset + 256] += _imdct(coeffs) * short_win

    return frame
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public Function prototypes (Level 1)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis).

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN") used for the current frame.

    Returns
    -------
    frame_F : FrameF
        Frequency-domain MDCT coefficients:
        - frame_type in {"OLS","LSS","LPS"}: shape (1024, 2), one column
          per channel.
        - frame_type == "ESH": shape (128, 16); the 8 subframes are stored
          in order, each as a (128, 2) column pair [L R].

    Raises
    ------
    ValueError
        If frame_T does not have shape (2048, 2), or if frame_type is not
        one of the four supported values.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    left_t: FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
    right_t: FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type == "ESH":
        left_f = _filter_bank_esh_channel(left_t, win_type)    # (128, 8)
        right_f = _filter_bank_esh_channel(right_t, win_type)  # (128, 8)

        # Interleave the channels: subframe j occupies columns 2j (left)
        # and 2j + 1 (right).
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = left_f
        packed[:, 1::2] = right_f
        return packed

    if frame_type not in ("OLS", "LSS", "LPS"):
        raise ValueError(f"Invalid frame_type: {frame_type!r}")

    # Long-window frames: a single 2048-point MDCT per channel.
    window = _window_sequence(frame_type, win_type)  # length 2048
    spec_left = _mdct(left_t * window)               # length 1024
    spec_right = _mdct(right_t * window)             # length 1024
    return np.column_stack([spec_left, spec_right])  # (1024, 2)
|
||||||
|
|
||||||
|
|
||||||
|
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis).

    Parameters
    ----------
    frame_F : FrameF
        Frequency-domain MDCT coefficients as produced by aac_filter_bank():
        (1024, 2) for OLS/LSS/LPS, (128, 16) for ESH.
    frame_type : FrameType
        Frame type ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Windowed time-domain frame, stereo, shape (2048, 2), ready to be
        overlap-added by the caller.

    Raises
    ------
    ValueError
        If frame_F does not have the shape required by frame_type, or if
        frame_type is not one of the four supported values.
    """
    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")

        spec_left, spec_right = _unpack_esh(frame_F)
        time_left = _i_filter_bank_esh_channel(spec_left, win_type)
        time_right = _i_filter_bank_esh_channel(spec_right, win_type)
    elif frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")

        # The synthesis window is the same sequence used for analysis.
        window = _window_sequence(frame_type, win_type)
        time_left = _imdct(frame_F[:, 0]) * window
        time_right = _imdct(frame_F[:, 1]) * window
    else:
        raise ValueError(f"Invalid frame_type: {frame_type!r}")

    frame_t = np.empty((2048, 2), dtype=np.float64)
    frame_t[:, 0] = time_left
    frame_t[:, 1] = time_right
    return frame_t
|
||||||
217
source/core/aac_ssc.py
Normal file
217
source/core/aac_ssc.py
Normal file
@ -0,0 +1,217 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Sequence Segmentation Control module
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Sequence Segmentation Control module (SSC).
|
||||||
|
# Selects and returns the frame type based on input parameters.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Dict, Tuple
|
||||||
|
from core.aac_types import FrameType, FrameT, FrameChannelT
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Private helpers for SSC
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# See Table 1 in mm-2025-hw-v0.1.pdf
|
||||||
|
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    # Key: (left-channel decision, right-channel decision) -> common frame type.
    # Row: left = "OLS"
    ("OLS", "OLS"): "OLS", ("OLS", "LSS"): "LSS", ("OLS", "ESH"): "ESH", ("OLS", "LPS"): "LPS",
    # Row: left = "LSS"
    ("LSS", "OLS"): "LSS", ("LSS", "LSS"): "LSS", ("LSS", "ESH"): "ESH", ("LSS", "LPS"): "ESH",
    # Row: left = "ESH"
    ("ESH", "OLS"): "ESH", ("ESH", "LSS"): "ESH", ("ESH", "ESH"): "ESH", ("ESH", "LPS"): "ESH",
    # Row: left = "LPS"
    ("LPS", "OLS"): "LPS", ("LPS", "LSS"): "ESH", ("LPS", "ESH"): "ESH", ("LPS", "LPS"): "LPS",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
|
||||||
|
"""
|
||||||
|
Detect whether the *next* frame (single channel) implies an attack, i.e. ESH
|
||||||
|
according to the assignment's criterion.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
next_frame_channel : FrameChannelT
|
||||||
|
One channel of next_frame_T (expected shape: (2048,)).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
True if an attack is detected (=> next frame predicted ESH), else False.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The criterion is implemented as described in the spec:
|
||||||
|
|
||||||
|
1) Apply the high-pass filter:
|
||||||
|
H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
|
||||||
|
implemented in the time domain as:
|
||||||
|
y[n] = x[n] - x[n-1] + 0.5*y[n-1]
|
||||||
|
|
||||||
|
2) Split y into 16 segments of length 128 and compute segment energies s[l].
|
||||||
|
|
||||||
|
3) Compute the ratio:
|
||||||
|
ds[l] = s[l] / s[l-1]
|
||||||
|
|
||||||
|
4) An attack exists if there exists l in {1..7} such that:
|
||||||
|
s[l] > 1e-3 and ds[l] > 10
|
||||||
|
"""
|
||||||
|
# Local alias; expected to be a 1-D array of length 2048.
|
||||||
|
x = next_frame_channel
|
||||||
|
|
||||||
|
# High-pass filter reference implementation (scalar recurrence).
|
||||||
|
y = np.zeros_like(x)
|
||||||
|
prev_x = 0.0
|
||||||
|
prev_y = 0.0
|
||||||
|
for n in range(x.shape[0]):
|
||||||
|
xn = float(x[n])
|
||||||
|
yn = (xn - prev_x) + 0.5 * prev_y
|
||||||
|
y[n] = yn
|
||||||
|
prev_x = xn
|
||||||
|
prev_y = yn
|
||||||
|
|
||||||
|
# Segment energies over 16 blocks of 128 samples.
|
||||||
|
s = np.empty(16, dtype=np.float64)
|
||||||
|
for l in range(16):
|
||||||
|
a = l * 128
|
||||||
|
b = (l + 1) * 128
|
||||||
|
seg = y[a:b]
|
||||||
|
s[l] = float(np.sum(seg * seg))
|
||||||
|
|
||||||
|
# ds[l] for l>=1. For l=0 not defined, keep 0.
|
||||||
|
ds = np.zeros(16, dtype=np.float64)
|
||||||
|
eps = 1e-12 # Avoid division by zero without materially changing the logic.
|
||||||
|
for l in range(1, 16):
|
||||||
|
ds[l] = s[l] / max(s[l - 1], eps)
|
||||||
|
|
||||||
|
# Spec: check l in {1..7}.
|
||||||
|
for l in range(1, 8):
|
||||||
|
if (s[l] > 1e-3) and (ds[l] > 10.0):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
|
||||||
|
"""
|
||||||
|
Decide the current frame type for a single channel based on the previous
|
||||||
|
frame type and whether the next frame is predicted to be ESH.
|
||||||
|
|
||||||
|
Rules (spec):
|
||||||
|
|
||||||
|
- If prev is "LSS" => current is "ESH"
|
||||||
|
- If prev is "LPS" => current is "OLS"
|
||||||
|
- If prev is "OLS" => current is "LSS" if attack else "OLS"
|
||||||
|
- If prev is "ESH" => current is "ESH" if attack else "LPS"
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
prev_frame_type : FrameType
|
||||||
|
Previous frame type (one of "OLS", "LSS", "ESH", "LPS").
|
||||||
|
attack : bool
|
||||||
|
True if the next frame is predicted ESH for this channel.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FrameType
|
||||||
|
The per-channel decision for the current frame.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if prev_frame_type == "LSS":
|
||||||
|
return "ESH"
|
||||||
|
if prev_frame_type == "LPS":
|
||||||
|
return "OLS"
|
||||||
|
if prev_frame_type == "OLS":
|
||||||
|
return "LSS" if attack else "OLS"
|
||||||
|
if prev_frame_type == "ESH":
|
||||||
|
return "ESH" if attack else "LPS"
|
||||||
|
|
||||||
|
raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Combine the two per-channel decisions into the single frame type shared by
    both channels, by lookup in STEREO_MERGE_TABLE.

    Parameters
    ----------
    ft_l : FrameType
        Frame type decision for the left channel.
    ft_r : FrameType
        Frame type decision for the right channel.

    Returns
    -------
    FrameType
        The merged common frame type.

    Raises
    ------
    ValueError
        If the (ft_l, ft_r) pair is not a key of the merge table.
    """
    pair = (ft_l, ft_r)
    try:
        merged = STEREO_MERGE_TABLE[pair]
    except KeyError as exc:
        # Re-raise as ValueError, keeping the lookup failure as the cause.
        raise ValueError(f"Invalid stereo merge pair: {pair}") from exc
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public Function prototypes (Level 1)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).

    Choose the frame type of the current frame (i) from the attack detection
    run on the next frame (i+1) and the type of the previous frame (i-1).
    The current frame itself is only checked for its shape.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i (expected shape: (2048, 2)).
    next_frame_T : FrameT
        Next time-domain frame (i+1), used to decide transitions to/from ESH
        (expected shape: (2048, 2)).
    prev_frame_type : FrameType
        Frame type chosen for the previous frame (i-1).

    Returns
    -------
    FrameType
        One of: "OLS", "LSS", "ESH", "LPS".

    Raises
    ------
    ValueError
        If either frame does not have shape (2048, 2), or if prev_frame_type
        is rejected by the per-channel decision step.
    """
    stereo_shape = (2048, 2)
    if frame_T.shape != stereo_shape:
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != stereo_shape:
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Attack detection runs independently on each channel of the next frame.
    attacks = [_detect_attack(next_frame_T[:, ch]) for ch in (0, 1)]

    # Both channels share prev_frame_type; only the attack flag differs.
    left_type, right_type = [
        _decide_frame_type(prev_frame_type, flag) for flag in attacks
    ]

    # Reduce the two per-channel decisions with the spec's merge table.
    return _stereo_merge(left_type, right_type)
|
||||||
193
source/core/aac_types.py
Normal file
193
source/core/aac_types.py
Normal file
@ -0,0 +1,193 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Public Type Aliases
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# This module defines the public type aliases.
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import List, Literal, TypeAlias, TypedDict
|
||||||
|
import numpy as np
|
||||||
|
from numpy.typing import NDArray
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Code enums (for readability; not intended to enforce shapes/lengths)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"]
|
||||||
|
"""
|
||||||
|
Frame type codes (AAC):
|
||||||
|
- "OLS": ONLY_LONG_SEQUENCE
|
||||||
|
- "LSS": LONG_START_SEQUENCE
|
||||||
|
- "ESH": EIGHT_SHORT_SEQUENCE
|
||||||
|
- "LPS": LONG_STOP_SEQUENCE
|
||||||
|
"""
|
||||||
|
|
||||||
|
WinType: TypeAlias = Literal["KBD", "SIN"]
|
||||||
|
"""
|
||||||
|
Window type codes (AAC):
|
||||||
|
- "KBD": Kaiser-Bessel-Derived
|
||||||
|
- "SIN": sinusoid
|
||||||
|
"""
|
||||||
|
|
||||||
|
ChannelKey: TypeAlias = Literal["chl", "chr"]
|
||||||
|
"""Channel dictionary keys used in Level 1 payloads."""
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Array “semantic” aliases
|
||||||
|
#
|
||||||
|
# Goal: communicate meaning (time/frequency/window, stereo/channel) without
|
||||||
|
# forcing strict shapes in the type system.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
FloatArray: TypeAlias = NDArray[np.float64]
|
||||||
|
"""
|
||||||
|
Generic float64 NumPy array.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
- We standardize internal numeric computations to float64 for stability and
|
||||||
|
reproducibility. External I/O can still be float32, but we convert at the
|
||||||
|
boundaries.
|
||||||
|
"""
|
||||||
|
|
||||||
|
Window: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain window (weighting sequence), 1-D.
|
||||||
|
|
||||||
|
Typical lengths in this assignment:
|
||||||
|
- Long: 2048
|
||||||
|
- Short: 256
|
||||||
|
- Window sequences for LSS/LPS are also 2048
|
||||||
|
|
||||||
|
Expected shape: (N,)
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
TimeSignal: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain signal samples, typically 1-D.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
- Windowed MDCT input: shape (N,)
|
||||||
|
- IMDCT output: shape (N,)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
StereoSignal: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain stereo signal stream.
|
||||||
|
|
||||||
|
Expected (typical) shape: (N, 2)
|
||||||
|
- axis 0: time samples
|
||||||
|
- axis 1: channels [L, R]
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
MdctCoeffs: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
MDCT coefficient vector, typically 1-D.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
- Long: shape (1024,)
|
||||||
|
- Short: shape (128,)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
FrameT: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain frame (stereo), as used by the filterbank input/output.
|
||||||
|
|
||||||
|
Expected (typical) shape for stereo: (2048, 2)
|
||||||
|
- axis 0: time samples
|
||||||
|
- axis 1: channels [L, R]
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
FrameChannelT: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain single-channel frame.
|
||||||
|
|
||||||
|
Expected (typical) shape: (2048,)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
FrameF: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Frequency-domain frame (MDCT coefficients), stereo container.
|
||||||
|
|
||||||
|
Typical shapes (Level 1):
|
||||||
|
- If frame_type in {"OLS","LSS","LPS"}: (1024, 2)
|
||||||
|
- If frame_type == "ESH": (128, 16)
|
||||||
|
|
||||||
|
Rationale for ESH (128, 16):
|
||||||
|
- 8 short subframes per channel => 8 * 2 = 16 columns total
|
||||||
|
- Each short subframe per stereo is (128, 2), flattened into columns
|
||||||
|
in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R]
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
FrameChannelF: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Frequency-domain single-channel frame (MDCT coefficients).
|
||||||
|
|
||||||
|
Typical shapes (Level 1):
|
||||||
|
- If frame_type in {"OLS","LSS","LPS"}: (1024,)
|
||||||
|
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 AAC sequence payload types
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AACChannelFrameF(TypedDict):
    """
    Single-channel entry of a Level 1 frame, stored under
    aac_seq_1[i]["chl"] or aac_seq_1[i]["chr"].

    Keys
    ----
    frame_F:
        MDCT coefficients of that ONE channel.
        Typical shapes:
        - ESH:  (128, 8)  (one column per short subframe)
        - else: (1024, )
    """
    frame_F: FrameChannelF
|
||||||
|
|
||||||
|
|
||||||
|
class AACSeq1Frame(TypedDict):
    """
    One element (one frame) of the aac_seq_1 list (Level 1).

    Keys
    ----
    frame_type:
        Frame type code of this frame ("OLS", "LSS", "ESH" or "LPS").
    win_type:
        Window type code of this frame ("KBD" or "SIN").
    chl, chr:
        Per-channel payloads, each holding that channel's "frame_F".
    """
    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF
    chr: AACChannelFrameF
|
||||||
|
|
||||||
|
|
||||||
|
AACSeq1: TypeAlias = List[AACSeq1Frame]
|
||||||
|
"""
|
||||||
|
AAC sequence for Level 1:
|
||||||
|
List of length K (K = number of frames).
|
||||||
|
|
||||||
|
Each element is a dict with keys:
|
||||||
|
- "frame_type", "win_type", "chl", "chr"
|
||||||
|
"""
|
||||||
234
source/core/tests/test_SSC.py
Normal file
234
source/core/tests/test_SSC.py
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Sequence Segmentation Control Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for Sequence Segmentation Control module (SSC).
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from core.aac_ssc import aac_SSC
|
||||||
|
from core.aac_types import FrameT
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helper fixtures for SSC
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _next_frame_no_attack() -> FrameT:
|
||||||
|
"""
|
||||||
|
Build a next_frame_T that must NOT trigger ESH detection.
|
||||||
|
|
||||||
|
Uses exact zeros so all segment energies are zero and the condition
|
||||||
|
s[l] > 1e-3 cannot hold for any l.
|
||||||
|
"""
|
||||||
|
return np.zeros((2048, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
|
||||||
|
def _next_frame_strong_attack(
|
||||||
|
*,
|
||||||
|
attack_left: bool,
|
||||||
|
attack_right: bool,
|
||||||
|
segment_l: int = 4,
|
||||||
|
baseline: float = 1e-6,
|
||||||
|
burst_amp: float = 1.0,
|
||||||
|
) -> FrameT:
|
||||||
|
"""
|
||||||
|
Build a next_frame_T (2048x2) that should trigger ESH detection on selected channels.
|
||||||
|
|
||||||
|
Attack criterion (spec):
|
||||||
|
Attack exists if there exists l in {1..7} such that:
|
||||||
|
s[l] > 1e-3 and ds[l] > 10,
|
||||||
|
where s[l] is the energy of segment l (length 128) after high-pass filtering,
|
||||||
|
and ds[l] = s[l] / s[l-1].
|
||||||
|
|
||||||
|
Construction:
|
||||||
|
- A small baseline is added everywhere to avoid relying on the epsilon guard in ds,
|
||||||
|
keeping ds behavior stable/reproducible.
|
||||||
|
- A strong burst is added inside a chosen segment l in 1..7.
|
||||||
|
"""
|
||||||
|
if not (1 <= segment_l <= 7):
|
||||||
|
raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")
|
||||||
|
|
||||||
|
x = np.full((2048, 2), baseline, dtype=np.float64)
|
||||||
|
|
||||||
|
a = segment_l * 128
|
||||||
|
b = (segment_l + 1) * 128
|
||||||
|
|
||||||
|
if attack_left:
|
||||||
|
x[a:b, 0] += burst_amp
|
||||||
|
if attack_right:
|
||||||
|
x[a:b, 1] += burst_amp
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def _next_frame_below_s_threshold(
|
||||||
|
*,
|
||||||
|
left: bool,
|
||||||
|
right: bool,
|
||||||
|
segment_l: int = 4,
|
||||||
|
impulse_amp: float = 0.01,
|
||||||
|
) -> FrameT:
|
||||||
|
"""
|
||||||
|
Construct a next_frame_T where s[l] is below 1e-3, so ESH must NOT be triggered,
|
||||||
|
even if the ratio ds[l] could be large.
|
||||||
|
|
||||||
|
We place a single impulse of amplitude 'impulse_amp' inside one segment.
|
||||||
|
Approx. segment energy: s[l] ~= impulse_amp^2.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
impulse_amp = 0.01 => s[l] ~= 1e-4 < 1e-3
|
||||||
|
"""
|
||||||
|
if not (1 <= segment_l <= 7):
|
||||||
|
raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")
|
||||||
|
|
||||||
|
x = np.zeros((2048, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
idx = segment_l * 128 + 10 # inside segment l
|
||||||
|
if left:
|
||||||
|
x[idx, 0] = impulse_amp
|
||||||
|
if right:
|
||||||
|
x[idx, 1] = impulse_amp
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# 1) Fixed/mandatory cases (prev frame type forces current type)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
    """
    Forced transitions (spec), which hold regardless of the attack detection
    on frame (i+1):
      - prev LSS => current MUST be ESH
      - prev LPS => current MUST be OLS
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacking_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    forced = (("LSS", "ESH"), ("LPS", "OLS"))
    for prev_type, expected in forced:
        assert aac_SSC(current, attacking_next, prev_type) == expected
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# 2) Cases requiring next-frame ESH prediction (attack computation)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_prev_ols_next_not_esh_returns_ols() -> None:
    """
    With prev=OLS the current frame is LSS iff frame (i+1) is predicted ESH,
    otherwise OLS. A silent next frame carries no attack => expect OLS.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    result = aac_SSC(current, _next_frame_no_attack(), "OLS")
    assert result == "OLS"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
    """
    prev=OLS with an attack predicted on both channels of the next frame:
      per-channel decisions: LSS, LSS
      merged decision:       LSS
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacking_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    result = aac_SSC(current, attacking_next, "OLS")
    assert result == "LSS"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
    """
    prev=OLS with an attack on exactly one channel:
      - attacked channel decides LSS
      - quiet channel decides OLS
    Merge table: OLS + LSS => LSS, whichever side carries the attack.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # Left-only attack first, then right-only attack.
    for on_left, on_right in ((True, False), (False, True)):
        next_frame = _next_frame_strong_attack(attack_left=on_left, attack_right=on_right)
        assert aac_SSC(current, next_frame, "OLS") == "LSS"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
    """
    prev=ESH with an attack predicted on both channels of the next frame:
      per-channel decisions: ESH, ESH
      merged decision:       ESH
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacking_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    result = aac_SSC(current, attacking_next, "ESH")
    assert result == "ESH"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
    """
    prev=ESH with no attack predicted on either channel of the next frame:
      per-channel decisions: LPS, LPS
      merged decision:       LPS
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    result = aac_SSC(current, _next_frame_no_attack(), "ESH")
    assert result == "LPS"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
    """
    prev=ESH with an attack on exactly one channel:
      - attacked channel decides ESH
      - quiet channel decides LPS
    Merge table: ESH + LPS => ESH, whichever side carries the attack.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # Left-only attack first, then right-only attack.
    for on_left, on_right in ((True, False), (False, True)):
        next_frame = _next_frame_strong_attack(attack_left=on_left, attack_right=on_right)
        assert aac_SSC(current, next_frame, "ESH") == "ESH"
|
||||||
|
|
||||||
|
|
||||||
|
def test_threshold_s_must_exceed_1e_3() -> None:
    """
    Spec: the next frame is predicted ESH only if, for some l in 1..7,
        s[l] > 1e-3 AND ds[l] > 10.

    This test covers the energy threshold on its own:
      - the next frame holds a single impulse of amplitude 0.01 per channel,
        so s[l] is on the order of 1e-4, below 1e-3;
      - it must therefore not be classified as ESH, and prev=OLS yields OLS.
    """
    current: FrameT = np.zeros((2048, 2), dtype=np.float64)
    weak_next = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01)

    result = aac_SSC(current, weak_next, "OLS")
    assert result == "OLS"
|
||||||
@ -1,3 +1,16 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - AAC Coder/Decoder Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for AAC Coder/Decoder module.
|
||||||
|
# ------------------------------------------------------------
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -6,18 +19,36 @@ import numpy as np
|
|||||||
import pytest
|
import pytest
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
|
||||||
from level_1.level_1 import aac_coder_1, i_aac_coder_1
|
from core.aac_coder import aac_coder_1
|
||||||
|
from core.aac_decoder import aac_decoder_1
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
|
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def _snr_db(x_ref: np.ndarray, x_hat: np.ndarray) -> float:
|
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||||
"""
|
"""
|
||||||
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x_ref : StereoSignal
|
||||||
|
Reference signal, shape (N, 2) typical.
|
||||||
|
x_hat : StereoSignal
|
||||||
|
Reconstructed signal, shape (M, 2) typical.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
float
|
||||||
|
SNR in dB.
|
||||||
|
- Returns +inf if noise power is zero.
|
||||||
|
- Returns -inf if signal power is zero.
|
||||||
"""
|
"""
|
||||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||||
x_hat = np.asarray(x_hat, dtype=np.float64)
|
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||||
|
|
||||||
|
# Be conservative: align lengths and common channels.
|
||||||
if x_ref.ndim == 1:
|
if x_ref.ndim == 1:
|
||||||
x_ref = x_ref.reshape(-1, 1)
|
x_ref = x_ref.reshape(-1, 1)
|
||||||
if x_hat.ndim == 1:
|
if x_hat.ndim == 1:
|
||||||
@ -36,7 +67,7 @@ def _snr_db(x_ref: np.ndarray, x_hat: np.ndarray) -> float:
|
|||||||
if pn <= 0.0:
|
if pn <= 0.0:
|
||||||
return float("inf")
|
return float("inf")
|
||||||
if ps <= 0.0:
|
if ps <= 0.0:
|
||||||
return -float("inf")
|
return float("-inf")
|
||||||
|
|
||||||
return float(10.0 * np.log10(ps / pn))
|
return float(10.0 * np.log10(ps / pn))
|
||||||
|
|
||||||
@ -49,9 +80,9 @@ def tmp_stereo_wav(tmp_path: Path) -> Path:
|
|||||||
rng = np.random.default_rng(123)
|
rng = np.random.default_rng(123)
|
||||||
fs = 48000
|
fs = 48000
|
||||||
|
|
||||||
# ~1 second of audio, keep small for test speed
|
# ~1 second of audio (kept small for test speed).
|
||||||
n = fs
|
n = fs
|
||||||
x = rng.normal(size=(n, 2)).astype(np.float64)
|
x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64)
|
||||||
|
|
||||||
wav_path = tmp_path / "in.wav"
|
wav_path = tmp_path / "in.wav"
|
||||||
sf.write(str(wav_path), x, fs)
|
sf.write(str(wav_path), x, fs)
|
||||||
@ -63,7 +94,7 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
|||||||
Module-level contract test:
|
Module-level contract test:
|
||||||
Ensure aac_seq_1 follows the expected schema and per-frame shapes.
|
Ensure aac_seq_1 follows the expected schema and per-frame shapes.
|
||||||
"""
|
"""
|
||||||
aac_seq = aac_coder_1(tmp_stereo_wav)
|
aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav)
|
||||||
|
|
||||||
assert isinstance(aac_seq, list)
|
assert isinstance(aac_seq, list)
|
||||||
assert len(aac_seq) > 0
|
assert len(aac_seq) > 0
|
||||||
@ -88,8 +119,8 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
|||||||
assert "frame_F" in fr["chl"]
|
assert "frame_F" in fr["chl"]
|
||||||
assert "frame_F" in fr["chr"]
|
assert "frame_F" in fr["chr"]
|
||||||
|
|
||||||
chl_f = np.asarray(fr["chl"]["frame_F"])
|
chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
|
||||||
chr_f = np.asarray(fr["chr"]["frame_F"])
|
chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
|
||||||
|
|
||||||
if frame_type == "ESH":
|
if frame_type == "ESH":
|
||||||
assert chl_f.shape == (128, 8)
|
assert chl_f.shape == (128, 8)
|
||||||
@ -101,23 +132,25 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
|||||||
|
|
||||||
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
|
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
|
||||||
"""
|
"""
|
||||||
End-to-end module test:
|
End-to-end test:
|
||||||
Encode + decode and check SNR is very high (numerical-noise only).
|
Encode + decode and check SNR is very high (numerical-noise only).
|
||||||
Threshold is intentionally loose to avoid fragility.
|
|
||||||
|
The threshold is intentionally loose to avoid fragility across platforms/BLAS.
|
||||||
"""
|
"""
|
||||||
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
|
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
|
||||||
assert fs == 48000
|
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||||
|
assert int(fs) == 48000
|
||||||
|
|
||||||
out_wav = tmp_path / "out.wav"
|
out_wav = tmp_path / "out.wav"
|
||||||
|
|
||||||
aac_seq = aac_coder_1(tmp_stereo_wav)
|
aac_seq = aac_coder_1(tmp_stereo_wav)
|
||||||
x_hat = i_aac_coder_1(aac_seq, out_wav)
|
x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav)
|
||||||
|
|
||||||
# Basic sanity: output file exists and is readable
|
# Basic sanity: output file exists and is readable
|
||||||
assert out_wav.exists()
|
assert out_wav.exists()
|
||||||
x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True)
|
x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True)
|
||||||
assert fs_hat == 48000
|
assert int(fs_hat) == 48000
|
||||||
|
|
||||||
# SNR computed against the array returned by i_aac_coder_1 (should match file, but not required)
|
# SNR against returned array (file should match closely, but we do not require it here).
|
||||||
snr = _snr_db(x_ref, x_hat)
|
snr = _snr_db(x_ref, x_hat)
|
||||||
assert snr > 80.0
|
assert snr > 80.0
|
||||||
269
source/core/tests/test_filterbank.py
Normal file
269
source/core/tests/test_filterbank.py
Normal file
@ -0,0 +1,269 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Filterbank Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for Filterbank module.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Sequence
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
# Helper fixtures for filterbank
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal:
|
||||||
|
"""
|
||||||
|
Analyze-synthesize each frame and overlap-add with hop=1024.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : StereoSignal
|
||||||
|
Input stereo stream, expected shape (N, 2).
|
||||||
|
frame_types : Sequence[FrameType]
|
||||||
|
Length K sequence of frame types for frames starting at i*1024.
|
||||||
|
win_type : WinType
|
||||||
|
Window type ("SIN" or "KBD").
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
StereoSignal
|
||||||
|
Reconstructed stereo stream, same shape as x (N, 2).
|
||||||
|
"""
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
K = len(frame_types)
|
||||||
|
|
||||||
|
y: StereoSignal = np.zeros_like(x, dtype=np.float64)
|
||||||
|
|
||||||
|
for i in range(K):
|
||||||
|
start = i * hop
|
||||||
|
frame_t: FrameT = x[start:start + win, :]
|
||||||
|
frame_f: FrameF = aac_filter_bank(frame_t, frame_types[i], win_type)
|
||||||
|
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_types[i], win_type)
|
||||||
|
y[start:start + win, :] += frame_t_hat
|
||||||
|
|
||||||
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
def _snr_db(x: StereoSignal, y: StereoSignal) -> float:
|
||||||
|
"""
|
||||||
|
Compute SNR in dB over all samples/channels.
|
||||||
|
"""
|
||||||
|
err = x - y
|
||||||
|
ps = float(np.sum(x * x))
|
||||||
|
pn = float(np.sum(err * err))
|
||||||
|
if pn <= 0.0:
|
||||||
|
return float("inf")
|
||||||
|
if ps <= 0.0:
|
||||||
|
return float("-inf")
|
||||||
|
return 10.0 * float(np.log10(ps / pn))
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Forward filterbank tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None:
    """
    Contract test: every long-sequence frame type (OLS/LSS/LPS) must yield
    a (1024, 2) spectrum from aac_filter_bank.
    """
    silence: FrameT = np.zeros((2048, 2), dtype=np.float64)

    spectrum = aac_filter_bank(silence, frame_type, win_type)

    assert spectrum.shape == (1024, 2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: an ESH frame must yield a (128, 16) spectrum from
    aac_filter_bank.
    """
    silence: FrameT = np.zeros((2048, 2), dtype=np.float64)

    spectrum = aac_filter_bank(silence, "ESH", win_type)

    assert spectrum.shape == (128, 16)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None:
    """
    Behavior test: with OLS as the representative long-sequence type, the two
    channels are processed independently.

    Left channel carries noise, right channel is silent; the right-channel
    spectrum must therefore stay (numerically) zero.
    """
    rng = np.random.default_rng(0)

    stereo: FrameT = np.zeros((2048, 2), dtype=np.float64)
    stereo[:, 0] = rng.normal(size=2048)

    spectrum = aac_filter_bank(stereo, "OLS", win_type)
    right_spectrum = spectrum[:, 1]

    assert np.max(np.abs(right_spectrum)) < 1e-9
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_esh(win_type: WinType) -> None:
    """
    Behavior test: ESH frames also keep the channels independent.

    Left channel carries noise, right channel is silent; every odd column of
    the packed output (the right-channel columns) must stay numerically zero.
    """
    rng = np.random.default_rng(1)

    stereo: FrameT = np.zeros((2048, 2), dtype=np.float64)
    stereo[:, 0] = rng.normal(size=2048)

    spectrum = aac_filter_bank(stereo, "ESH", win_type)

    # Odd columns 1, 3, ..., 15 hold the right channel of the 8 short windows.
    right_cols = spectrum[:, 1::2]
    assert np.max(np.abs(right_cols)) < 1e-9
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None:
    """
    Spec-driven behavior test.

    ESH analysis only looks at the central region [448, 1600) of the frame,
    which is split into 8 windows of length 256 with 50% overlap. Two frames
    that agree on that region but differ outside it must give the same output.
    """
    rng = np.random.default_rng(2)

    # Frame A: shared centre, silent borders.
    frame_a: FrameT = np.zeros((2048, 2), dtype=np.float64)
    frame_a[448:1600, :] = rng.normal(size=(1152, 2))

    # Frame B: same centre, random borders (drawn in the same order: head, tail).
    frame_b: FrameT = frame_a.copy()
    frame_b[0:448, :] = rng.normal(size=(448, 2))
    frame_b[1600:2048, :] = rng.normal(size=(448, 2))

    spec_a = aac_filter_bank(frame_a, "ESH", win_type)
    spec_b = aac_filter_bank(frame_b, "ESH", win_type)

    # A tiny absolute tolerance keeps the test robust to floating-point noise.
    np.testing.assert_allclose(spec_a, spec_b, rtol=0.0, atol=1e-12)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_output_is_finite(win_type: WinType) -> None:
    """
    Sanity test: analysing a random frame must never produce NaN or inf,
    whatever the frame type.
    """
    rng = np.random.default_rng(3)
    noise: FrameT = rng.normal(size=(2048, 2)).astype(np.float64)

    all_frame_types = ("OLS", "LSS", "ESH", "LPS")
    for frame_type in all_frame_types:
        spectrum = aac_filter_bank(noise, frame_type, win_type)
        assert np.isfinite(spectrum).all()
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Reverse i_filterbank tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None:
    """
    Contract test: synthesising a (1024, 2) spectrum as OLS, LSS or LPS must
    give back a (2048, 2) time-domain frame.
    """
    zero_spectrum: FrameF = np.zeros((1024, 2), dtype=np.float64)

    long_frame_types = ("OLS", "LSS", "LPS")
    for frame_type in long_frame_types:
        frame_t = aac_i_filter_bank(zero_spectrum, frame_type, win_type)
        assert frame_t.shape == (2048, 2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: synthesising a packed (128, 16) ESH spectrum must give
    back a (2048, 2) time-domain frame.
    """
    zero_spectrum: FrameF = np.zeros((128, 16), dtype=np.float64)

    frame_t = aac_i_filter_bank(zero_spectrum, "ESH", win_type)

    assert frame_t.shape == (2048, 2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None:
    """
    Sanity test: analysis followed by synthesis of a single random frame must
    give finite samples for every frame type.
    """
    rng = np.random.default_rng(0)
    noise: FrameT = rng.normal(size=(2048, 2)).astype(np.float64)

    all_frame_types = ("OLS", "LSS", "ESH", "LPS")
    for frame_type in all_frame_types:
        spectrum = aac_filter_bank(noise, frame_type, win_type)
        resynth = aac_i_filter_bank(spectrum, frame_type, win_type)
        assert np.isfinite(resynth).all()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None:
    """
    Module-level test: an all-OLS analysis/synthesis chain with hop=1024 must
    reconstruct the steady-state part of the signal with high SNR.
    """
    rng = np.random.default_rng(1)

    n_frames = 6
    n_samples = 1024 * (n_frames + 1)
    x: StereoSignal = rng.normal(size=(n_samples, 2)).astype(np.float64)

    y = _ola_reconstruct(x, ["OLS"] * n_frames, win_type)

    # The first and last hop are covered by a single frame only; skip them.
    steady = slice(1024, n_samples - 1024)
    snr = _snr_db(x[steady, :], y[steady, :])
    assert snr > 50.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None:
    """
    Module-level test: an all-ESH analysis/synthesis chain with hop=1024 must
    reconstruct the steady-state part of the signal with high SNR.
    """
    rng = np.random.default_rng(2)

    n_frames = 6
    n_samples = 1024 * (n_frames + 1)
    x: StereoSignal = rng.normal(size=(n_samples, 2)).astype(np.float64)

    y = _ola_reconstruct(x, ["ESH"] * n_frames, win_type)

    # The first and last hop are covered by a single frame only; skip them.
    steady = slice(1024, n_samples - 1024)
    snr = _snr_db(x[steady, :], y[steady, :])
    assert snr > 45.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None:
    """
    Transition sequence test that follows the windowing logic:
    OLS -> LSS -> ESH -> LPS -> OLS -> OLS
    """
    rng = np.random.default_rng(3)

    frame_types: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"]
    n_samples = 1024 * (len(frame_types) + 1)
    x: StereoSignal = rng.normal(size=(n_samples, 2)).astype(np.float64)

    y = _ola_reconstruct(x, frame_types, win_type)

    # The first and last hop are covered by a single frame only; skip them.
    steady = slice(1024, n_samples - 1024)
    snr = _snr_db(x[steady, :], y[steady, :])
    assert snr > 40.0
|
||||||
@ -1,16 +1,33 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Filterbank internal (mdct) Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for Filterbank internal MDCT/IMDCT functionality.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from level_1.level_1 import _imdct, _mdct
|
from core.aac_filterbank import _imdct, _mdct
|
||||||
|
from core.aac_types import FloatArray, TimeSignal, MdctCoeffs
|
||||||
|
|
||||||
# Helper "fixtures" for filterbank internals (MDCT/IMDCT)
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _assert_allclose(a: np.ndarray, b: np.ndarray, *, rtol: float, atol: float) -> None:
|
def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None:
|
||||||
# Helper for consistent tolerances across tests.
|
"""
|
||||||
|
Helper for consistent tolerances across tests.
|
||||||
|
"""
|
||||||
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
|
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
|
||||||
|
|
||||||
def _estimate_gain(y: np.ndarray, x: np.ndarray) -> float:
|
|
||||||
|
def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float:
|
||||||
"""
|
"""
|
||||||
Estimate scalar gain g such that y ~= g*x in least-squares sense.
|
Estimate scalar gain g such that y ~= g*x in least-squares sense.
|
||||||
"""
|
"""
|
||||||
@ -28,18 +45,18 @@ def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None:
|
|||||||
Consistency test in coefficient domain:
|
Consistency test in coefficient domain:
|
||||||
mdct(imdct(X)) ~= g * X
|
mdct(imdct(X)) ~= g * X
|
||||||
|
|
||||||
For our chosen (non-orthonormal) scaling, g is expected to be close to 2.
|
For the chosen (non-orthonormal) scaling, g is expected to be close to 2.
|
||||||
"""
|
"""
|
||||||
rng = np.random.default_rng(0)
|
rng = np.random.default_rng(0)
|
||||||
K = N // 2
|
K = N // 2
|
||||||
|
|
||||||
X = rng.normal(size=K).astype(np.float64)
|
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
|
||||||
x = _imdct(X)
|
x: TimeSignal = _imdct(X)
|
||||||
X_hat = _mdct(x)
|
X_hat: MdctCoeffs = _mdct(x)
|
||||||
|
|
||||||
g = _estimate_gain(X_hat, X)
|
g = _estimate_gain(X_hat, X)
|
||||||
_assert_allclose(X_hat, g * X, rtol=tolerance, atol=tolerance)
|
_assert_allclose(X_hat, g * X, rtol=tolerance, atol=tolerance)
|
||||||
_assert_allclose(np.array([g]), np.array([2.0]), rtol=tolerance, atol=tolerance)
|
_assert_allclose(np.array([g], dtype=np.float64), np.array([2.0], dtype=np.float64), rtol=tolerance, atol=tolerance)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("N", [256, 2048])
|
@pytest.mark.parametrize("N", [256, 2048])
|
||||||
@ -47,18 +64,16 @@ def test_mdct_linearity(N: int) -> None:
|
|||||||
"""
|
"""
|
||||||
Linearity test:
|
Linearity test:
|
||||||
mdct(a*x + b*y) == a*mdct(x) + b*mdct(y)
|
mdct(a*x + b*y) == a*mdct(x) + b*mdct(y)
|
||||||
|
|
||||||
This should hold up to numerical error.
|
|
||||||
"""
|
"""
|
||||||
rng = np.random.default_rng(1)
|
rng = np.random.default_rng(1)
|
||||||
x = rng.normal(size=N).astype(np.float64)
|
x: TimeSignal = rng.normal(size=N).astype(np.float64)
|
||||||
y = rng.normal(size=N).astype(np.float64)
|
y: TimeSignal = rng.normal(size=N).astype(np.float64)
|
||||||
|
|
||||||
a = 0.37
|
a = 0.37
|
||||||
b = -1.12
|
b = -1.12
|
||||||
|
|
||||||
left = _mdct(a * x + b * y)
|
left: MdctCoeffs = _mdct(a * x + b * y)
|
||||||
right = a * _mdct(x) + b * _mdct(y)
|
right: MdctCoeffs = a * _mdct(x) + b * _mdct(y)
|
||||||
|
|
||||||
_assert_allclose(left, right, rtol=tolerance, atol=tolerance)
|
_assert_allclose(left, right, rtol=tolerance, atol=tolerance)
|
||||||
|
|
||||||
@ -72,14 +87,14 @@ def test_imdct_linearity(N: int) -> None:
|
|||||||
rng = np.random.default_rng(2)
|
rng = np.random.default_rng(2)
|
||||||
K = N // 2
|
K = N // 2
|
||||||
|
|
||||||
X = rng.normal(size=K).astype(np.float64)
|
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
|
||||||
Y = rng.normal(size=K).astype(np.float64)
|
Y: MdctCoeffs = rng.normal(size=K).astype(np.float64)
|
||||||
|
|
||||||
a = -0.5
|
a = -0.5
|
||||||
b = 2.0
|
b = 2.0
|
||||||
|
|
||||||
left = _imdct(a * X + b * Y)
|
left: TimeSignal = _imdct(a * X + b * Y)
|
||||||
right = a * _imdct(X) + b * _imdct(Y)
|
right: TimeSignal = a * _imdct(X) + b * _imdct(Y)
|
||||||
|
|
||||||
_assert_allclose(left, right, rtol=tolerance, atol=tolerance)
|
_assert_allclose(left, right, rtol=tolerance, atol=tolerance)
|
||||||
|
|
||||||
@ -92,8 +107,8 @@ def test_mdct_imdct_outputs_are_finite(N: int) -> None:
|
|||||||
rng = np.random.default_rng(3)
|
rng = np.random.default_rng(3)
|
||||||
K = N // 2
|
K = N // 2
|
||||||
|
|
||||||
x = rng.normal(size=N).astype(np.float64)
|
x: TimeSignal = rng.normal(size=N).astype(np.float64)
|
||||||
X = rng.normal(size=K).astype(np.float64)
|
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
|
||||||
|
|
||||||
X1 = _mdct(x)
|
X1 = _mdct(x)
|
||||||
x1 = _imdct(X)
|
x1 = _imdct(X)
|
||||||
198
source/level_1/core/aac_coder.py
Normal file
198
source/level_1/core/aac_coder.py
Normal file
@ -0,0 +1,198 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - AAC Coder (Core)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Level 1 AAC encoder orchestration.
|
||||||
|
# Keeps the same functional behavior as the original level_1 implementation:
|
||||||
|
# - Reads WAV via soundfile
|
||||||
|
# - Validates stereo and 48 kHz
|
||||||
|
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
|
||||||
|
# - SSC decision uses next-frame attack detection
|
||||||
|
# - Filterbank analysis (MDCT)
|
||||||
|
# - Stores per-channel spectra in AACSeq1 schema:
|
||||||
|
# * ESH: (128, 8)
|
||||||
|
# * else: (1024, 1)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from core.aac_configuration import WIN_TYPE
|
||||||
|
from core.aac_filterbank import aac_filter_bank
|
||||||
|
from core.aac_ssc import aac_SSC
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public helpers (useful for level_x demo wrappers)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
    """
    Load a WAV file through soundfile and enforce the Level-1 input contract.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Path of the WAV file to read.

    Returns
    -------
    x : StereoSignal (np.ndarray)
        Samples converted to float64, shape (N, 2).
    fs : int
        Sampling rate in Hz (always 48000 on success).

    Raises
    ------
    ValueError
        If the file does not have exactly two channels, or its sampling
        rate is not 48 kHz.
    """
    wav_path = Path(filename_in)

    # always_2d guarantees a (N, channels) array, so shape[1] is always valid.
    samples, rate = sf.read(str(wav_path), always_2d=True)
    samples = np.asarray(samples, dtype=np.float64)
    rate = int(rate)

    n_channels = samples.shape[1]
    if n_channels != 2:
        raise ValueError("Input must be stereo (2 channels).")
    if rate != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    return samples, rate
|
||||||
|
|
||||||
|
|
||||||
|
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Convert the stereo FrameF returned by aac_filter_bank() into per-channel arrays
    as required by the Level-1 AACSeq1 schema.

    Both returned arrays are independent float64 copies; they never alias
    frame_f, regardless of the frame type.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        Output of aac_filter_bank():
        - If frame_type != "ESH": shape (1024, 2)
        - If frame_type == "ESH": shape (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]

    Returns
    -------
    chl_f : FrameChannelF
        Left channel coefficients:
        - ESH: shape (128, 8)
        - else: shape (1024, 1)
    chr_f : FrameChannelF
        Right channel coefficients:
        - ESH: shape (128, 8)
        - else: shape (1024, 1)

    Raises
    ------
    ValueError
        If frame_f does not have the shape required by frame_type.
    """
    frame_f = np.asarray(frame_f)

    if frame_type == "ESH":
        if frame_f.shape != (128, 16):
            raise ValueError("For ESH, frame_f must have shape (128, 16).")

        # Even columns are the left channel, odd columns the right channel.
        chl_f = np.array(frame_f[:, 0::2], dtype=np.float64)
        chr_f = np.array(frame_f[:, 1::2], dtype=np.float64)
        return chl_f, chr_f

    # Non-ESH: store as (1024, 1) as required by the original Level-1 schema.
    if frame_f.shape != (1024, 2):
        raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")

    # Slicing with 0:1 / 1:2 (not 0 / 1) keeps the trailing axis of length 1.
    chl_f = np.array(frame_f[:, 0:1], dtype=np.float64)
    chr_f = np.array(frame_f[:, 1:2], dtype=np.float64)
    return chl_f, chr_f
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 encoder
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
    """
    Level-1 AAC encoder.

    This function preserves the behavior of the original level_1 implementation:
    - Read stereo 48 kHz WAV
    - Pad hop samples at start and hop samples at end
    - Frame with win=2048, hop=1024
    - Use SSC with next-frame lookahead
    - Apply filterbank analysis
    - Store per-channel coefficients using AACSeq1 schema

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.
        Assumption: stereo audio, sampling rate 48 kHz.

    Returns
    -------
    AACSeq1
        List of encoded frames (Level 1 schema). Each entry is a dict with
        keys "frame_type", "win_type", "chl" and "chr"; the channel entries
        hold the coefficients under "frame_F".

    Raises
    ------
    ValueError
        If the input is not stereo / 48 kHz (raised by the reader), or if
        framing fails.

    Notes
    -----
    The number of frames is K = N // hop + 1 for N input samples. When N is
    not a multiple of hop, the last N % hop samples are covered by a single
    frame only, so they cannot be recovered by overlap-add.
    """
    # The reader already validates the rate; Level 1 does not use it further.
    x, _fs = aac_read_wav_stereo_48k(filename_in)

    hop = 1024
    win = 2048

    # One hop of zeros on each side so the first and last input samples are
    # covered by two overlapping frames (as long as N is a multiple of hop).
    x_pad = np.pad(x, ((hop, hop), (0, 0)))

    # Number of full frames that fit in the padded stream. The look-ahead
    # frame of the last iteration is zero-extended on the fly below.
    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq1 = []
    prev_frame_type: FrameType = "OLS"

    win_type: WinType = WIN_TYPE

    for i in range(K):
        start = i * hop

        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            # This should not happen due to K definition, but keep it explicit.
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        next_t = x_pad[start + hop:start + hop + win, :]

        # Ensure next_t is always (2048, 2) by zero-padding at the tail.
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
        frame_f = aac_filter_bank(frame_t, frame_type, win_type)

        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)

        aac_seq.append({
            "frame_type": frame_type,
            "win_type": win_type,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        })

        # The SSC decision of the next frame depends on this frame's type.
        prev_frame_type = frame_type

    return aac_seq
|
||||||
22
source/level_1/core/aac_configuration.py
Normal file
22
source/level_1/core/aac_configuration.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Configuration
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# This module contains the global configurations
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Imports
|
||||||
|
from core.aac_types import WinType
|
||||||
|
|
||||||
|
# Window type
|
||||||
|
# Options: "SIN", "KBD"
|
||||||
|
WIN_TYPE: WinType = "SIN"
|
||||||
166
source/level_1/core/aac_decoder.py
Normal file
166
source/level_1/core/aac_decoder.py
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Inverse AAC Coder (Core)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
|
||||||
|
# Keeps the same functional behavior as the original level_1 implementation:
|
||||||
|
# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank()
|
||||||
|
# - IMDCT synthesis per frame
|
||||||
|
# - Overlap-add with hop=1024
|
||||||
|
# - Remove encoder boundary padding: hop at start and hop at end
|
||||||
|
#
|
||||||
|
# Note:
|
||||||
|
# aac_decoder_1() returns the reconstructed samples and also writes them
# to filename_out as a 48 kHz stereo WAV file.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from core.aac_filterbank import aac_i_filter_bank
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public helpers (useful for level_x demo wrappers)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo
    FrameF container expected by aac_i_filter_bank().

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    chl_f : FrameChannelF
        Left channel coefficients:
        - ESH: (128, 8)
        - else: (1024, 1)
    chr_f : FrameChannelF
        Right channel coefficients:
        - ESH: (128, 8)
        - else: (1024, 1)

    Returns
    -------
    FrameF
        Newly allocated float64 stereo coefficients:
        - ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
        - else: (1024, 2)

    Raises
    ------
    ValueError
        If either channel array does not have the shape required by frame_type.
    """
    chl_f = np.asarray(chl_f)
    chr_f = np.asarray(chr_f)

    if frame_type == "ESH":
        if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
            raise ValueError("ESH channel frame_F must have shape (128, 8).")

        # Interleave: left goes to even columns, right to odd columns.
        frame_f = np.empty((128, 16), dtype=np.float64)
        frame_f[:, 0::2] = chl_f
        frame_f[:, 1::2] = chr_f
        return frame_f

    # Non-ESH: expected (1024, 1) per channel in Level-1 schema.
    if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
        raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")

    frame_f = np.empty((1024, 2), dtype=np.float64)
    frame_f[:, 0:1] = chl_f
    frame_f[:, 1:2] = chr_f
    return frame_f
|
||||||
|
|
||||||
|
|
||||||
|
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Remove the boundary padding that the Level-1 encoder adds:
    hop samples at start and hop samples at end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024). Must be non-negative;
        hop == 0 returns the whole stream.

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).
        This is a slice (view) of y_pad, not a copy.

    Raises
    ------
    ValueError
        If hop is negative or y_pad is too short to unpad.
    """
    if hop < 0:
        raise ValueError("hop must be non-negative.")

    n_total = y_pad.shape[0]
    if n_total < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")

    # Use an explicit end index: y_pad[hop:-hop] would be empty for hop == 0
    # because -0 == 0.
    return y_pad[hop:n_total - hop, :]
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 decoder (core)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    This function preserves the behavior of the original level_1 implementation:
    - Reconstruct the full padded stream by overlap-adding K synthesized frames
    - Remove hop padding at the beginning and hop padding at the end
    - Write the reconstructed stereo WAV file (48 kHz)
    - Return reconstructed stereo samples as float64

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1(). Must contain at
        least one frame.
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
        N equals (K - 1) * 1024 for a sequence of K frames.

    Raises
    ------
    ValueError
        If the sequence is empty, or a frame's channel coefficients do not
        have the shape required by its frame type.
    """
    filename_out = Path(filename_out)

    hop = 1024
    win = 2048
    K = len(aac_seq_1)

    # Reject an empty sequence up front, before any synthesis or file output,
    # with a message that names the real problem.
    if K == 0:
        raise ValueError("Cannot decode an empty AAC sequence.")

    # Output includes the encoder padding region, so we reconstruct the full padded stream.
    # For K frames: last frame starts at (K-1)*hop and spans win,
    # so total length = (K-1)*hop + win.
    n_pad = (K - 1) * hop + win
    y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)

    for i, fr in enumerate(aac_seq_1):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]

        chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)

        frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f)
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)  # (2048, 2)

        # Overlap-add: consecutive frames share half a window.
        start = i * hop
        y_pad[start:start + win, :] += frame_t_hat

    y: StereoSignal = aac_remove_padding(y_pad, hop=hop)

    # Level 1 assumption: 48 kHz output.
    sf.write(str(filename_out), y, 48000)

    return y
|
||||||
454
source/level_1/core/aac_filterbank.py
Normal file
454
source/level_1/core/aac_filterbank.py
Normal file
@ -0,0 +1,454 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Filterbank module
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
from scipy.signal.windows import kaiser
|
||||||
|
|
||||||
|
# Private helpers for Filterbank
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
def _sin_window(N: int) -> Window:
|
||||||
|
"""
|
||||||
|
Build a sinusoidal (SIN) window of length N.
|
||||||
|
|
||||||
|
The AAC sinusoid window is:
|
||||||
|
w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
N : int
|
||||||
|
Window length in samples.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Window
|
||||||
|
1-D array of shape (N, ) with dtype float64.
|
||||||
|
"""
|
||||||
|
n = np.arange(N, dtype=np.float64)
|
||||||
|
return np.sin((np.pi / N) * (n + 0.5))
|
||||||
|
|
||||||
|
|
||||||
|
def _kbd_window(N: int, alpha: float) -> Window:
|
||||||
|
"""
|
||||||
|
Build a Kaiser-Bessel-Derived (KBD) window of length N.
|
||||||
|
|
||||||
|
This follows the standard KBD construction used in AAC:
|
||||||
|
1) Build a Kaiser kernel of length (N/2 + 1).
|
||||||
|
2) Form the left half by cumulative summation, normalization, and sqrt.
|
||||||
|
3) Mirror the left half to form the right half (symmetric full-length window).
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
- N must be even (AAC uses N=2048 for long and N=256 for short).
|
||||||
|
- The assignment specifies alpha=6 for long windows and alpha=4 for short windows.
|
||||||
|
- The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
N : int
|
||||||
|
Window length in samples (must be even).
|
||||||
|
alpha : float
|
||||||
|
KBD alpha parameter.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Window
|
||||||
|
1-D array of shape (N,) with dtype float64.
|
||||||
|
"""
|
||||||
|
half = N // 2
|
||||||
|
|
||||||
|
# Kaiser kernel length: half + 1 samples (0 .. half)
|
||||||
|
# beta = pi * alpha per the usual correspondence with the ISO definition
|
||||||
|
kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
|
||||||
|
|
||||||
|
csum = np.cumsum(kernel)
|
||||||
|
denom = csum[-1]
|
||||||
|
|
||||||
|
w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1
|
||||||
|
w_right = w_left[::-1] # mirror for second half
|
||||||
|
|
||||||
|
return np.concatenate([w_left, w_right])
|
||||||
|
|
||||||
|
|
||||||
|
def _long_window(win_type: WinType) -> Window:
    """
    Produce the 2048-sample long window of the requested family.

    Parameters
    ----------
    win_type : WinType
        "SIN" for the sinusoidal window, "KBD" for the Kaiser-Bessel-Derived
        window (built with alpha = 6.0).

    Returns
    -------
    Window
        float64 array of shape (2048,).

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    long_len = 2048

    if win_type == "SIN":
        window = _sin_window(long_len)
    elif win_type == "KBD":
        # alpha = 6.0 is the value this project uses for long KBD windows.
        window = _kbd_window(long_len, alpha=6.0)
    else:
        raise ValueError(f"Invalid win_type: {win_type!r}")

    return window
|
||||||
|
|
||||||
|
|
||||||
|
def _short_window(win_type: WinType) -> Window:
    """
    Produce the 256-sample short window of the requested family.

    Parameters
    ----------
    win_type : WinType
        "SIN" for the sinusoidal window, "KBD" for the Kaiser-Bessel-Derived
        window (built with alpha = 4.0).

    Returns
    -------
    Window
        float64 array of shape (256,).

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    short_len = 256

    if win_type == "SIN":
        window = _sin_window(short_len)
    elif win_type == "KBD":
        # alpha = 4.0 is the value this project uses for short KBD windows.
        window = _kbd_window(short_len, alpha=4.0)
    else:
        raise ValueError(f"Invalid win_type: {win_type!r}")

    return window
|
||||||
|
|
||||||
|
|
||||||
|
def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window:
    """
    Assemble the 2048-sample window used for the long frame types.

    Both the long and the short prototype windows come from the same family
    (`win_type`); mixed KBD/SIN halves are not supported.

    Layout per frame type (sample ranges are inclusive):
      - "OLS": the long window itself.
      - "LSS": [0..1023]    rising half of the long window
               [1024..1471] ones (448 samples)
               [1472..1599] falling half of the short window (128 samples)
               [1600..2047] zeros (448 samples)
      - "LPS": [0..447]     zeros (448 samples)
               [448..575]   rising half of the short window (128 samples)
               [576..1023]  ones (448 samples)
               [1024..2047] falling half of the long window

    Parameters
    ----------
    frame_type : FrameType
        One of "OLS", "LSS", "LPS".
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        float64 array of shape (2048,). For "OLS" this is the array returned
        by _long_window() (no extra copy is made).

    Raises
    ------
    ValueError
        If frame_type is not one of the three long types (an invalid
        win_type is rejected earlier by the prototype-window helpers).
    """
    long_win = _long_window(win_type)    # (2048,)
    short_win = _short_window(win_type)  # (256,)

    if frame_type == "OLS":
        return long_win

    flat = np.ones(448, dtype=np.float64)
    silent = np.zeros(448, dtype=np.float64)

    if frame_type == "LSS":
        # Long rise, plateau, short fall, then silence.
        return np.concatenate((long_win[:1024], flat, short_win[128:], silent))

    if frame_type == "LPS":
        # Silence, short rise, plateau, then long fall.
        return np.concatenate((silent, short_win[:128], flat, long_win[1024:]))

    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _mdct(s: TimeSignal) -> MdctCoeffs:
|
||||||
|
"""
|
||||||
|
MDCT (direct form) as specified in the assignment.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
s : TimeSignal
|
||||||
|
Windowed time samples, 1-D array of length N (N = 2048 or 256).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
MdctCoeffs
|
||||||
|
MDCT coefficients, 1-D array of length N/2.
|
||||||
|
|
||||||
|
Definition
|
||||||
|
----------
|
||||||
|
X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
|
||||||
|
where n0 = (N/2 + 1)/2.
|
||||||
|
"""
|
||||||
|
s = np.asarray(s, dtype=np.float64).reshape(-1)
|
||||||
|
N = int(s.shape[0])
|
||||||
|
if N not in (2048, 256):
|
||||||
|
raise ValueError("MDCT input length must be 2048 or 256.")
|
||||||
|
|
||||||
|
n0 = (N / 2.0 + 1.0) / 2.0
|
||||||
|
n = np.arange(N, dtype=np.float64) + n0
|
||||||
|
k = np.arange(N // 2, dtype=np.float64) + 0.5
|
||||||
|
|
||||||
|
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2)
|
||||||
|
X = 2.0 * (s @ C) # (N/2,)
|
||||||
|
return X
|
||||||
|
|
||||||
|
|
||||||
|
def _imdct(X: MdctCoeffs) -> TimeSignal:
|
||||||
|
"""
|
||||||
|
IMDCT (direct form) as specified in the assignment.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
X : MdctCoeffs
|
||||||
|
MDCT coefficients, 1-D array of length K (K = 1024 or 128).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
TimeSignal
|
||||||
|
Reconstructed time samples, 1-D array of length N = 2K.
|
||||||
|
|
||||||
|
Definition
|
||||||
|
----------
|
||||||
|
s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
|
||||||
|
where n0 = (N/2 + 1)/2.
|
||||||
|
"""
|
||||||
|
X = np.asarray(X, dtype=np.float64).reshape(-1)
|
||||||
|
K = int(X.shape[0])
|
||||||
|
if K not in (1024, 128):
|
||||||
|
raise ValueError("IMDCT input length must be 1024 or 128.")
|
||||||
|
|
||||||
|
N = 2 * K
|
||||||
|
n0 = (N / 2.0 + 1.0) / 2.0
|
||||||
|
|
||||||
|
n = np.arange(N, dtype=np.float64) + n0
|
||||||
|
k = np.arange(K, dtype=np.float64) + 0.5
|
||||||
|
|
||||||
|
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K)
|
||||||
|
s = (2.0 / N) * (C @ X) # (N,)
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF:
    """
    Analyse one channel of an ESH frame with eight short MDCTs.

    Eight 256-sample segments are taken with a hop of 128 samples, starting
    at sample 448 (so they cover samples 448..1599). Each segment is
    multiplied by the short window and transformed.

    Parameters
    ----------
    x_ch : FrameChannelT
        Time-domain samples of one channel (expected shape: (2048,)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelF
        float64 array of shape (128, 8); column j holds the 128 MDCT
        coefficients of the j-th short segment.
    """
    short_win = _short_window(win_type)  # (256,)
    spectra = np.empty((128, 8), dtype=np.float64)

    # Segment offsets: 448, 576, ..., 1344.
    for block, offset in enumerate(range(448, 448 + 8 * 128, 128)):
        windowed = x_ch[offset:offset + 256] * short_win
        spectra[:, block] = _mdct(windowed)

    return spectra
|
||||||
|
|
||||||
|
|
||||||
|
def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
|
||||||
|
"""
|
||||||
|
Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
frame_F : FrameF
|
||||||
|
Packed ESH spectrum (expected shape: (128, 16)).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
left : FrameChannelF
|
||||||
|
Left channel spectrum, shape (128, 8).
|
||||||
|
right : FrameChannelF
|
||||||
|
Right channel spectrum, shape (128, 8).
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
Inverse mapping of the packing used in aac_filter_bank():
|
||||||
|
packed[:, 2*j] = left[:, j]
|
||||||
|
packed[:, 2*j+1] = right[:, j]
|
||||||
|
"""
|
||||||
|
if frame_F.shape != (128, 16):
|
||||||
|
raise ValueError("ESH frame_F must have shape (128, 16).")
|
||||||
|
|
||||||
|
left = np.empty((128, 8), dtype=np.float64)
|
||||||
|
right = np.empty((128, 8), dtype=np.float64)
|
||||||
|
for j in range(8):
|
||||||
|
left[:, j] = frame_F[:, 2 * j + 0]
|
||||||
|
right[:, j] = frame_F[:, 2 * j + 1]
|
||||||
|
return left, right
|
||||||
|
|
||||||
|
|
||||||
|
def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT:
    """
    Synthesise one channel of an ESH frame from its eight short spectra.

    Every column is inverse-transformed to 256 samples, multiplied by the
    short window and accumulated into a 2048-sample buffer at offset
    448 + 128*j, so neighbouring short blocks overlap by 128 samples.

    Parameters
    ----------
    X_esh : FrameChannelF
        MDCT coefficients of the 8 short blocks; must have shape (128, 8).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelT
        float64 array of shape (2048,). The short blocks are already
        overlap-added with each other; samples outside 448..1599 stay zero.
        Frame-level overlap-add is left to the caller.

    Raises
    ------
    ValueError
        If X_esh does not have shape (128, 8).
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")

    short_win = _short_window(win_type)  # (256,)
    frame = np.zeros(2048, dtype=np.float64)

    # Block offsets: 448, 576, ..., 1344.
    for block, offset in enumerate(range(448, 448 + 8 * 128, 128)):
        frame[offset:offset + 256] += _imdct(X_esh[:, block]) * short_win

    return frame
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public Function prototypes (Level 1)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank analysis stage: window a stereo frame and apply the MDCT.

    Parameters
    ----------
    frame_T : FrameT
        Stereo time-domain frame; must have shape (2048, 2).
    frame_type : FrameType
        Type of the frame being encoded ("OLS" | "LSS" | "ESH" | "LPS").
    win_type : WinType
        Window family ("KBD" or "SIN") used for this frame.

    Returns
    -------
    frame_F : FrameF
        MDCT coefficients (float64):
        - "OLS" / "LSS" / "LPS": shape (1024, 2), one column per channel.
        - "ESH": shape (128, 16). The 8 short subframes are stored in
          order, each contributing a left and a right column:
          [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R].

    Raises
    ------
    ValueError
        If frame_T has the wrong shape or frame_type is not recognised.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    left_t: FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
    right_t: FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type == "ESH":
        left_f = _filter_bank_esh_channel(left_t, win_type)    # (128, 8)
        right_f = _filter_bank_esh_channel(right_t, win_type)  # (128, 8)

        # Interleave: even columns <- left subframes, odd columns <- right subframes.
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = left_f
        packed[:, 1::2] = right_f
        return packed

    if frame_type in ("OLS", "LSS", "LPS"):
        window = _window_sequence(frame_type, win_type)  # (2048,)

        spectrum = np.empty((1024, 2), dtype=np.float64)
        spectrum[:, 0] = _mdct(left_t * window)
        spectrum[:, 1] = _mdct(right_t * window)
        return spectrum

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank stage: IMDCT followed by synthesis windowing.

    Parameters
    ----------
    frame_F : FrameF
        MDCT coefficients in the layout produced by aac_filter_bank():
        (1024, 2) for "OLS" / "LSS" / "LPS", (128, 16) for "ESH".
    frame_type : FrameType
        Frame type ("OLS" | "LSS" | "ESH" | "LPS").
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Stereo time-domain frame of shape (2048, 2), float64. The frame
        still has to be overlap-added with its neighbours by the caller.

    Raises
    ------
    ValueError
        If frame_F has the wrong shape for the given frame_type, or if
        frame_type is not recognised.
    """
    if frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")

        window = _window_sequence(frame_type, win_type)
        left_t = _imdct(frame_F[:, 0]) * window
        right_t = _imdct(frame_F[:, 1]) * window
    elif frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")

        left_f, right_f = _unpack_esh(frame_F)
        left_t = _i_filter_bank_esh_channel(left_f, win_type)
        right_t = _i_filter_bank_esh_channel(right_f, win_type)
    else:
        raise ValueError(f"Invalid frame_type: {frame_type!r}")

    # Both branches end with two (2048,) channel signals; pack them as columns.
    frame = np.empty((2048, 2), dtype=np.float64)
    frame[:, 0] = left_t
    frame[:, 1] = right_t
    return frame
|
||||||
217
source/level_1/core/aac_ssc.py
Normal file
217
source/level_1/core/aac_ssc.py
Normal file
@ -0,0 +1,217 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Sequence Segmentation Control module
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Sequence Segmentation Control module (SSC).
|
||||||
|
# Selects and returns the frame type based on input parameters.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Dict, Tuple
|
||||||
|
from core.aac_types import FrameType, FrameT, FrameChannelT
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Private helpers for SSC
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# See Table 1 in mm-2025-hw-v0.1.pdf
#
# Lookup table that turns two per-channel frame type decisions into the single
# frame type shared by both channels.
#   key   : (left-channel decision, right-channel decision)
#   value : merged frame type
# All 16 combinations of the four frame types are listed. The entries are
# symmetric in the two channels, and any pair containing "ESH" maps to "ESH",
# as does the mixed pair "LSS"/"LPS".
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
|
||||||
|
"""
|
||||||
|
Detect whether the *next* frame (single channel) implies an attack, i.e. ESH
|
||||||
|
according to the assignment's criterion.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
next_frame_channel : FrameChannelT
|
||||||
|
One channel of next_frame_T (expected shape: (2048,)).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
True if an attack is detected (=> next frame predicted ESH), else False.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The criterion is implemented as described in the spec:
|
||||||
|
|
||||||
|
1) Apply the high-pass filter:
|
||||||
|
H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
|
||||||
|
implemented in the time domain as:
|
||||||
|
y[n] = x[n] - x[n-1] + 0.5*y[n-1]
|
||||||
|
|
||||||
|
2) Split y into 16 segments of length 128 and compute segment energies s[l].
|
||||||
|
|
||||||
|
3) Compute the ratio:
|
||||||
|
ds[l] = s[l] / s[l-1]
|
||||||
|
|
||||||
|
4) An attack exists if there exists l in {1..7} such that:
|
||||||
|
s[l] > 1e-3 and ds[l] > 10
|
||||||
|
"""
|
||||||
|
# Local alias; expected to be a 1-D array of length 2048.
|
||||||
|
x = next_frame_channel
|
||||||
|
|
||||||
|
# High-pass filter reference implementation (scalar recurrence).
|
||||||
|
y = np.zeros_like(x)
|
||||||
|
prev_x = 0.0
|
||||||
|
prev_y = 0.0
|
||||||
|
for n in range(x.shape[0]):
|
||||||
|
xn = float(x[n])
|
||||||
|
yn = (xn - prev_x) + 0.5 * prev_y
|
||||||
|
y[n] = yn
|
||||||
|
prev_x = xn
|
||||||
|
prev_y = yn
|
||||||
|
|
||||||
|
# Segment energies over 16 blocks of 128 samples.
|
||||||
|
s = np.empty(16, dtype=np.float64)
|
||||||
|
for l in range(16):
|
||||||
|
a = l * 128
|
||||||
|
b = (l + 1) * 128
|
||||||
|
seg = y[a:b]
|
||||||
|
s[l] = float(np.sum(seg * seg))
|
||||||
|
|
||||||
|
# ds[l] for l>=1. For l=0 not defined, keep 0.
|
||||||
|
ds = np.zeros(16, dtype=np.float64)
|
||||||
|
eps = 1e-12 # Avoid division by zero without materially changing the logic.
|
||||||
|
for l in range(1, 16):
|
||||||
|
ds[l] = s[l] / max(s[l - 1], eps)
|
||||||
|
|
||||||
|
# Spec: check l in {1..7}.
|
||||||
|
for l in range(1, 8):
|
||||||
|
if (s[l] > 1e-3) and (ds[l] > 10.0):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
|
||||||
|
"""
|
||||||
|
Decide the current frame type for a single channel based on the previous
|
||||||
|
frame type and whether the next frame is predicted to be ESH.
|
||||||
|
|
||||||
|
Rules (spec):
|
||||||
|
|
||||||
|
- If prev is "LSS" => current is "ESH"
|
||||||
|
- If prev is "LPS" => current is "OLS"
|
||||||
|
- If prev is "OLS" => current is "LSS" if attack else "OLS"
|
||||||
|
- If prev is "ESH" => current is "ESH" if attack else "LPS"
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
prev_frame_type : FrameType
|
||||||
|
Previous frame type (one of "OLS", "LSS", "ESH", "LPS").
|
||||||
|
attack : bool
|
||||||
|
True if the next frame is predicted ESH for this channel.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FrameType
|
||||||
|
The per-channel decision for the current frame.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if prev_frame_type == "LSS":
|
||||||
|
return "ESH"
|
||||||
|
if prev_frame_type == "LPS":
|
||||||
|
return "OLS"
|
||||||
|
if prev_frame_type == "OLS":
|
||||||
|
return "LSS" if attack else "OLS"
|
||||||
|
if prev_frame_type == "ESH":
|
||||||
|
return "ESH" if attack else "LPS"
|
||||||
|
|
||||||
|
raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Combine the two per-channel decisions into one common frame type.

    The result is looked up in STEREO_MERGE_TABLE with the key
    (left decision, right decision).

    Parameters
    ----------
    ft_l : FrameType
        Frame type decision for the left channel.
    ft_r : FrameType
        Frame type decision for the right channel.

    Returns
    -------
    FrameType
        The merged frame type used for both channels.

    Raises
    ------
    ValueError
        If the pair is not present in the table (chained to the KeyError).
    """
    pair = (ft_l, ft_r)
    try:
        merged = STEREO_MERGE_TABLE[pair]
    except KeyError as e:
        raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}") from e
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public Function prototypes (Level 1)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC): choose the type of frame i.

    The current frame is only shape-checked; the decision itself is driven
    by attack detection on the next frame and by the previous frame type.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i; must have shape (2048, 2).
    next_frame_T : FrameT
        Next time-domain frame (i+1), analysed per channel for attacks;
        must have shape (2048, 2).
    prev_frame_type : FrameType
        Frame type chosen for the previous frame (i-1).

    Returns
    -------
    FrameType
        One of: "OLS", "LSS", "ESH", "LPS".

    Raises
    ------
    ValueError
        If either frame has the wrong shape, or if prev_frame_type is not
        a valid frame type code.
    """
    expected_shape = (2048, 2)
    if frame_T.shape != expected_shape:
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != expected_shape:
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Attack detection runs independently on each channel of the next frame.
    attacks = [_detect_attack(next_frame_T[:, ch]) for ch in (0, 1)]

    # Both channels share the same previous frame type.
    left_type, right_type = (
        _decide_frame_type(prev_frame_type, attacked) for attacked in attacks
    )

    # Reduce the two per-channel decisions to a single common type.
    return _stereo_merge(left_type, right_type)
|
||||||
193
source/level_1/core/aac_types.py
Normal file
193
source/level_1/core/aac_types.py
Normal file
@ -0,0 +1,193 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Public Type Aliases
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# This module implements Public Type aliases
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import List, Literal, TypeAlias, TypedDict
|
||||||
|
import numpy as np
|
||||||
|
from numpy.typing import NDArray
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Code enums (for readability; not intended to enforce shapes/lengths)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Closed sets of string codes expressed as typing.Literal aliases. They only
# inform static type checkers; nothing here validates values at runtime.
# The bare string literal after each alias is a documentation-only
# expression statement (it has no runtime effect).

# Frame type codes used for "frame_type" fields and arguments.
FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"]
"""
Frame type codes (AAC):
- "OLS": ONLY_LONG_SEQUENCE
- "LSS": LONG_START_SEQUENCE
- "ESH": EIGHT_SHORT_SEQUENCE
- "LPS": LONG_STOP_SEQUENCE
"""

# Window family codes used for "win_type" fields and arguments.
WinType: TypeAlias = Literal["KBD", "SIN"]
"""
Window type codes (AAC):
- "KBD": Kaiser-Bessel-Derived
- "SIN": sinusoid
"""

# Keys under which the per-channel payloads are stored ("chl" = left,
# "chr" = right).
ChannelKey: TypeAlias = Literal["chl", "chr"]
"""Channel dictionary keys used in Level 1 payloads."""
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Array “semantic” aliases
|
||||||
|
#
|
||||||
|
# Goal: communicate meaning (time/frequency/window, stereo/channel) without
|
||||||
|
# forcing strict shapes in the type system.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Every alias in this group resolves to the same NDArray[np.float64] type, so
# a static type checker treats them as interchangeable. The distinct names
# exist purely to document the intended role and shape of each array.
# The bare string literal after each alias is a documentation-only
# expression statement (it has no runtime effect).

# Base alias: any float64 NumPy array.
FloatArray: TypeAlias = NDArray[np.float64]
"""
Generic float64 NumPy array.

Note:
- We standardize internal numeric computations to float64 for stability and
reproducibility. External I/O can still be float32, but we convert at the
boundaries.
"""

# 1-D analysis/synthesis window.
Window: TypeAlias = FloatArray
"""
Time-domain window (weighting sequence), 1-D.

Typical lengths in this assignment:
- Long: 2048
- Short: 256
- Window sequences for LSS/LPS are also 2048

Expected shape: (N,)
dtype: float64
"""

# 1-D block of time samples (MDCT input / IMDCT output).
TimeSignal: TypeAlias = FloatArray
"""
Time-domain signal samples, typically 1-D.

Examples:
- Windowed MDCT input: shape (N,)
- IMDCT output: shape (N,)

dtype: float64
"""

# Whole stereo stream, samples along axis 0 and channels along axis 1.
StereoSignal: TypeAlias = FloatArray
"""
Time-domain stereo signal stream.

Expected (typical) shape: (N, 2)
- axis 0: time samples
- axis 1: channels [L, R]

dtype: float64
"""

# 1-D vector of MDCT coefficients for a single transform.
MdctCoeffs: TypeAlias = FloatArray
"""
MDCT coefficient vector, typically 1-D.

Examples:
- Long: shape (1024,)
- Short: shape (128,)

dtype: float64
"""


# One stereo frame in the time domain.
FrameT: TypeAlias = FloatArray
"""
Time-domain frame (stereo), as used by the filterbank input/output.

Expected (typical) shape for stereo: (2048, 2)
- axis 0: time samples
- axis 1: channels [L, R]

dtype: float64
"""

# One channel of a time-domain frame.
FrameChannelT: TypeAlias = FloatArray
"""
Time-domain single-channel frame.

Expected (typical) shape: (2048,)

dtype: float64
"""

# One stereo frame in the frequency domain (layout depends on frame type).
FrameF: TypeAlias = FloatArray
"""
Frequency-domain frame (MDCT coefficients), stereo container.

Typical shapes (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: (1024, 2)
- If frame_type == "ESH": (128, 16)

Rationale for ESH (128, 16):
- 8 short subframes per channel => 8 * 2 = 16 columns total
- Each short subframe per stereo is (128, 2), flattened into columns
in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R]

dtype: float64
"""

# One channel of a frequency-domain frame.
# NOTE(review): the encoder module header describes the stored non-ESH
# per-channel spectrum as (1024, 1), while the text below says (1024,) --
# confirm which shape is canonical.
FrameChannelF: TypeAlias = FloatArray
"""
Frequency-domain single-channel frame (MDCT coefficients).

Typical shapes (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: (1024,)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)

dtype: float64
"""
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 AAC sequence payload types
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AACChannelFrameF(TypedDict):
    """
    Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).

    A TypedDict only describes the expected keys for static type checkers;
    instances are plain dicts and nothing is validated at runtime.

    Keys
    ----
    frame_F:
        The MDCT coefficients for ONE channel.
        Typical shapes:
        - ESH: (128, 8)  (8 short subframes)
        - else: (1024, )

    Notes
    -----
    NOTE(review): the encoder module header describes the non-ESH case as
    (1024, 1) rather than (1024,) -- confirm which shape is actually stored.
    """
    # MDCT coefficients of a single channel for one frame.
    frame_F: FrameChannelF
|
||||||
|
|
||||||
|
|
||||||
|
class AACSeq1Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_1 (Level 1).

    Keys
    ----
    frame_type:
        Frame type code of this frame ("OLS", "LSS", "ESH" or "LPS").
    win_type:
        Window family code of this frame ("KBD" or "SIN").
    chl:
        Payload of the left channel (holds its "frame_F" spectrum).
    chr:
        Payload of the right channel (holds its "frame_F" spectrum).
    """
    # Frame type shared by both channels of this frame.
    frame_type: FrameType
    # Window family used for this frame.
    win_type: WinType
    # Left-channel payload.
    chl: AACChannelFrameF
    # Right-channel payload.
    chr: AACChannelFrameF
|
||||||
|
|
||||||
|
|
||||||
|
# The complete Level 1 coded sequence: one AACSeq1Frame dict per frame, in
# frame order. The decoder derives the frame count K from the list length.
AACSeq1: TypeAlias = List[AACSeq1Frame]
"""
AAC sequence for Level 1:
List of length K (K = number of frames).

Each element is a dict with keys:
- "frame_type", "win_type", "chl", "chr"
"""
|
||||||
234
source/level_1/core/tests/test_SSC.py
Normal file
234
source/level_1/core/tests/test_SSC.py
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Sequence Segmentation Control Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for Sequence Segmentation Control module (SSC).
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from core.aac_ssc import aac_SSC
|
||||||
|
from core.aac_types import FrameT
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helper fixtures for SSC
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _next_frame_no_attack() -> FrameT:
|
||||||
|
"""
|
||||||
|
Build a next_frame_T that must NOT trigger ESH detection.
|
||||||
|
|
||||||
|
Uses exact zeros so all segment energies are zero and the condition
|
||||||
|
s[l] > 1e-3 cannot hold for any l.
|
||||||
|
"""
|
||||||
|
return np.zeros((2048, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
|
||||||
|
def _next_frame_strong_attack(
|
||||||
|
*,
|
||||||
|
attack_left: bool,
|
||||||
|
attack_right: bool,
|
||||||
|
segment_l: int = 4,
|
||||||
|
baseline: float = 1e-6,
|
||||||
|
burst_amp: float = 1.0,
|
||||||
|
) -> FrameT:
|
||||||
|
"""
|
||||||
|
Build a next_frame_T (2048x2) that should trigger ESH detection on selected channels.
|
||||||
|
|
||||||
|
Attack criterion (spec):
|
||||||
|
Attack exists if there exists l in {1..7} such that:
|
||||||
|
s[l] > 1e-3 and ds[l] > 10,
|
||||||
|
where s[l] is the energy of segment l (length 128) after high-pass filtering,
|
||||||
|
and ds[l] = s[l] / s[l-1].
|
||||||
|
|
||||||
|
Construction:
|
||||||
|
- A small baseline is added everywhere to avoid relying on the epsilon guard in ds,
|
||||||
|
keeping ds behavior stable/reproducible.
|
||||||
|
- A strong burst is added inside a chosen segment l in 1..7.
|
||||||
|
"""
|
||||||
|
if not (1 <= segment_l <= 7):
|
||||||
|
raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")
|
||||||
|
|
||||||
|
x = np.full((2048, 2), baseline, dtype=np.float64)
|
||||||
|
|
||||||
|
a = segment_l * 128
|
||||||
|
b = (segment_l + 1) * 128
|
||||||
|
|
||||||
|
if attack_left:
|
||||||
|
x[a:b, 0] += burst_amp
|
||||||
|
if attack_right:
|
||||||
|
x[a:b, 1] += burst_amp
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def _next_frame_below_s_threshold(
|
||||||
|
*,
|
||||||
|
left: bool,
|
||||||
|
right: bool,
|
||||||
|
segment_l: int = 4,
|
||||||
|
impulse_amp: float = 0.01,
|
||||||
|
) -> FrameT:
|
||||||
|
"""
|
||||||
|
Construct a next_frame_T where s[l] is below 1e-3, so ESH must NOT be triggered,
|
||||||
|
even if the ratio ds[l] could be large.
|
||||||
|
|
||||||
|
We place a single impulse of amplitude 'impulse_amp' inside one segment.
|
||||||
|
Approx. segment energy: s[l] ~= impulse_amp^2.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
impulse_amp = 0.01 => s[l] ~= 1e-4 < 1e-3
|
||||||
|
"""
|
||||||
|
if not (1 <= segment_l <= 7):
|
||||||
|
raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")
|
||||||
|
|
||||||
|
x = np.zeros((2048, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
idx = segment_l * 128 + 10 # inside segment l
|
||||||
|
if left:
|
||||||
|
x[idx, 0] = impulse_amp
|
||||||
|
if right:
|
||||||
|
x[idx, 1] = impulse_amp
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# 1) Fixed/mandatory cases (prev frame type forces current type)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
    """
    Mandatory transitions dictated by the previous frame type (spec):
      - prev == LSS  => current MUST be ESH
      - prev == LPS  => current MUST be OLS
    Both hold regardless of the attack detection on frame (i+1); a strong
    attack on both channels is supplied to show it is ignored.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacked_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    after_lss = aac_SSC(current_frame, attacked_next, "LSS")
    assert after_lss == "ESH"

    after_lps = aac_SSC(current_frame, attacked_next, "LPS")
    assert after_lps == "OLS"
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# 2) Cases requiring next-frame ESH prediction (attack computation)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_prev_ols_next_not_esh_returns_ols() -> None:
    """
    With prev=OLS the current frame becomes:
      - LSS if and only if frame (i+1) is predicted ESH
      - OLS otherwise
    The next frame here carries no attack, so OLS is expected.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    quiet_next = _next_frame_no_attack()

    decision = aac_SSC(current_frame, quiet_next, "OLS")
    assert decision == "OLS"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
    """
    prev=OLS while both channels of the next frame are predicted ESH:
      per-channel decisions: LSS, LSS
      merged decision:       LSS
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacked_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    decision = aac_SSC(current_frame, attacked_next, "OLS")
    assert decision == "LSS"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
    """
    prev=OLS with an attack on a single channel of the next frame:
      - the attacked channel predicts ESH      => LSS
      - the other channel predicts not ESH     => OLS
    Merge table: OLS + LSS => LSS, whichever side carries the attack.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # First the attack on the left channel only, then on the right only.
    for on_left, on_right in ((True, False), (False, True)):
        attacked_next = _next_frame_strong_attack(attack_left=on_left, attack_right=on_right)
        decision = aac_SSC(current_frame, attacked_next, "OLS")
        assert decision == "LSS"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
    """
    prev=ESH while both channels of the next frame are predicted ESH:
      per-channel decisions: ESH, ESH
      merged decision:       ESH
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    attacked_next = _next_frame_strong_attack(attack_left=True, attack_right=True)

    decision = aac_SSC(current_frame, attacked_next, "ESH")
    assert decision == "ESH"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
    """
    prev=ESH while neither channel of the next frame is predicted ESH:
      per-channel decisions: LPS, LPS
      merged decision:       LPS
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    quiet_next = _next_frame_no_attack()

    decision = aac_SSC(current_frame, quiet_next, "ESH")
    assert decision == "LPS"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
    """
    prev=ESH with an attack on a single channel of the next frame:
      - the attacked channel predicts ESH      => ESH
      - the other channel predicts not ESH     => LPS
    Merge table: ESH + LPS => ESH, whichever side carries the attack.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # First the attack on the left channel only, then on the right only.
    for on_left, on_right in ((True, False), (False, True)):
        attacked_next = _next_frame_strong_attack(attack_left=on_left, attack_right=on_right)
        decision = aac_SSC(current_frame, attacked_next, "ESH")
        assert decision == "ESH"
|
||||||
|
|
||||||
|
|
||||||
|
def test_threshold_s_must_exceed_1e_3() -> None:
    """
    Spec: the next frame is predicted ESH only when, for some l in 1..7,
      s[l] > 1e-3 AND ds[l] > 10.

    This test covers the necessity of the s[l] threshold:
      - the next frame holds a single impulse of amplitude 0.01 per
        channel, i.e. s[l] ~= 1e-4 < 1e-3;
      - it must not be classified as ESH, so prev=OLS yields OLS.
    """
    current_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    weak_next = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01)

    decision = aac_SSC(current_frame, weak_next, "OLS")
    assert decision == "OLS"
|
||||||
156
source/level_1/core/tests/test_aac_coder_decoder.py
Normal file
156
source/level_1/core/tests/test_aac_coder_decoder.py
Normal file
@ -0,0 +1,156 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - AAC Coder/Decoder Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for AAC Coder/Decoder module.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from core.aac_coder import aac_coder_1
|
||||||
|
from core.aac_decoder import aac_decoder_1
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# Helper "fixtures" for aac_coder_1 / aac_decoder_1
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||||
|
"""
|
||||||
|
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x_ref : StereoSignal
|
||||||
|
Reference signal, shape (N, 2) typical.
|
||||||
|
x_hat : StereoSignal
|
||||||
|
Reconstructed signal, shape (M, 2) typical.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
float
|
||||||
|
SNR in dB.
|
||||||
|
- Returns +inf if noise power is zero.
|
||||||
|
- Returns -inf if signal power is zero.
|
||||||
|
"""
|
||||||
|
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||||
|
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||||
|
|
||||||
|
# Be conservative: align lengths and common channels.
|
||||||
|
if x_ref.ndim == 1:
|
||||||
|
x_ref = x_ref.reshape(-1, 1)
|
||||||
|
if x_hat.ndim == 1:
|
||||||
|
x_hat = x_hat.reshape(-1, 1)
|
||||||
|
|
||||||
|
n = min(x_ref.shape[0], x_hat.shape[0])
|
||||||
|
c = min(x_ref.shape[1], x_hat.shape[1])
|
||||||
|
|
||||||
|
x_ref = x_ref[:n, :c]
|
||||||
|
x_hat = x_hat[:n, :c]
|
||||||
|
|
||||||
|
err = x_ref - x_hat
|
||||||
|
ps = float(np.sum(x_ref * x_ref))
|
||||||
|
pn = float(np.sum(err * err))
|
||||||
|
|
||||||
|
if pn <= 0.0:
|
||||||
|
return float("inf")
|
||||||
|
if ps <= 0.0:
|
||||||
|
return float("-inf")
|
||||||
|
|
||||||
|
return float(10.0 * np.log10(ps / pn))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def tmp_stereo_wav(tmp_path: Path) -> Path:
    """
    Create a temporary 48 kHz stereo WAV with random samples.

    The samples are zero-mean Gaussian noise from a fixed seed, scaled to
    a standard deviation of 0.1 and bounded to [-1.0, 1.0]. Unit-variance
    noise would put a large share of the samples outside the nominal
    [-1.0, 1.0] range of floating-point audio, so the stored file would no
    longer contain the generated signal.

    Parameters
    ----------
    tmp_path : Path
        pytest-provided temporary directory.

    Returns
    -------
    Path
        Path of the written file ("in.wav" inside tmp_path).
    """
    rng = np.random.default_rng(123)
    fs = 48000

    # ~1 second of audio (kept small for test speed).
    n = fs
    x: StereoSignal = 0.1 * rng.normal(size=(n, 2)).astype(np.float64)

    # Safety bound: keep every sample inside the valid float audio range.
    x = np.clip(x, -1.0, 1.0)

    wav_path = tmp_path / "in.wav"
    sf.write(str(wav_path), x, fs)
    return wav_path
|
||||||
|
|
||||||
|
|
||||||
|
def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
    """
    Module-level contract test:
    the sequence returned by aac_coder_1 must be a non-empty list of
    dictionaries with the expected keys, codes and per-channel shapes.
    """
    encoded: AACSeq1 = aac_coder_1(tmp_stereo_wav)

    assert isinstance(encoded, list)
    assert len(encoded) > 0

    for frame in encoded:
        assert isinstance(frame, dict)

        # Required keys
        for key in ("frame_type", "win_type", "chl", "chr"):
            assert key in frame

        frame_type = frame["frame_type"]
        win_type = frame["win_type"]

        assert frame_type in ("OLS", "LSS", "ESH", "LPS")
        assert win_type in ("SIN", "KBD")

        # Both channel payloads are dictionaries holding "frame_F".
        for channel in ("chl", "chr"):
            assert isinstance(frame[channel], dict)
        for channel in ("chl", "chr"):
            assert "frame_F" in frame[channel]

        left_spectrum = np.asarray(frame["chl"]["frame_F"], dtype=np.float64)
        right_spectrum = np.asarray(frame["chr"]["frame_F"], dtype=np.float64)

        # ESH stores 8 short spectra of 128 bins, the rest one of 1024.
        expected_shape = (128, 8) if frame_type == "ESH" else (1024, 1)
        assert left_spectrum.shape == expected_shape
        assert right_spectrum.shape == expected_shape
|
||||||
|
|
||||||
|
|
||||||
|
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
    """
    End-to-end test:
    encode the input WAV, decode it again and require a very high SNR
    (only numerical noise is expected).

    The threshold is intentionally loose to avoid fragility across platforms/BLAS.
    """
    reference, fs_in = sf.read(str(tmp_stereo_wav), always_2d=True)
    reference = np.asarray(reference, dtype=np.float64)
    assert int(fs_in) == 48000

    decoded_path = tmp_path / "out.wav"

    encoded = aac_coder_1(tmp_stereo_wav)
    decoded: StereoSignal = aac_decoder_1(encoded, decoded_path)

    # Basic sanity: output file exists and is readable
    assert decoded_path.exists()
    decoded_from_file, fs_out = sf.read(str(decoded_path), always_2d=True)
    assert int(fs_out) == 48000

    # SNR against returned array (file should match closely, but we do not require it here).
    snr = _snr_db(reference, decoded)
    assert snr > 80.0
|
||||||
269
source/level_1/core/tests/test_filterbank.py
Normal file
269
source/level_1/core/tests/test_filterbank.py
Normal file
@ -0,0 +1,269 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Filterbank Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for Filterbank module.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Sequence
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
# Helper fixtures for filterbank
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal:
    """
    Run analysis + synthesis on every frame and overlap-add the results.

    Frame i covers samples [i*1024, i*1024 + 2048) of x and is processed
    with frame_types[i].

    Parameters
    ----------
    x : StereoSignal
        Input stereo stream, expected shape (N, 2).
    frame_types : Sequence[FrameType]
        One frame type per frame (length K).
    win_type : WinType
        Window type ("SIN" or "KBD").

    Returns
    -------
    StereoSignal
        Overlap-added reconstruction, a float64 array shaped like x.
    """
    hop_size = 1024
    frame_len = 2048

    accumulated: StereoSignal = np.zeros_like(x, dtype=np.float64)

    for index, frame_type in enumerate(frame_types):
        first = index * hop_size
        last = first + frame_len

        time_frame: FrameT = x[first:last, :]
        spectrum: FrameF = aac_filter_bank(time_frame, frame_type, win_type)
        resynth: FrameT = aac_i_filter_bank(spectrum, frame_type, win_type)

        # Overlap-add into the output stream.
        accumulated[first:last, :] += resynth

    return accumulated
|
||||||
|
|
||||||
|
|
||||||
|
def _snr_db(x: StereoSignal, y: StereoSignal) -> float:
|
||||||
|
"""
|
||||||
|
Compute SNR in dB over all samples/channels.
|
||||||
|
"""
|
||||||
|
err = x - y
|
||||||
|
ps = float(np.sum(x * x))
|
||||||
|
pn = float(np.sum(err * err))
|
||||||
|
if pn <= 0.0:
|
||||||
|
return float("inf")
|
||||||
|
if ps <= 0.0:
|
||||||
|
return float("-inf")
|
||||||
|
return 10.0 * float(np.log10(ps / pn))
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Forward filterbank tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None:
    """
    Contract test: aac_filter_bank yields a (1024, 2) spectrum for OLS/LSS/LPS.
    """
    silent_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    spectrum = aac_filter_bank(silent_frame, frame_type, win_type)
    assert spectrum.shape == (1024, 2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: aac_filter_bank yields a (128, 16) spectrum for ESH.
    """
    silent_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    spectrum = aac_filter_bank(silent_frame, "ESH", win_type)
    assert spectrum.shape == (128, 16)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None:
    """
    Behavior test: channels are processed independently for OLS
    (taken as the representative long sequence).

    Only the left channel carries (random) signal; the spectrum of the
    all-zero right channel must stay near zero.
    """
    rng = np.random.default_rng(0)
    stereo_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    stereo_frame[:, 0] = rng.normal(size=2048)

    spectrum = aac_filter_bank(stereo_frame, "OLS", win_type)

    right_peak = np.max(np.abs(spectrum[:, 1]))
    assert right_peak < 1e-9
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_esh(win_type: WinType) -> None:
    """
    Behavior test: channels are processed independently for ESH.

    Only the left channel carries (random) signal; every odd column of
    the output (the right-channel columns) must stay near zero.
    """
    rng = np.random.default_rng(1)
    stereo_frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    stereo_frame[:, 0] = rng.normal(size=2048)

    spectrum = aac_filter_bank(stereo_frame, "ESH", win_type)

    # Columns 1,3,5,...,15 belong to the right channel.
    right_peak = np.max(np.abs(spectrum[:, 1::2]))
    assert right_peak < 1e-9
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None:
    """
    Spec-driven behavior test:
    ESH works only on the central region [448, 1600), split into 8
    overlapping windows of length 256 with 50% overlap.

    Two frames that share the central region but differ outside of it
    must therefore produce the same output.
    """
    rng = np.random.default_rng(2)

    zero_edges: FrameT = np.zeros((2048, 2), dtype=np.float64)
    noisy_edges: FrameT = np.zeros((2048, 2), dtype=np.float64)

    # Identical content in the region ESH actually uses.
    shared_center = rng.normal(size=(1152, 2))
    zero_edges[448:1600, :] = shared_center
    noisy_edges[448:1600, :] = shared_center

    # Random content only outside the central region.
    noisy_edges[0:448, :] = rng.normal(size=(448, 2))
    noisy_edges[1600:2048, :] = rng.normal(size=(448, 2))

    spectrum_zero = aac_filter_bank(zero_edges, "ESH", win_type)
    spectrum_noisy = aac_filter_bank(noisy_edges, "ESH", win_type)

    # Use a tiny tolerance to avoid flaky failures due to floating-point minutiae.
    np.testing.assert_allclose(spectrum_zero, spectrum_noisy, rtol=0.0, atol=1e-12)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_output_is_finite(win_type: WinType) -> None:
    """
    Sanity test: the analysis output holds no NaN/inf for any frame type.
    """
    rng = np.random.default_rng(3)
    noise_frame: FrameT = rng.normal(size=(2048, 2)).astype(np.float64)

    for frame_type in ("OLS", "LSS", "ESH", "LPS"):
        spectrum = aac_filter_bank(noise_frame, frame_type, win_type)
        assert np.isfinite(spectrum).all()
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Reverse i_filterbank tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None:
    """
    Contract test: aac_i_filter_bank yields a (2048, 2) frame for OLS/LSS/LPS.
    """
    zero_spectrum: FrameF = np.zeros((1024, 2), dtype=np.float64)

    for frame_type in ("OLS", "LSS", "LPS"):
        time_frame = aac_i_filter_bank(zero_spectrum, frame_type, win_type)
        assert time_frame.shape == (2048, 2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: aac_i_filter_bank yields a (2048, 2) frame for ESH.
    """
    zero_spectrum: FrameF = np.zeros((128, 16), dtype=np.float64)
    time_frame = aac_i_filter_bank(zero_spectrum, "ESH", win_type)
    assert time_frame.shape == (2048, 2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None:
    """
    Sanity test: analysis followed by synthesis of one frame yields only
    finite samples, for every frame type.
    """
    rng = np.random.default_rng(0)
    noise_frame: FrameT = rng.normal(size=(2048, 2)).astype(np.float64)

    for frame_type in ("OLS", "LSS", "ESH", "LPS"):
        spectrum = aac_filter_bank(noise_frame, frame_type, win_type)
        resynth = aac_i_filter_bank(spectrum, frame_type, win_type)
        assert np.isfinite(resynth).all()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None:
    """
    Module-level test:
    an all-OLS analysis+synthesis chain with hop=1024 must reconstruct
    the steady-state region with high SNR.
    """
    rng = np.random.default_rng(1)

    num_frames = 6
    num_samples = 1024 * (num_frames + 1)
    signal: StereoSignal = rng.normal(size=(num_samples, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, ["OLS"] * num_frames, win_type)

    # Skip the first and last hop, which are covered by a single frame only.
    lo = 1024
    hi = num_samples - 1024
    snr = _snr_db(signal[lo:hi, :], rebuilt[lo:hi, :])
    assert snr > 50.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None:
    """
    Module-level test:
    an all-ESH analysis+synthesis chain with hop=1024 must reconstruct
    the steady-state region with high SNR.
    """
    rng = np.random.default_rng(2)

    num_frames = 6
    num_samples = 1024 * (num_frames + 1)
    signal: StereoSignal = rng.normal(size=(num_samples, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, ["ESH"] * num_frames, win_type)

    # Skip the first and last hop, which are covered by a single frame only.
    lo = 1024
    hi = num_samples - 1024
    snr = _snr_db(signal[lo:hi, :], rebuilt[lo:hi, :])
    assert snr > 45.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None:
    """
    Reconstruction over a frame-type sequence that follows the windowing logic:
      OLS -> LSS -> ESH -> LPS -> OLS -> OLS
    """
    rng = np.random.default_rng(3)

    sequence: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"]
    num_frames = len(sequence)
    num_samples = 1024 * (num_frames + 1)
    signal: StereoSignal = rng.normal(size=(num_samples, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, sequence, win_type)

    # Skip the first and last hop, which are covered by a single frame only.
    lo = 1024
    hi = num_samples - 1024
    snr = _snr_db(signal[lo:hi, :], rebuilt[lo:hi, :])
    assert snr > 40.0
|
||||||
117
source/level_1/core/tests/test_filterbank_internal.py
Normal file
117
source/level_1/core/tests/test_filterbank_internal.py
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Filterbank internal (mdct) Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for Filterbank internal MDCT/IMDCT functionality.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from core.aac_filterbank import _imdct, _mdct
|
||||||
|
from core.aac_types import FloatArray, TimeSignal, MdctCoeffs
|
||||||
|
|
||||||
|
|
||||||
|
def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None:
|
||||||
|
"""
|
||||||
|
Helper for consistent tolerances across tests.
|
||||||
|
"""
|
||||||
|
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
|
||||||
|
|
||||||
|
|
||||||
|
def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float:
|
||||||
|
"""
|
||||||
|
Estimate scalar gain g such that y ~= g*x in least-squares sense.
|
||||||
|
"""
|
||||||
|
denom = float(np.dot(x, x))
|
||||||
|
if denom == 0.0:
|
||||||
|
return 0.0
|
||||||
|
return float(np.dot(y, x) / denom)
|
||||||
|
|
||||||
|
|
||||||
|
tolerance = 1e-10
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None:
    """
    Consistency test in the coefficient domain:
      mdct(imdct(X)) ~= g * X

    With the chosen (non-orthonormal) scaling, g is expected to be close to 2.
    """
    rng = np.random.default_rng(0)
    num_coeffs = N // 2

    coeffs: MdctCoeffs = rng.normal(size=num_coeffs).astype(np.float64)
    time_signal: TimeSignal = _imdct(coeffs)
    coeffs_back: MdctCoeffs = _mdct(time_signal)

    gain = _estimate_gain(coeffs_back, coeffs)

    # The round trip is a pure scaling of the input coefficients...
    _assert_allclose(coeffs_back, gain * coeffs, rtol=tolerance, atol=tolerance)
    # ...and the scale factor is 2.
    _assert_allclose(
        np.array([gain], dtype=np.float64),
        np.array([2.0], dtype=np.float64),
        rtol=tolerance,
        atol=tolerance,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_linearity(N: int) -> None:
    """
    Linearity test for MDCT:
      mdct(a*x + b*y) == a*mdct(x) + b*mdct(y)
    """
    rng = np.random.default_rng(1)
    first: TimeSignal = rng.normal(size=N).astype(np.float64)
    second: TimeSignal = rng.normal(size=N).astype(np.float64)

    alpha = 0.37
    beta = -1.12

    transform_of_sum: MdctCoeffs = _mdct(alpha * first + beta * second)
    sum_of_transforms: MdctCoeffs = alpha * _mdct(first) + beta * _mdct(second)

    _assert_allclose(transform_of_sum, sum_of_transforms, rtol=tolerance, atol=tolerance)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("N", [256, 2048])
def test_imdct_linearity(N: int) -> None:
    """
    Linearity test for IMDCT:
      imdct(a*X + b*Y) == a*imdct(X) + b*imdct(Y)
    """
    rng = np.random.default_rng(2)
    num_coeffs = N // 2

    first: MdctCoeffs = rng.normal(size=num_coeffs).astype(np.float64)
    second: MdctCoeffs = rng.normal(size=num_coeffs).astype(np.float64)

    alpha = -0.5
    beta = 2.0

    transform_of_sum: TimeSignal = _imdct(alpha * first + beta * second)
    sum_of_transforms: TimeSignal = alpha * _imdct(first) + beta * _imdct(second)

    _assert_allclose(transform_of_sum, sum_of_transforms, rtol=tolerance, atol=tolerance)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_imdct_outputs_are_finite(N: int) -> None:
    """
    Sanity test: both transforms return only finite values on random inputs.
    """
    rng = np.random.default_rng(3)
    num_coeffs = N // 2

    time_input: TimeSignal = rng.normal(size=N).astype(np.float64)
    coeff_input: MdctCoeffs = rng.normal(size=num_coeffs).astype(np.float64)

    forward_out = _mdct(time_input)
    inverse_out = _imdct(coeff_input)

    assert np.isfinite(forward_out).all()
    assert np.isfinite(inverse_out).all()
|
||||||
@ -1,843 +1,186 @@
|
|||||||
#! /usr/bin/env python
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Level 1 Wrappers + Demo
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Level 1 wrapper module.
|
||||||
|
#
|
||||||
|
# This file provides:
|
||||||
|
# - Thin wrappers for Level 1 API functions (encode/decode) that delegate
|
||||||
|
# to the corresponding core implementations.
|
||||||
|
# - A demo function that runs end-to-end and computes SNR.
|
||||||
|
# - A small CLI entrypoint for convenience.
|
||||||
|
# ------------------------------------------------------------
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Tuple, List, Literal, TypedDict, Union
|
from typing import Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
from scipy.signal.windows import kaiser
|
|
||||||
|
|
||||||
# --------------------------------
|
from core.aac_types import AACSeq1, StereoSignal
|
||||||
# Public Type aliases (Level 1)
|
from core.aac_coder import aac_coder_1 as core_aac_coder_1
|
||||||
# --------------------------------
|
from core.aac_coder import aac_read_wav_stereo_48k
|
||||||
|
from core.aac_decoder import aac_decoder_1 as core_aac_decoder_1
|
||||||
|
|
||||||
FrameType = Literal["OLS", "LSS", "ESH", "LPS"]
|
|
||||||
"""
|
|
||||||
Frame type codes:
|
|
||||||
- "OLS": ONLY_LONG_SEQUENCE
|
|
||||||
- "LSS": LONG_START_SEQUENCE
|
|
||||||
- "ESH": EIGHT_SHORT_SEQUENCE
|
|
||||||
- "LPS": LONG_STOP_SEQUENCE
|
|
||||||
"""
|
|
||||||
|
|
||||||
WinType = Literal["KBD", "SIN"]
|
|
||||||
"""
|
|
||||||
Window type codes:
|
|
||||||
- "KBD": Kaiser-Bessel-Derived
|
|
||||||
- "SIN": sinusoid
|
|
||||||
"""
|
|
||||||
|
|
||||||
FrameT = np.ndarray
|
|
||||||
"""
|
|
||||||
Time-domain frame.
|
|
||||||
Expected shape: (2048, 2) for stereo (two channels).
|
|
||||||
dtype: float (e.g., float32/float64).
|
|
||||||
"""
|
|
||||||
|
|
||||||
FrameChannelT = np.ndarray
|
|
||||||
"""
|
|
||||||
Time-domain single channel frame.
|
|
||||||
Expected shape: (2048,).
|
|
||||||
dtype: float (e.g., float32/float64).
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
FrameF = np.ndarray
|
|
||||||
"""
|
|
||||||
Frequency-domain frame (MDCT coefficients).
|
|
||||||
As per spec (Level 1):
|
|
||||||
- If frame_type in {"OLS","LSS","LPS"}: shape (1024, 2)
|
|
||||||
- If frame_type == "ESH": shape (128, 16) where 8 subframes x 2 channels
|
|
||||||
are placed in columns according to the subframe order (i.e., each subframe is (128,2)).
|
|
||||||
"""
|
|
||||||
|
|
||||||
ChannelKey = Literal["chl", "chr"]
|
|
||||||
|
|
||||||
|
|
||||||
class AACChannelFrameF(TypedDict):
    """Channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1)."""

    # MDCT coefficients of one channel:
    #   - ESH:  shape (128, 8)   (one column per short window)
    #   - else: shape (1024, 1)
    frame_F: np.ndarray
|
|
||||||
|
|
||||||
|
|
||||||
class AACSeq1Frame(TypedDict):
    """One frame dictionary of aac_seq_1 (Level 1)."""

    # Frame type shared by both channels ("OLS" | "LSS" | "ESH" | "LPS").
    frame_type: FrameType
    # Window family used for this frame ("KBD" | "SIN").
    win_type: WinType
    # Left-channel payload.
    chl: AACChannelFrameF
    # Right-channel payload.
    chr: AACChannelFrameF
|
|
||||||
|
|
||||||
|
|
||||||
AACSeq1 = List[AACSeq1Frame]
"""AAC sequence for Level 1:
List of length K (K = number of frames).
Each element is a dict with keys:
    - "frame_type", "win_type", "chl", "chr"
"""
|
|
||||||
|
|
||||||
# Global Options
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# Window type used for every frame produced by the encoder.
# Options: "SIN", "KBD"
WIN_TYPE: WinType = "SIN"
|
|
||||||
|
|
||||||
|
|
||||||
# Private helpers for SSC
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# See Table 1 in mm-2025-hw-v0.1.pdf
# Key:   (left-channel decision, right-channel decision)
# Value: common frame type used for both channels.
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}
|
|
||||||
|
|
||||||
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
    """
    Detect if next frame (single channel) implies ESH according to the spec's attack criterion.

    Parameters
    ----------
    next_frame_channel : FrameChannelT
        One channel of next_frame_T (shape: (2048,)). Any real dtype is
        accepted; the computation is always carried out in float64.

    Returns
    -------
    attack : bool
        True if an attack is detected (=> next frame predicted ESH), else False.

    Notes
    -----
    The spec describes:
        - High-pass filter applied to next_frame_channel
        - Split into 16 segments of length 128
        - Compute segment energies s(l)
        - Compute ds(l) = s(l) / s(l-1)
        - Attack exists if there exists l in {1..7} such that:
          s(l) > 1e-3 and ds(l) > 10
    """
    # Convert once to float64: allocating the filter output with the input's
    # dtype would truncate it for integer input and lose precision for float32.
    x = np.asarray(next_frame_channel, dtype=np.float64)

    # High-pass filter H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
    # Implemented as: y[n] = x[n] - x[n-1] + 0.5*y[n-1], zero initial state.
    y = np.zeros(x.shape[0], dtype=np.float64)
    prev_x = 0.0
    prev_y = 0.0
    for n in range(x.shape[0]):
        xn = float(x[n])
        yn = (xn - prev_x) + 0.5 * prev_y
        y[n] = yn
        prev_x = xn
        prev_y = yn

    # Segment energies over 16 blocks of 128 samples.
    s = np.empty(16, dtype=np.float64)
    for l in range(16):
        seg = y[l * 128:(l + 1) * 128]
        s[l] = float(np.sum(seg * seg))

    eps = 1e-12  # avoid division by zero without changing logic materially

    # Spec: check l in {1..7}; ds(l) is the energy ratio to the previous segment.
    for l in range(1, 8):
        ds_l = s[l] / max(s[l - 1], eps)
        if (s[l] > 1e-3) and (ds_l > 10.0):
            return True

    return False
|
|
||||||
|
|
||||||
|
|
||||||
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
    """
    Decide current frame type for a single channel based on prev_frame_type and next-frame attack.

    Parameters
    ----------
    prev_frame_type : FrameType
        Previous frame type (one of "OLS","LSS","ESH","LPS").
    attack : bool
        Whether next frame is predicted ESH for this channel.

    Returns
    -------
    frame_type : FrameType
        The per-channel decision for the current frame.

    Raises
    ------
    ValueError
        If prev_frame_type is not one of the four frame type codes.

    Rules (spec)
    ------------
    - If prev is "LSS" => current is "ESH" (fixed)
    - If prev is "LPS" => current is "OLS" (fixed)
    - If prev is "OLS" => current is "LSS" if attack else "OLS"
    - If prev is "ESH" => current is "ESH" if attack else "LPS"
    """
    # Forced transitions: the attack flag is irrelevant here.
    if prev_frame_type == "LSS":
        return "ESH"
    if prev_frame_type == "LPS":
        return "OLS"

    # Free transitions: depend on whether the next frame contains an attack.
    if prev_frame_type == "OLS":
        return "LSS" if attack else "OLS"
    if prev_frame_type == "ESH":
        return "ESH" if attack else "LPS"

    raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
|
|
||||||
|
|
||||||
|
|
||||||
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Merge per-channel frame types into one common frame type using the spec table.

    Parameters
    ----------
    ft_l : FrameType
        Frame type decision for channel 0 (left).
    ft_r : FrameType
        Frame type decision for channel 1 (right).

    Returns
    -------
    common : FrameType
        The common final frame type.

    Raises
    ------
    ValueError
        If the (ft_l, ft_r) pair is not a key of STEREO_MERGE_TABLE.
    """
    try:
        return STEREO_MERGE_TABLE[(ft_l, ft_r)]
    except KeyError as e:
        # Re-raise as ValueError so callers see an argument error, keeping the cause.
        raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}") from e
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Private helpers for Filterbank
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _sin_window(N: int) -> np.ndarray:
    """
    Sine window (full length N).

        w[n] = sin(pi/N * (n + 0.5)),  0 <= n < N

    Parameters
    ----------
    N : int
        Window length in samples.

    Returns
    -------
    np.ndarray
        Window of shape (N,), dtype float64.
    """
    n = np.arange(N, dtype=np.float64)
    return np.sin((np.pi / N) * (n + 0.5))
|
|
||||||
|
|
||||||
|
|
||||||
def _kbd_window(N: int, alpha: float) -> np.ndarray:
    """
    Kaiser-Bessel-Derived (KBD) window (full length N).

    This follows the standard KBD construction:
        - Build Kaiser kernel of length N/2 + 1
        - Use cumulative sum and sqrt normalization to form left and right halves

    Parameters
    ----------
    N : int
        Window length in samples; must be even.
    alpha : float
        KBD shape parameter (Kaiser beta is taken as pi * alpha).

    Returns
    -------
    np.ndarray
        Window of shape (N,), dtype float64, symmetric around its centre.

    Raises
    ------
    ValueError
        If N is negative or odd (an odd N cannot be split into two equal
        halves and would yield a window of length N - 1).
    """
    if N < 0 or N % 2 != 0:
        raise ValueError(f"KBD window length must be a non-negative even integer, got {N!r}.")

    half = N // 2

    # Kaiser kernel length: half + 1 samples (0 .. half)
    # beta = pi * alpha per the usual correspondence with the ISO definition
    kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)

    csum = np.cumsum(kernel)
    denom = csum[-1]

    w_left = np.sqrt(csum[:-1] / denom)  # length half, n = 0 .. half-1
    w_right = w_left[::-1]               # mirror for second half

    return np.concatenate([w_left, w_right])
|
|
||||||
|
|
||||||
|
|
||||||
def _long_window(win_type: WinType) -> np.ndarray:
    """
    Long window (length 2048) for the selected win_type.

    Parameters
    ----------
    win_type : WinType
        "SIN" for the sine window, "KBD" for the Kaiser-Bessel-Derived window.

    Returns
    -------
    np.ndarray
        Window of length 2048.

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    if win_type == "SIN":
        return _sin_window(2048)
    if win_type == "KBD":
        # Assignment-specific alpha value for the long window.
        return _kbd_window(2048, alpha=6.0)
    raise ValueError(f"Invalid win_type: {win_type!r}")
|
|
||||||
|
|
||||||
|
|
||||||
def _short_window(win_type: WinType) -> np.ndarray:
    """
    Short window (length 256) for the selected win_type.

    Parameters
    ----------
    win_type : WinType
        "SIN" for the sine window, "KBD" for the Kaiser-Bessel-Derived window.

    Returns
    -------
    np.ndarray
        Window of length 256.

    Raises
    ------
    ValueError
        If win_type is neither "SIN" nor "KBD".
    """
    if win_type == "SIN":
        return _sin_window(256)
    if win_type == "KBD":
        # Assignment-specific alpha value for the short window.
        return _kbd_window(256, alpha=4.0)
    raise ValueError(f"Invalid win_type: {win_type!r}")
|
|
||||||
|
|
||||||
|
|
||||||
def _window_sequence(frame_type: FrameType, win_type: WinType) -> np.ndarray:
    """
    Build the 2048-sample window sequence for OLS/LSS/LPS.

    We follow the simplified assumption:
        - The same window shape (KBD or SIN) is used globally (no mixed halves).
        - Therefore, the left and right halves are drawn from the same family.

    Parameters
    ----------
    frame_type : FrameType
        One of "OLS", "LSS", "LPS" ("ESH" is not handled here).
    win_type : WinType
        Window family, "KBD" or "SIN".

    Returns
    -------
    np.ndarray
        Window of shape (2048,), dtype float64.

    Raises
    ------
    ValueError
        If win_type is invalid (raised by the window builders), or if
        frame_type is not one of "OLS", "LSS", "LPS".
    """
    # Built first so that an invalid win_type is reported before an invalid
    # frame_type.
    wL = _long_window(win_type)  # length 2048

    if frame_type == "OLS":
        return wL

    if frame_type not in ("LSS", "LPS"):
        raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")

    wS = _short_window(win_type)  # length 256
    out = np.zeros(2048, dtype=np.float64)

    if frame_type == "LSS":
        # 0..1023:    left half of long window
        # 1024..1471: ones (448 samples)
        # 1472..1599: right half of short window (128 samples)
        # 1600..2047: zeros (448 samples, already zero-initialised)
        out[0:1024] = wL[0:1024]
        out[1024:1472] = 1.0
        out[1472:1600] = wS[128:256]
        return out

    # frame_type == "LPS"
    # 0..447:     zeros (448 samples, already zero-initialised)
    # 448..575:   left half of short window (128)
    # 576..1023:  ones (448)
    # 1024..2047: right half of long window (1024)
    out[448:576] = wS[0:128]
    out[576:1024] = 1.0
    out[1024:2048] = wL[1024:2048]
    return out
|
|
||||||
|
|
||||||
|
|
||||||
def _mdct(s: np.ndarray) -> np.ndarray:
    """
    MDCT (direct form) as given in the assignment.

    Parameters
    ----------
    s : np.ndarray
        Windowed time samples of length N (N = 2048 or 256).

    Returns
    -------
    np.ndarray
        MDCT coefficients X of length N/2, dtype float64.

    Raises
    ------
    ValueError
        If the input length is neither 2048 nor 256.

    Notes
    -----
    Definition:
        X[k] = 2 * sum_{n=0 .. N-1} s[n] * cos(2*pi/N * (n + n0) * (k + 1/2))
    where n0 = (N/2 + 1)/2
    """
    s = np.asarray(s, dtype=np.float64)
    N = int(s.shape[0])
    if N not in (2048, 256):
        raise ValueError("MDCT input length must be 2048 or 256.")

    n0 = (N / 2.0 + 1.0) / 2.0

    n = np.arange(N, dtype=np.float64) + n0       # shifted time index (n + n0)
    k = np.arange(N // 2, dtype=np.float64) + 0.5  # shifted bin index (k + 1/2)

    # Cosine matrix: shape (N, N/2)
    C = np.cos((2.0 * np.pi / N) * np.outer(n, k))
    X = 2.0 * (s @ C)

    return X
|
|
||||||
|
|
||||||
def _imdct(X: np.ndarray) -> np.ndarray:
    """
    IMDCT (direct form) as given in the assignment.

    Parameters
    ----------
    X : np.ndarray
        MDCT coefficients of length N/2 (N = 2048 or 256). Any shape is
        accepted as long as it flattens to 1024 or 128 values, e.g. (1024, 1).

    Returns
    -------
    np.ndarray
        Time samples s of length N, dtype float64 (still time-aliased; the
        caller applies the synthesis window and overlap-add).

    Raises
    ------
    ValueError
        If the flattened input length is neither 1024 nor 128.

    Notes
    -----
    Definition:
        s[n] = (2/N) * sum_{k=0 .. N/2-1} X[k] * cos(2*pi/N * (n + n0) * (k + 1/2))
    where n0 = (N/2 + 1)/2
    """
    X = np.asarray(X, dtype=np.float64).reshape(-1)
    K = int(X.shape[0])
    if K not in (1024, 128):
        raise ValueError("IMDCT input length must be 1024 or 128.")

    N = 2 * K
    n0 = (N / 2.0 + 1.0) / 2.0

    n = np.arange(N, dtype=np.float64) + n0  # shifted time index (n + n0)
    k = np.arange(K, dtype=np.float64) + 0.5  # shifted bin index (k + 1/2)

    C = np.cos((2.0 * np.pi / N) * np.outer(n, k))  # (N, K)
    s = (2.0 / N) * (C @ X)

    return s
|
|
||||||
|
|
||||||
|
|
||||||
def _filter_bank_esh_channel(x_ch: np.ndarray, win_type: WinType) -> np.ndarray:
    """
    ESH analysis for one channel.

    Parameters
    ----------
    x_ch : np.ndarray
        Time-domain samples of one channel; the slices taken below cover
        samples 448 .. 1599, so a 2048-sample frame is expected.
    win_type : WinType
        Window family ("KBD" or "SIN") for the short window.

    Returns
    -------
    X_esh : np.ndarray
        Shape (128, 8), where each column is the 128 MDCT coeffs of one short window.
    """
    wS = _short_window(win_type)
    X_esh = np.empty((128, 8), dtype=np.float64)

    # ESH subwindows are taken from the central region:
    # start positions: 448 + 128*j, j = 0..7 (256 samples each, 50% overlap)
    for j in range(8):
        start = 448 + 128 * j
        seg = x_ch[start:start + 256] * wS
        X_esh[:, j] = _mdct(seg)

    return X_esh
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _unpack_esh(frame_F: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).

    Mapping is the inverse of the packing used in filter_bank():
        out[:, 2*j]   = left[:, j]
        out[:, 2*j+1] = right[:, j]

    Parameters
    ----------
    frame_F : np.ndarray
        Packed ESH spectrum, shape (128, 16).

    Returns
    -------
    left, right : np.ndarray
        Independent float64 copies of shape (128, 8): the even columns
        (left channel) and the odd columns (right channel) of frame_F.

    Raises
    ------
    ValueError
        If frame_F does not have shape (128, 16).
    """
    if frame_F.shape != (128, 16):
        raise ValueError("ESH frame_F must have shape (128, 16).")

    # astype() always copies here, so the outputs never alias frame_F.
    left = frame_F[:, 0::2].astype(np.float64)
    right = frame_F[:, 1::2].astype(np.float64)
    return left, right
|
|
||||||
|
|
||||||
|
|
||||||
def _i_filter_bank_esh_channel(X_esh: np.ndarray, win_type: WinType) -> np.ndarray:
    """
    ESH synthesis for one channel.

    Parameters
    ----------
    X_esh : np.ndarray
        (128, 8) MDCT coeffs for 8 short windows.
    win_type : WinType
        Window family ("KBD" or "SIN") for the short window.

    Returns
    -------
    x_ch : np.ndarray
        (2048,) time-domain frame contribution (windowed),
        ready for OLA at the caller level.

    Raises
    ------
    ValueError
        If X_esh does not have shape (128, 8).
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")

    wS = _short_window(win_type)
    out = np.zeros(2048, dtype=np.float64)

    # Each short IMDCT returns 256 samples. Place them at:
    # start = 448 + 128*j, j=0..7 (50% overlap), accumulating the overlaps.
    for j in range(8):
        seg = _imdct(X_esh[:, j]) * wS  # (256,)
        start = 448 + 128 * j
        out[start:start + 256] += seg

    return out
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public Function prototypes (Level 1)
|
# Public Level 1 API (wrappers)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).
    Selects and returns the frame type for the current frame (i) based on input parameters.

    Parameters
    ----------
    frame_T : FrameT
        current time-domain frame i, stereo, shape (2048, 2)
        (only its shape is validated; the decision uses next_frame_T)
    next_frame_T : FrameT
        next time-domain frame (i+1), stereo, shape (2048, 2)
        (used to decide transitions to/from ESH)
    prev_frame_type : FrameType
        frame type chosen for the previous frame (i-1)

    Returns
    -------
    frame_type : FrameType
        - "OLS" (ONLY_LONG_SEQUENCE)
        - "LSS" (LONG_START_SEQUENCE)
        - "ESH" (EIGHT_SHORT_SEQUENCE)
        - "LPS" (LONG_STOP_SEQUENCE)

    Raises
    ------
    ValueError
        If either frame does not have shape (2048, 2), or if
        prev_frame_type is not a valid frame type code.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != (2048, 2):
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Detect attack independently per channel on next frame.
    attack_l = _detect_attack(next_frame_T[:, 0])
    attack_r = _detect_attack(next_frame_T[:, 1])

    # Decide per-channel type based on shared prev_frame_type.
    ft_l = _decide_frame_type(prev_frame_type, attack_l)
    ft_r = _decide_frame_type(prev_frame_type, attack_r)

    # Stereo merge as per Table 1.
    return _stereo_merge(ft_l, ft_r)
|
|
||||||
|
|
||||||
|
|
||||||
def filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis).

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN") used for the current frame.

    Returns
    -------
    frame_F : FrameF
        Frequency-domain MDCT coefficients:
        - If frame_type in {"OLS","LSS","LPS"}: array shape (1024, 2)
          containing MDCT coefficients for both channels.
        - If frame_type == "ESH": contains 8 subframes, each subframe has shape (128,2),
          placed in columns according to subframe order, i.e. overall shape (128, 16).

    Raises
    ------
    ValueError
        If frame_T does not have shape (2048, 2) or frame_type is invalid.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    xL = frame_T[:, 0].astype(np.float64, copy=False)
    xR = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type in ("OLS", "LSS", "LPS"):
        w = _window_sequence(frame_type, win_type)  # length 2048
        out = np.empty((1024, 2), dtype=np.float64)
        out[:, 0] = _mdct(xL * w)  # length 1024
        out[:, 1] = _mdct(xR * w)  # length 1024
        return out

    if frame_type == "ESH":
        Xl = _filter_bank_esh_channel(xL, win_type)  # (128, 8)
        Xr = _filter_bank_esh_channel(xR, win_type)  # (128, 8)

        # Pack into (128, 16) as [L0 R0 L1 R1 ... L7 R7]:
        # even columns carry the left channel, odd columns the right channel.
        out = np.empty((128, 16), dtype=np.float64)
        out[:, 0::2] = Xl
        out[:, 1::2] = Xr
        return out

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
|
|
||||||
|
|
||||||
|
|
||||||
def i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis).

    Parameters
    ----------
    frame_F : FrameF
        Frequency-domain MDCT coefficients as produced by filter_bank().
    frame_type : FrameType
        Frame type ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Reconstructed (windowed) time-domain frame, stereo, shape (2048, 2).

    Raises
    ------
    ValueError
        If frame_F has the wrong shape for the given frame_type, or if
        frame_type is invalid.
    """
    if frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")

        # The same window sequence is applied again on synthesis.
        w = _window_sequence(frame_type, win_type)

        out = np.empty((2048, 2), dtype=np.float64)
        out[:, 0] = _imdct(frame_F[:, 0]) * w
        out[:, 1] = _imdct(frame_F[:, 1]) * w
        return out

    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")

        Xl, Xr = _unpack_esh(frame_F)

        out = np.empty((2048, 2), dtype=np.float64)
        out[:, 0] = _i_filter_bank_esh_channel(Xl, win_type)
        out[:, 1] = _i_filter_bank_esh_channel(Xr, win_type)
        return out

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
|
|
||||||
|
|
||||||
|
|
||||||
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
||||||
"""
|
"""
|
||||||
Level-1 AAC encoder.
|
Level-1 AAC encoder (wrapper).
|
||||||
|
|
||||||
|
Delegates to core implementation.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename_in : str | Path
|
filename_in : Union[str, Path]
|
||||||
Input WAV filename.
|
Input WAV filename.
|
||||||
Assumption: stereo audio, sampling rate 48 kHz.
|
Assumption: stereo audio, sampling rate 48 kHz.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
aac_seq_1 : AACSeq1
|
AACSeq1
|
||||||
List of K encoded frames.
|
List of encoded frames (Level 1 schema).
|
||||||
For each i:
|
|
||||||
|
|
||||||
- aac_seq_1[i]["frame_type"]: FrameType
|
|
||||||
- aac_seq_1[i]["win_type"]: WinType
|
|
||||||
- aac_seq_1[i]["chl"]["frame_F"]:
|
|
||||||
- ESH: shape (128, 8)
|
|
||||||
- else: shape (1024, 1)
|
|
||||||
- aac_seq_1[i]["chr"]["frame_F"]:
|
|
||||||
- ESH: shape (128, 8)
|
|
||||||
- else: shape (1024, 1)
|
|
||||||
"""
|
"""
|
||||||
filename_in = Path(filename_in)
|
return core_aac_coder_1(filename_in)
|
||||||
|
|
||||||
x, fs = sf.read(str(filename_in), always_2d=True)
|
|
||||||
x = np.asarray(x, dtype=np.float64)
|
|
||||||
|
|
||||||
if x.shape[1] != 2:
|
|
||||||
raise ValueError("Input must be stereo (2 channels).")
|
|
||||||
if fs != 48000:
|
|
||||||
raise ValueError("Input sampling rate must be 48 kHz.")
|
|
||||||
|
|
||||||
hop = 1024
|
|
||||||
win = 2048
|
|
||||||
|
|
||||||
# Pad at the beginning to support the first overlap region.
|
|
||||||
# Tail padding is kept minimal; next-frame is padded on-the-fly when needed.
|
|
||||||
pad_pre = np.zeros((hop, 2), dtype=np.float64)
|
|
||||||
pad_post = np.zeros((hop, 2), dtype=np.float64)
|
|
||||||
x_pad = np.vstack([pad_pre, x, pad_post])
|
|
||||||
|
|
||||||
# Number of frames such that current frame fits; next frame will be padded if needed.
|
|
||||||
K = int((x_pad.shape[0] - win) // hop + 1)
|
|
||||||
if K <= 0:
|
|
||||||
raise ValueError("Input too short for framing.")
|
|
||||||
|
|
||||||
aac_seq: AACSeq1 = []
|
|
||||||
prev_frame_type: FrameType = "OLS"
|
|
||||||
|
|
||||||
for i in range(K):
|
|
||||||
start = i * hop
|
|
||||||
|
|
||||||
frame_t: FrameT = x_pad[start:start + win, :]
|
|
||||||
if frame_t.shape != (win, 2):
|
|
||||||
# This should not happen due to K definition, but we keep it explicit.
|
|
||||||
raise ValueError("Internal framing error: frame_t has wrong shape.")
|
|
||||||
|
|
||||||
next_t = x_pad[start + hop:start + hop + win, :]
|
|
||||||
|
|
||||||
# Ensure next_t is always (2048,2) by zero-padding at the tail.
|
|
||||||
if next_t.shape[0] < win:
|
|
||||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
|
||||||
next_t = np.vstack([next_t, tail])
|
|
||||||
|
|
||||||
frame_type = SSC(frame_t, next_t, prev_frame_type)
|
|
||||||
frame_f = filter_bank(frame_t, frame_type, WIN_TYPE)
|
|
||||||
|
|
||||||
# Store per-channel as required by AACSeq1 schema
|
|
||||||
if frame_type == "ESH":
|
|
||||||
# frame_f: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
|
|
||||||
chl_f = np.empty((128, 8), dtype=np.float64)
|
|
||||||
chr_f = np.empty((128, 8), dtype=np.float64)
|
|
||||||
for j in range(8):
|
|
||||||
chl_f[:, j] = frame_f[:, 2 * j + 0]
|
|
||||||
chr_f[:, j] = frame_f[:, 2 * j + 1]
|
|
||||||
else:
|
|
||||||
# frame_f: (1024, 2)
|
|
||||||
chl_f = frame_f[:, 0:1].astype(np.float64, copy=False)
|
|
||||||
chr_f = frame_f[:, 1:2].astype(np.float64, copy=False)
|
|
||||||
|
|
||||||
aac_seq.append({
|
|
||||||
"frame_type": frame_type,
|
|
||||||
"win_type": WIN_TYPE,
|
|
||||||
"chl": {"frame_F": chl_f},
|
|
||||||
"chr": {"frame_F": chr_f},
|
|
||||||
})
|
|
||||||
prev_frame_type = frame_type
|
|
||||||
return aac_seq
|
|
||||||
|
|
||||||
|
|
||||||
def i_aac_coder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> np.ndarray:
|
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-1 AAC decoder (inverse of aac_coder_1()).
|
Level-1 AAC decoder (wrapper).
|
||||||
|
|
||||||
|
Delegates to core implementation.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
aac_seq_1 : AACSeq1
|
aac_seq_1 : AACSeq1
|
||||||
Encoded sequence as produced by aac_coder_1().
|
Encoded sequence as produced by aac_coder_1().
|
||||||
filename_out : str | Path
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename.
|
Output WAV filename. Assumption: 48 kHz, stereo.
|
||||||
Assumption: stereo audio, sampling rate 48 kHz.
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
x : np.ndarray
|
StereoSignal
|
||||||
Decoded audio samples (time-domain).
|
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||||
Expected shape: (N, 2) for stereo (N depends on input length).
|
|
||||||
"""
|
"""
|
||||||
filename_out = Path(filename_out)
|
return core_aac_decoder_1(aac_seq_1, filename_out)
|
||||||
|
|
||||||
hop = 1024
|
|
||||||
win = 2048
|
|
||||||
K = len(aac_seq_1)
|
|
||||||
|
|
||||||
# Output includes the encoder padding region, so we reconstruct
|
# -----------------------------------------------------------------------------
|
||||||
# full padded stream. For K frames: last frame starts at (K-1)*hop and spans win,
|
# Demo (Level 1)
|
||||||
# so total length = (K-1)*hop + win
|
# -----------------------------------------------------------------------------
|
||||||
n_pad = (K - 1) * hop + win
|
|
||||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
|
||||||
|
|
||||||
for i, fr in enumerate(aac_seq_1):
|
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||||
frame_type = fr["frame_type"]
|
"""
|
||||||
win_type = fr["win_type"]
|
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||||
|
|
||||||
chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
|
Parameters
|
||||||
chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
|
----------
|
||||||
|
x_ref : StereoSignal
|
||||||
|
Reference stereo stream.
|
||||||
|
x_hat : StereoSignal
|
||||||
|
Reconstructed stereo stream.
|
||||||
|
|
||||||
# Re-pack into the format expected by i_filter_bank()
|
Returns
|
||||||
if frame_type == "ESH":
|
-------
|
||||||
if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
|
float
|
||||||
raise ValueError("ESH channel frame_F must have shape (128, 8).")
|
SNR in dB.
|
||||||
|
- Returns +inf if noise power is zero.
|
||||||
|
- Returns -inf if signal power is zero.
|
||||||
|
"""
|
||||||
|
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||||
|
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||||
|
|
||||||
frame_f = np.empty((128, 16), dtype=np.float64)
|
if x_ref.ndim == 1:
|
||||||
for j in range(8):
|
x_ref = x_ref.reshape(-1, 1)
|
||||||
frame_f[:, 2 * j + 0] = chl_f[:, j]
|
if x_hat.ndim == 1:
|
||||||
frame_f[:, 2 * j + 1] = chr_f[:, j]
|
x_hat = x_hat.reshape(-1, 1)
|
||||||
else:
|
|
||||||
if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
|
|
||||||
raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")
|
|
||||||
|
|
||||||
frame_f = np.empty((1024, 2), dtype=np.float64)
|
n = min(x_ref.shape[0], x_hat.shape[0])
|
||||||
frame_f[:, 0] = chl_f[:, 0]
|
c = min(x_ref.shape[1], x_hat.shape[1])
|
||||||
frame_f[:, 1] = chr_f[:, 0]
|
|
||||||
|
|
||||||
frame_t_hat = i_filter_bank(frame_f, frame_type, win_type) # (2048, 2)
|
x_ref = x_ref[:n, :c]
|
||||||
|
x_hat = x_hat[:n, :c]
|
||||||
|
|
||||||
start = i * hop
|
err = x_ref - x_hat
|
||||||
y_pad[start:start + win, :] += frame_t_hat
|
ps = float(np.sum(x_ref * x_ref))
|
||||||
|
pn = float(np.sum(err * err))
|
||||||
|
|
||||||
# Remove boundary padding that encoder adds: hop samples at start and hop at end.
|
if pn <= 0.0:
|
||||||
if y_pad.shape[0] < 2 * hop:
|
return float("inf")
|
||||||
raise ValueError("Decoded stream too short to unpad.")
|
if ps <= 0.0:
|
||||||
|
return float("-inf")
|
||||||
|
|
||||||
y = y_pad[hop:-hop, :]
|
return float(10.0 * np.log10(ps / pn))
|
||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
|
||||||
return y
|
|
||||||
|
|
||||||
|
|
||||||
def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float:
|
def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float:
|
||||||
"""
|
"""
|
||||||
Demonstration for Level-1 codec.
|
Demonstration for the Level-1 codec.
|
||||||
|
|
||||||
Runs:
|
Runs:
|
||||||
- aac_coder_1(filename_in)
|
- aac_coder_1(filename_in)
|
||||||
- i_aac_coder_1(aac_seq_1, filename_out)
|
- aac_decoder_1(aac_seq_1, filename_out)
|
||||||
and computes total SNR between original and decoded audio.
|
and computes total SNR between original and decoded audio.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename_in : str | Path
|
filename_in : Union[str, Path]
|
||||||
Input WAV filename (stereo, 48 kHz).
|
Input WAV filename (stereo, 48 kHz).
|
||||||
filename_out : str | Path
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename (stereo, 48 kHz).
|
Output WAV filename (stereo, 48 kHz).
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
SNR : float
|
float
|
||||||
Overall Signal-to-Noise Ratio in dB.
|
Overall SNR in dB.
|
||||||
"""
|
"""
|
||||||
filename_in = Path(filename_in)
|
filename_in = Path(filename_in)
|
||||||
filename_out = Path(filename_out)
|
filename_out = Path(filename_out)
|
||||||
|
|
||||||
# Read original audio (reference)
|
# Read original audio (reference) with the same validation as the codec.
|
||||||
x_ref, fs_ref = sf.read(str(filename_in), always_2d=True)
|
x_ref, fs_ref = aac_read_wav_stereo_48k(filename_in)
|
||||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
if int(fs_ref) != 48000:
|
||||||
|
raise ValueError("Input sampling rate must be 48 kHz.")
|
||||||
|
|
||||||
# Encode / decode
|
# Encode / decode
|
||||||
aac_seq_1 = aac_coder_1(filename_in)
|
aac_seq_1 = aac_coder_1(filename_in)
|
||||||
x_hat = i_aac_coder_1(aac_seq_1, filename_out)
|
x_hat = aac_decoder_1(aac_seq_1, filename_out)
|
||||||
x_hat = np.asarray(x_hat, dtype=np.float64)
|
|
||||||
|
|
||||||
# Ensure 2D stereo shape (N, 2)
|
# Optional sanity: ensure output file exists and is readable
|
||||||
if x_hat.ndim == 1:
|
x_hat_file, fs_hat = sf.read(str(filename_out), always_2d=True)
|
||||||
x_hat = x_hat.reshape(-1, 1)
|
_ = x_hat_file
|
||||||
if x_ref.ndim == 1:
|
if int(fs_hat) != 48000:
|
||||||
x_ref = x_ref.reshape(-1, 1)
|
raise ValueError("Decoded output sampling rate must be 48 kHz.")
|
||||||
|
|
||||||
# Align lengths (use common overlap)
|
return _snr_db(x_ref, x_hat)
|
||||||
n = min(x_ref.shape[0], x_hat.shape[0])
|
|
||||||
x_ref = x_ref[:n, :]
|
|
||||||
x_hat = x_hat[:n, :]
|
|
||||||
|
|
||||||
# Match channel count conservatively (common channels)
|
|
||||||
c = min(x_ref.shape[1], x_hat.shape[1])
|
|
||||||
x_ref = x_ref[:, :c]
|
|
||||||
x_hat = x_hat[:, :c]
|
|
||||||
|
|
||||||
# Compute overall SNR over all samples and channels
|
|
||||||
err = x_ref - x_hat
|
|
||||||
p_signal = float(np.sum(x_ref * x_ref))
|
|
||||||
p_noise = float(np.sum(err * err))
|
|
||||||
|
|
||||||
if p_noise <= 0.0:
|
|
||||||
return float("inf")
|
|
||||||
if p_signal <= 0.0:
|
|
||||||
# Degenerate case: silent input
|
|
||||||
return -float("inf")
|
|
||||||
# else:
|
|
||||||
snr_db = 10.0 * np.log10(p_signal / p_noise)
|
|
||||||
return float(snr_db)
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Example usage:
|
# Example:
|
||||||
# python -m level_1.level_1 input.wav output.wav
|
# python -m level_1.level_1 input.wav output.wav
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) != 3:
|
||||||
raise SystemExit("Usage: python -m level_1.level_1 <input.wav> <output.wav>")
|
raise SystemExit("Usage: python -m level_1.level_1 <input.wav> <output.wav>")
|
||||||
|
|
||||||
in_wav = sys.argv[1]
|
in_wav = Path(sys.argv[1])
|
||||||
out_wav = sys.argv[2]
|
out_wav = Path(sys.argv[2])
|
||||||
|
|
||||||
print(f"Encoding/Decoding {in_wav} to {out_wav}")
|
print(f"Encoding/Decoding {in_wav} to {out_wav}")
|
||||||
snr = demo_aac_1(in_wav, out_wav)
|
snr = demo_aac_1(in_wav, out_wav)
|
||||||
print(f"SNR = {snr:.3f} dB")
|
print(f"SNR = {snr:.3f} dB")
|
||||||
|
|
||||||
|
|||||||
@ -1,199 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
# Adjust the import based on package/module layout.
|
|
||||||
from level_1.level_1 import SSC
|
|
||||||
|
|
||||||
# Helper "fixtures" for SSC
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _next_frame_no_attack() -> np.ndarray:
    """
    Return a silent next_frame_T (2048x2) that must NOT trigger ESH detection.

    Every sample is exactly zero, so every segment energy s2l is zero and the
    ESH condition (s2l > 1e-3) can never be satisfied.
    """
    silent_shape = (2048, 2)
    return np.zeros(silent_shape, dtype=np.float64)
|
|
||||||
|
|
||||||
|
|
||||||
def _next_frame_strong_attack(
    *,
    attack_left: bool,
    attack_right: bool,
    segment_l: int = 4,
    baseline: float = 1e-6,
    burst_amp: float = 1.0,
) -> np.ndarray:
    """
    Build a next_frame_T (2048x2) meant to trigger ESH detection on the chosen channels.

    Spec: ESH if there is an l in {1..7} with s2l > 1e-3 AND ds2l > 10.
    The frame is filled with a small constant `baseline` (keeps ds2l free of
    divisions by zero) and `burst_amp` is added over the 128 samples
    [segment_l*128, (segment_l+1)*128) of every channel whose flag is set.
    """
    assert 1 <= segment_l <= 7
    frame = np.full((2048, 2), baseline, dtype=np.float64)

    # Sample range of the segment that receives the burst.
    burst = slice(segment_l * 128, (segment_l + 1) * 128)

    # Column 0 is the left channel, column 1 the right channel.
    for channel, enabled in enumerate((attack_left, attack_right)):
        if enabled:
            frame[burst, channel] += burst_amp

    return frame
|
|
||||||
|
|
||||||
|
|
||||||
def _next_frame_below_s2l_threshold(
    *,
    left: bool,
    right: bool,
    segment_l: int = 4,
    impulse_amp: float = 0.01,
) -> np.ndarray:
    """
    Build a next_frame_T whose segment energy stays below 1e-3, so ESH must
    NOT be triggered even if ds2l could be large.

    A single impulse of amplitude `impulse_amp` is written inside the chosen
    segment of each selected channel; all other samples are zero. The energy
    of that 128-sample segment is therefore about impulse_amp^2, and with
    impulse_amp=0.01 this gives s2l ~= 1e-4 < 1e-3.
    """
    assert 1 <= segment_l <= 7
    frame = np.zeros((2048, 2), dtype=np.float64)

    # Ten samples into the segment, i.e. strictly inside it.
    pos = segment_l * 128 + 10

    # Column 0 is the left channel, column 1 the right channel.
    for channel, enabled in enumerate((left, right)):
        if enabled:
            frame[pos, channel] = impulse_amp

    return frame
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------
|
|
||||||
# 1) Fixed/mandatory cases (prev frame type forces current type)
|
|
||||||
# ---------------------------------------------------------------------
|
|
||||||
|
|
||||||
def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
    """
    Spec: some previous frame types force the current type:
      - prev LSS => current MUST be ESH
      - prev LPS => current MUST be OLS
    regardless of what the next-frame check would say.
    """
    current = np.zeros((2048, 2), dtype=np.float64)

    # A strong attack on both channels of the next frame must not change
    # either forced outcome.
    upcoming = _next_frame_strong_attack(attack_left=True, attack_right=True)

    forced = {"LSS": "ESH", "LPS": "OLS"}
    for prev_type, expected in forced.items():
        assert SSC(current, upcoming, prev_type) == expected
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------
|
|
||||||
# 2) Cases requiring next-frame ESH prediction (energy/attack computation)
|
|
||||||
# ---------------------------------------------------------------------
|
|
||||||
|
|
||||||
def test_prev_ols_next_not_esh_returns_ols() -> None:
    """
    Spec: with prev=OLS the current frame is either OLS or LSS.
    LSS is chosen iff frame (i+1) is predicted ESH, otherwise OLS;
    a silent next frame therefore has to yield OLS.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_no_attack()

    assert SSC(current, upcoming, "OLS") == "OLS"
|
|
||||||
|
|
||||||
|
|
||||||
def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
    """
    prev=OLS with the next frame predicted ESH on both channels:
    each per-channel decision is LSS, and the merge table keeps LSS.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_strong_attack(attack_left=True, attack_right=True)

    assert SSC(current, upcoming, "OLS") == "LSS"
|
|
||||||
|
|
||||||
|
|
||||||
def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
    """
    prev=OLS with an attack on only one channel of the next frame:
      - the attacked channel predicts ESH     => LSS
      - the other channel predicts not ESH    => OLS
    Merge table: OLS + LSS => LSS, whichever channel carries the attack.
    """
    current = np.zeros((2048, 2), dtype=np.float64)

    # First the left-only attack, then the right-only attack.
    for left, right in ((True, False), (False, True)):
        upcoming = _next_frame_strong_attack(attack_left=left, attack_right=right)
        assert SSC(current, upcoming, "OLS") == "LSS"
|
|
||||||
|
|
||||||
|
|
||||||
def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
    """
    prev=ESH with the next frame predicted ESH on both channels:
    each per-channel decision is ESH.
    Merge table: ESH + ESH => ESH.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_strong_attack(attack_left=True, attack_right=True)

    assert SSC(current, upcoming, "ESH") == "ESH"
|
|
||||||
|
|
||||||
|
|
||||||
def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
    """
    prev=ESH with the next frame not predicted ESH on either channel:
    each per-channel decision is LPS.
    Merge table: LPS + LPS => LPS.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_no_attack()

    assert SSC(current, upcoming, "ESH") == "LPS"
|
|
||||||
|
|
||||||
|
|
||||||
def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
    """
    prev=ESH with an attack on only one channel of the next frame:
      - the attacked channel predicts ESH     => ESH
      - the other channel predicts not ESH    => LPS
    Merge table: ESH + LPS => ESH, whichever channel carries the attack.
    """
    frame_t = np.zeros((2048, 2), dtype=np.float64)

    # Attack on the left channel only.
    next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
    out1 = SSC(frame_t, next1_t, "ESH")
    assert out1 == "ESH"

    # Attack on the right channel only. This case previously repeated the
    # left-only attack, so the mirrored merge (LPS + ESH) was never exercised.
    next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
    out2 = SSC(frame_t, next2_t, "ESH")
    assert out2 == "ESH"
|
|
||||||
|
|
||||||
def test_threshold_s2l_must_exceed_1e_3() -> None:
    """
    Spec: the next frame is ESH only if s2l > 1e-3 AND ds2l > 10 for some l in 1..7.

    This checks that the s2l threshold is necessary:
      - the next frame carries a single impulse of amplitude 0.01 per channel,
        so s2l ~= 1e-4 < 1e-3;
      - it must not be classified as ESH, hence prev=OLS returns OLS.
    """
    current = np.zeros((2048, 2), dtype=np.float64)
    upcoming = _next_frame_below_s2l_threshold(left=True, right=True, impulse_amp=0.01)

    assert SSC(current, upcoming, "OLS") == "OLS"
|
|
||||||
@ -1,235 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from level_1.level_1 import FrameType, WinType, filter_bank, i_filter_bank
|
|
||||||
|
|
||||||
# Helper "fixtures" for filterbank
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _ola_reconstruct(x: np.ndarray, frame_types: list[str], win_type: str) -> np.ndarray:
    """
    Run analysis followed by synthesis on every frame and overlap-add the results.

    x:           signal of shape (N, 2)
    frame_types: one frame type per frame; frame i starts at sample i*1024
    win_type:    window type forwarded to filter_bank / i_filter_bank

    Returns an array shaped like x holding the overlap-added reconstruction.
    """
    hop_len = 1024
    frame_len = 2048

    out = np.zeros_like(x, dtype=np.float64)

    for idx, ftype in enumerate(frame_types):
        # Frames are 2048 samples long and advance by one hop (50% overlap).
        seg = slice(idx * hop_len, idx * hop_len + frame_len)
        spectrum = filter_bank(x[seg, :], ftype, win_type)
        out[seg, :] += i_filter_bank(spectrum, ftype, win_type)

    return out
|
|
||||||
|
|
||||||
|
|
||||||
def _snr_db(x: np.ndarray, y: np.ndarray) -> float:
    """
    Return the overall SNR in dB of reconstruction y against reference x.

    Signal power is sum(x*x) and noise power is sum((x-y)*(x-y)), both taken
    over all elements.

    Returns:
        +inf if the noise power is zero (exact reconstruction),
        -inf if the reference carries no energy while the error does,
        otherwise 10*log10(signal_power / noise_power) as a Python float.
    """
    err = x - y
    ps = float(np.sum(x * x))
    pn = float(np.sum(err * err))
    if pn <= 0.0:
        return float("inf")
    if ps <= 0.0:
        # Silent reference: avoid log10(0) and its RuntimeWarning.
        return -float("inf")
    # np.log10 yields np.float64; convert to honor the annotated return type.
    return float(10.0 * np.log10(ps / pn))
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------
|
|
||||||
# Forward filterbank tests
|
|
||||||
# ---------------------------------------------------------------------
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None:
    """
    Contract test:
    for the long-sequence types OLS/LSS/LPS, filter_bank maps a (2048, 2)
    frame to a spectrum of shape (1024, 2).
    """
    silent = np.zeros((2048, 2), dtype=np.float64)
    assert filter_bank(silent, frame_type, win_type).shape == (1024, 2)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test:
    for ESH, filter_bank maps a (2048, 2) frame to a spectrum of shape (128, 16).
    """
    silent = np.zeros((2048, 2), dtype=np.float64)
    assert filter_bank(silent, "ESH", win_type).shape == (128, 16)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None:
    """
    Module behavior test:
    for OLS (standing in for all long sequences) the channels must be
    processed independently: with a random left channel and an all-zero
    right channel, the right spectrum has to stay near zero.
    """
    gen = np.random.default_rng(0)
    stereo = np.zeros((2048, 2), dtype=np.float64)
    stereo[:, 0] = gen.normal(size=2048)

    # Column 1 of the long-sequence spectrum is the right channel.
    right_spectrum = filter_bank(stereo, "OLS", win_type)[:, 1]

    assert np.max(np.abs(right_spectrum)) < 1e-9
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_esh(win_type: WinType) -> None:
    """
    Module behavior test:
    for ESH the channels must be processed independently: with a random left
    channel and an all-zero right channel, every odd (right-channel) column
    of the spectrum has to stay near zero.
    """
    gen = np.random.default_rng(1)
    stereo = np.zeros((2048, 2), dtype=np.float64)
    stereo[:, 0] = gen.normal(size=2048)

    # The right channel occupies columns 1, 3, 5, ..., 15.
    right_spectra = filter_bank(stereo, "ESH", win_type)[:, 1::2]

    assert np.max(np.abs(right_spectra)) < 1e-9
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None:
    """
    Spec-driven behavior test:
    ESH uses only the central 1152 samples (indices 448 to 1599), split into
    8 windows of length 256 with 50% overlap.

    Changing samples outside [448, 1600) must therefore leave the output
    bit-for-bit unchanged.
    """
    gen = np.random.default_rng(2)

    # Reference frame: random central region, silent outer regions.
    core = gen.normal(size=(1152, 2))
    reference = np.zeros((2048, 2), dtype=np.float64)
    reference[448:1600, :] = core

    # Perturbed frame: same central region, random outer regions.
    perturbed = np.zeros((2048, 2), dtype=np.float64)
    perturbed[448:1600, :] = core
    perturbed[0:448, :] = gen.normal(size=(448, 2))
    perturbed[1600:2048, :] = gen.normal(size=(448, 2))

    spec_reference = filter_bank(reference, "ESH", win_type)
    spec_perturbed = filter_bank(perturbed, "ESH", win_type)

    # Zero tolerances: the spectra must match exactly.
    np.testing.assert_allclose(spec_reference, spec_perturbed, rtol=0.0, atol=0.0)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_output_is_finite(win_type: WinType) -> None:
    """
    Sanity test:
    the spectrum of a random frame must be free of NaN and inf for every
    frame type.
    """
    gen = np.random.default_rng(3)
    noise = gen.normal(size=(2048, 2)).astype(np.float64)

    for ftype in ("OLS", "LSS", "ESH", "LPS"):
        spectrum = filter_bank(noise, ftype, win_type)
        assert np.isfinite(spectrum).all()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------
|
|
||||||
# Reverse i_filterbank tests
|
|
||||||
# ---------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_long_sequences(win_type: str) -> None:
    """
    Contract test:
    for OLS/LSS/LPS, i_filter_bank maps a (1024, 2) spectrum to a (2048, 2) frame.
    """
    spectrum = np.zeros((1024, 2), dtype=np.float64)
    for ftype in ("OLS", "LSS", "LPS"):
        assert i_filter_bank(spectrum, ftype, win_type).shape == (2048, 2)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_esh(win_type: str) -> None:
    """
    Contract test:
    for ESH, i_filter_bank maps a (128, 16) spectrum to a (2048, 2) frame.
    """
    spectrum = np.zeros((128, 16), dtype=np.float64)
    assert i_filter_bank(spectrum, "ESH", win_type).shape == (2048, 2)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_roundtrip_per_frame_is_finite(win_type: str) -> None:
    """
    Sanity test:
    analysis followed by synthesis of a single random frame must give a
    result free of NaN and inf for every frame type.
    """
    gen = np.random.default_rng(0)
    noise = gen.normal(size=(2048, 2)).astype(np.float64)

    for ftype in ("OLS", "LSS", "ESH", "LPS"):
        spectrum = filter_bank(noise, ftype, win_type)
        rebuilt = i_filter_bank(spectrum, ftype, win_type)
        assert np.isfinite(rebuilt).all()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_ols_high_snr(win_type: str) -> None:
    """
    Core module-level test:
    OLS analysis+synthesis with hop=1024 must reconstruct the signal with
    high SNR in the steady-state region.
    """
    gen = np.random.default_rng(1)

    n_frames = 6
    total = 1024 * (n_frames + 1)
    signal = gen.normal(size=(total, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, ["OLS"] * n_frames, win_type)

    # Drop the first and last hop, where full overlap is not available.
    steady = slice(1024, total - 1024)
    assert _snr_db(signal[steady, :], rebuilt[steady, :]) > 50.0
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_esh_high_snr(win_type: str) -> None:
    """
    ESH analysis+synthesis with hop=1024 must reconstruct the signal with
    high SNR in the steady-state region.
    """
    gen = np.random.default_rng(2)

    n_frames = 6
    total = 1024 * (n_frames + 1)
    signal = gen.normal(size=(total, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, ["ESH"] * n_frames, win_type)

    # Only the samples between the first and last hop are compared.
    steady = slice(1024, total - 1024)
    assert _snr_db(signal[steady, :], rebuilt[steady, :]) > 45.0
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_transition_sequence(win_type: str) -> None:
    """
    Transition sequence test matching the windowing logic:
      OLS -> LSS -> ESH -> LPS -> OLS -> OLS
    """
    gen = np.random.default_rng(3)

    sequence = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"]
    total = 1024 * (len(sequence) + 1)
    signal = gen.normal(size=(total, 2)).astype(np.float64)

    rebuilt = _ola_reconstruct(signal, sequence, win_type)

    # Only the samples between the first and last hop are compared.
    steady = slice(1024, total - 1024)
    assert _snr_db(signal[steady, :], rebuilt[steady, :]) > 40.0
|
|
||||||
21
source/level_2/level_2.py
Normal file
21
source/level_2/level_2.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Level 2 Wrappers + Demo
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Level 2 wrapper module.
|
||||||
|
#
|
||||||
|
# This file provides:
|
||||||
|
# - Thin wrappers for Level 2 API functions (encode/decode) that delegate
|
||||||
|
# to the corresponding core implementations.
|
||||||
|
# - A demo function that runs end-to-end and computes SNR.
|
||||||
|
# - A small CLI entrypoint for convenience.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
4
source/pytest.ini
Normal file
4
source/pytest.ini
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
[pytest]
|
||||||
|
pythonpath = .
|
||||||
|
testpaths =
|
||||||
|
core/tests
|
||||||
Loading…
x
Reference in New Issue
Block a user