Level 2: Core functionality and level_2 script wrappers added

This commit is contained in:
Christos Choutouridis 2026-02-08 19:43:26 +02:00
parent 399abebd2c
commit 9931e3830a
39 changed files with 4720 additions and 913 deletions

View File

@ -30,6 +30,7 @@ import soundfile as sf
from core.aac_configuration import WIN_TYPE from core.aac_configuration import WIN_TYPE
from core.aac_filterbank import aac_filter_bank from core.aac_filterbank import aac_filter_bank
from core.aac_ssc import aac_SSC from core.aac_ssc import aac_SSC
from core.aac_tns import aac_tns
from core.aac_types import * from core.aac_types import *
@ -144,8 +145,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
AACSeq1 AACSeq1
List of encoded frames (Level 1 schema). List of encoded frames (Level 1 schema).
""" """
x, fs = aac_read_wav_stereo_48k(filename_in) x, _ = aac_read_wav_stereo_48k(filename_in)
_ = fs # kept for clarity; The assignment assumes 48 kHz # The assignment assumes 48 kHz
hop = 1024 hop = 1024
win = 2048 win = 2048
@ -196,3 +197,88 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
prev_frame_type = frame_type prev_frame_type = frame_type
return aac_seq return aac_seq
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
    """
    Level-2 AAC encoder: Level-1 analysis followed by per-channel TNS.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).

    Returns
    -------
    AACSeq2
        Encoded AAC sequence (Level 2 payload schema). For each frame i:
          - "frame_type": FrameType
          - "win_type": WinType
          - "chl"/"chr":
              - "frame_F": FrameChannelF (after TNS)
              - "tns_coeffs": TnsCoeffs
    """
    filename_in = Path(filename_in)
    samples, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment fixes the sample rate at 48 kHz, so fs is ignored.
    hop, win = 1024, 2048
    # One hop of leading and trailing zero padding (same scheme as Level 1).
    zeros = np.zeros((hop, 2), dtype=np.float64)
    padded = np.vstack([zeros, samples, zeros])
    n_frames = int((padded.shape[0] - win) // hop + 1)
    if n_frames <= 0:
        raise ValueError("Input too short for framing.")
    encoded: AACSeq2 = []
    prev_type: FrameType = "OLS"
    for idx in range(n_frames):
        offset = idx * hop
        cur = padded[offset : offset + win, :]
        if cur.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")
        # Look-ahead frame for the sequence segmentation control; zero-extend
        # when it runs past the padded signal.
        nxt = padded[offset + hop : offset + hop + win, :]
        missing = win - nxt.shape[0]
        if missing > 0:
            nxt = np.vstack([nxt, np.zeros((missing, 2), dtype=np.float64)])
        cur_type = aac_SSC(cur, nxt, prev_type)
        # Level-1 analysis produces a packed stereo MDCT container.
        stereo_f = aac_filter_bank(cur, cur_type, WIN_TYPE)
        # Unpack the stereo container into per-channel arrays.
        if cur_type == "ESH":
            # ESH interleaves channels column-wise: even columns = left,
            # odd columns = right (8 short subframes each).
            left = np.ascontiguousarray(stereo_f[:, 0::2], dtype=np.float64)
            right = np.ascontiguousarray(stereo_f[:, 1::2], dtype=np.float64)
        else:
            left = stereo_f[:, 0:1].astype(np.float64, copy=False)
            right = stereo_f[:, 1:2].astype(np.float64, copy=False)
        # Level 2: TNS runs independently per channel.
        left_tns, left_coeffs = aac_tns(left, cur_type)
        right_tns, right_coeffs = aac_tns(right, cur_type)
        encoded.append(
            {
                "frame_type": cur_type,
                "win_type": WIN_TYPE,
                "chl": {"frame_F": left_tns, "tns_coeffs": left_coeffs},
                "chr": {"frame_F": right_tns, "tns_coeffs": right_coeffs},
            }
        )
        prev_type = cur_type
    return encoded

View File

@ -17,6 +17,15 @@ from __future__ import annotations
# Imports # Imports
from core.aac_types import WinType from core.aac_types import WinType
# Filterbank
# ------------------------------------------------------------
# Window type # Window type
# Options: "SIN", "KBD" # Options: "SIN", "KBD"
WIN_TYPE: WinType = "SIN" WIN_TYPE: WinType = "SIN"
# TNS
# ------------------------------------------------------------
PRED_ORDER = 4
QUANT_STEP = 0.1
QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7]

View File

@ -28,6 +28,7 @@ from typing import Union
import soundfile as sf import soundfile as sf
from core.aac_filterbank import aac_i_filter_bank from core.aac_filterbank import aac_i_filter_bank
from core.aac_tns import aac_i_tns
from core.aac_types import * from core.aac_types import *
@ -164,3 +165,93 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-2 AAC decoder (inverse of aac_coder_2).

    Behavior matches Level 1 decoder pipeline, with additional iTNS stage:
      - Per frame/channel: inverse TNS using stored coefficients
      - Re-pack to stereo frame_F
      - IMDCT + windowing
      - Overlap-add over frames
      - Remove Level-1 padding (hop samples start/end)
      - Write output WAV (48 kHz)

    Parameters
    ----------
    aac_seq_2 : AACSeq2
        Encoded sequence as produced by aac_coder_2().
    filename_out : Union[str, Path]
        Output WAV filename.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
    """
    filename_out = Path(filename_out)
    hop = 1024
    win = 2048
    K = len(aac_seq_2)
    if K <= 0:
        raise ValueError("aac_seq_2 must contain at least one frame.")
    # Overlap-add accumulator sized for K hop-spaced frames of length win.
    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)
    for i, fr in enumerate(aac_seq_2):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]
        chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
        chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
        chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)
        # Inverse TNS per channel (undoes the encoder's FIR shaping).
        chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
        chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)
        # Re-pack to the stereo container expected by aac_i_filter_bank.
        if frame_type == "ESH":
            if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
                raise ValueError("ESH channel frame_F must have shape (128, 8).")
            # ESH stereo layout: even columns left, odd columns right.
            frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
            for j in range(8):
                frame_f[:, 2 * j + 0] = chl_f[:, j]
                frame_f[:, 2 * j + 1] = chr_f[:, j]
        else:
            # Accept either (1024,1) or (1024,) from your internal convention.
            if chl_f.shape == (1024,):
                chl_col = chl_f.reshape(1024, 1)
            elif chl_f.shape == (1024, 1):
                chl_col = chl_f
            else:
                raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")
            if chr_f.shape == (1024,):
                chr_col = chr_f.reshape(1024, 1)
            elif chr_f.shape == (1024, 1):
                chr_col = chr_f
            else:
                raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")
            frame_f = np.empty((1024, 2), dtype=np.float64)
            frame_f[:, 0] = chl_col[:, 0]
            frame_f[:, 1] = chr_col[:, 0]
        # IMDCT + synthesis window, then overlap-add at the frame's hop offset.
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
        start = i * hop
        y_pad[start : start + win, :] += frame_t_hat
    # Drop the encoder's hop-sized zero padding from both ends.
    y = aac_remove_padding(y_pad, hop=hop)
    sf.write(str(filename_out), y, 48000)
    return y

60
source/core/aac_snr_db.py Normal file
View File

@ -0,0 +1,60 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - SNR dB calculator
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements SNR calculation in dB
# ------------------------------------------------------------
from __future__ import annotations
from core.aac_types import StereoSignal
import numpy as np
def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
    """
    Overall SNR (dB) over all samples and channels after aligning lengths.

    Parameters
    ----------
    x_ref : StereoSignal
        Reference stereo stream.
    x_hat : StereoSignal
        Reconstructed stereo stream.

    Returns
    -------
    float
        SNR in dB.
        - Returns +inf if noise power is zero.
        - Returns -inf if signal power is zero.
    """
    ref = np.asarray(x_ref, dtype=np.float64)
    hat = np.asarray(x_hat, dtype=np.float64)
    # Promote 1-D inputs to single-channel columns.
    if ref.ndim == 1:
        ref = ref.reshape(-1, 1)
    if hat.ndim == 1:
        hat = hat.reshape(-1, 1)
    # Compare only the overlapping sample/channel region.
    rows = min(ref.shape[0], hat.shape[0])
    cols = min(ref.shape[1], hat.shape[1])
    ref = ref[:rows, :cols]
    hat = hat[:rows, :cols]
    noise = ref - hat
    p_signal = float(np.sum(ref * ref))
    p_noise = float(np.sum(noise * noise))
    # Zero noise wins over zero signal: identical (even all-zero) inputs -> +inf.
    if p_noise <= 0.0:
        return float("inf")
    if p_signal <= 0.0:
        return float("-inf")
    return float(10.0 * np.log10(p_signal / p_noise))

549
source/core/aac_tns.py Normal file
View File

@ -0,0 +1,549 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Temporal Noise Shaping (TNS)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Temporal Noise Shaping (TNS) module (Level 2).
#
# Public API:
# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type)
# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs)
#
# Notes (per assignment):
# - TNS is applied per channel (not stereo).
# - For ESH, TNS is applied independently to each of the 8 short subframes.
# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short)
# provided in TableB219.mat.
# - Predictor order is fixed to p = 4.
# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1.
# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Tuple
import numpy as np
from scipy.io import loadmat
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
from core.aac_types import *
# -----------------------------------------------------------------------------
# Private helpers
# -----------------------------------------------------------------------------
# Module-level cache: the .mat file is parsed at most once per process.
_B219_CACHE: dict[str, FloatArray] | None = None


def _load_b219_tables() -> dict[str, FloatArray]:
    """
    Load TableB219.mat and cache the contents.

    The project layout guarantees that a 'material' directory is discoverable
    from the current working directory (tests and level_123 entrypoints).

    Returns
    -------
    dict[str, FloatArray]
        Keys:
          - "B219a": long bands table (for K=1024 MDCT lines)
          - "B219b": short bands table (for K=128 MDCT lines)
    """
    global _B219_CACHE
    if _B219_CACHE is not None:
        return _B219_CACHE
    # NOTE(review): resolved relative to the CWD, not this module's location —
    # assumes callers run from the project root. Confirm against entrypoints.
    mat_path = Path("material") / "TableB219.mat"
    if not mat_path.exists():
        raise FileNotFoundError("Could not locate material/TableB219.mat in the current working directory.")
    d = loadmat(str(mat_path))
    if "B219a" not in d or "B219b" not in d:
        raise ValueError("TableB219.mat missing required variables B219a and/or B219b.")
    # Store as float64 so downstream index extraction is deterministic.
    _B219_CACHE = {
        "B219a": np.asarray(d["B219a"], dtype=np.float64),
        "B219b": np.asarray(d["B219b"], dtype=np.float64),
    }
    return _B219_CACHE
def _band_ranges_for_kcount(k_count: int) -> BandRanges:
    """
    Bark band index ranges [start, end] (inclusive) for the given MDCT line count.

    Parameters
    ----------
    k_count : int
        Number of MDCT lines:
          - 1024 for long frames
          - 128 for short subframes (ESH)

    Returns
    -------
    BandRanges (list[tuple[int, int]])
        Each tuple is (start_k, end_k) inclusive.
    """
    tables = _load_b219_tables()
    # Column 1 holds band start indices, column 2 band end indices.
    key = {1024: "B219a", 128: "B219b"}.get(k_count)
    if key is None:
        raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).")
    tbl = tables[key]
    ranges: list[tuple[int, int]] = [
        (int(lo), int(hi)) for lo, hi in zip(tbl[:, 1].astype(int), tbl[:, 2].astype(int))
    ]
    # Sanity-check the table against the MDCT line count.
    for lo, hi in ranges:
        if lo < 0 or hi < lo or hi >= k_count:
            raise ValueError("Invalid band table ranges for given k_count.")
    return ranges
# -----------------------------------------------------------------------------
# Core DSP helpers
# -----------------------------------------------------------------------------
def _smooth_sw_inplace(sw: MdctCoeffs) -> None:
"""
Smooth Sw(k) to reduce discontinuities between adjacent Bark bands.
The assignment applies two passes:
- Backward: Sw(k) = (Sw(k) + Sw(k+1))/2
- Forward: Sw(k) = (Sw(k) + Sw(k-1))/2
Parameters
----------
sw : MdctCoeffs
1-D array of length K (float64). Modified in-place.
"""
k_count = int(sw.shape[0])
for k in range(k_count - 2, -1, -1):
sw[k] = 0.5 * (sw[k] + sw[k + 1])
for k in range(1, k_count):
sw[k] = 0.5 * (sw[k] + sw[k - 1])
def _compute_sw(x: MdctCoeffs) -> MdctCoeffs:
    """
    Compute the smoothed spectral envelope Sw(k) from Bark-band energies.

    Each MDCT line inside band j receives sqrt(P(j)), where P(j) is the band's
    energy; the result is then boundary-smoothed via _smooth_sw_inplace.

    Parameters
    ----------
    x : MdctCoeffs
        1-D MDCT line array, length K.

    Returns
    -------
    MdctCoeffs
        Sw(k), 1-D array of length K, float64.
    """
    vec = np.asarray(x, dtype=np.float64).reshape(-1)
    length = int(vec.shape[0])
    sw = np.zeros(length, dtype=np.float64)
    for lo, hi in _band_ranges_for_kcount(length):
        band = vec[lo : hi + 1]
        # Band energy -> constant envelope value across the band.
        sw[lo : hi + 1] = float(np.sqrt(float(np.sum(band * band))))
    _smooth_sw_inplace(sw)
    return sw
def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs:
"""
Autocorrelation r(m) for m=0..p.
Parameters
----------
x : MdctCoeffs
1-D signal.
p : int
Maximum lag.
Returns
-------
MdctCoeffs
r, shape (p+1,), float64.
"""
x = np.asarray(x, dtype=np.float64).reshape(-1)
n = int(x.shape[0])
r = np.zeros(p + 1, dtype=np.float64)
for m in range(p + 1):
r[m] = float(np.dot(x[m:], x[: n - m]))
return r
def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs:
    """
    Order-p LPC coefficients via the Yule-Walker normal equations.

    Parameters
    ----------
    xw : MdctCoeffs
        1-D normalized sequence Xw(k).
    p : int
        Predictor order.

    Returns
    -------
    MdctCoeffs
        LPC coefficients a[0..p-1], shape (p,), float64.
    """
    r = _autocorr(xw, p)
    # Symmetric Toeplitz system: R[i, j] = r(|i - j|).
    lag = np.arange(p)
    R = r[np.abs(lag[:, None] - lag[None, :])]
    rhs = r[1 : p + 1]
    # Small diagonal loading keeps the solve well-posed when the
    # autocorrelation matrix is (near-)singular.
    return np.linalg.solve(R + 1e-12 * np.eye(p, dtype=np.float64), rhs)
def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs:
    """
    Uniform symmetric quantization of LPC coefficients.

    Values are snapped to the QUANT_STEP grid and clamped to
    [-QUANT_MAX, +QUANT_MAX] (4-bit symmetric quantizer per the assignment).

    Parameters
    ----------
    a : MdctCoeffs
        LPC coefficient array, shape (p,).

    Returns
    -------
    MdctCoeffs
        Quantized coefficients, shape (p,), float64.
    """
    vec = np.asarray(a, dtype=np.float64).reshape(-1)
    on_grid = QUANT_STEP * np.round(vec / QUANT_STEP)
    clamped = np.clip(on_grid, -QUANT_MAX, QUANT_MAX)
    return clamped.astype(np.float64, copy=False)
def _is_inverse_stable(a_q: MdctCoeffs) -> bool:
"""
Check stability of the inverse TNS filter H_TNS^{-1}.
Forward filter:
H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
Inverse filter poles are roots of:
A(z) = 1 - a1 z^-1 - ... - ap z^-p
Multiply by z^p:
z^p - a1 z^{p-1} - ... - ap = 0
Stability condition:
all roots satisfy |z| < 1.
Parameters
----------
a_q : MdctCoeffs
Quantized predictor coefficients, shape (p,).
Returns
-------
bool
True if stable, else False.
"""
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
# Polynomial in z: z^p - a1 z^{p-1} - ... - ap
poly = np.empty(p + 1, dtype=np.float64)
poly[0] = 1.0
poly[1:] = -a_q
roots = np.roots(poly)
# Strictly inside unit circle for stability. Add tiny margin for numeric safety.
margin = 1e-12
return bool(np.all(np.abs(roots) < (1.0 - margin)))
def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs:
    """
    Ensure quantized predictor coefficients yield a stable inverse filter.

    Policy:
      - Stable already: return unchanged.
      - Otherwise shrink by progressively smaller factors, re-quantizing to
        the coefficient grid each time, until a stable candidate is found.
      - If nothing works: return all zeros (TNS disabled for this vector).

    Parameters
    ----------
    a_q : MdctCoeffs
        Quantized predictor coefficients, shape (p,).

    Returns
    -------
    MdctCoeffs
        Stable quantized coefficients, shape (p,).
    """
    coeffs = np.asarray(a_q, dtype=np.float64).reshape(-1)
    if _is_inverse_stable(coeffs):
        return coeffs
    # Re-quantize after each shrink so results stay on the QUANT_STEP grid.
    for gamma in (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1):
        candidate = _quantize_coeffs(gamma * coeffs)
        if _is_inverse_stable(candidate):
            return candidate
    # Last resort: an all-zero predictor is always stable (identity filter).
    return np.zeros_like(coeffs, dtype=np.float64)
def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
"""
Apply forward TNS FIR filter:
y[k] = x[k] - sum_{l=1..p} a_l * x[k-l]
Parameters
----------
x : MdctCoeffs
1-D MDCT lines, length K.
a_q : MdctCoeffs
Quantized LPC coefficients, shape (p,).
Returns
-------
MdctCoeffs
Filtered MDCT lines y, length K.
"""
x = np.asarray(x, dtype=np.float64).reshape(-1)
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
k_count = int(x.shape[0])
y = np.zeros(k_count, dtype=np.float64)
for k in range(k_count):
acc = x[k]
for l in range(1, p + 1):
if k - l >= 0:
acc -= a_q[l - 1] * x[k - l]
y[k] = acc
return y
def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
"""
Apply inverse TNS IIR filter:
x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l]
Parameters
----------
y : MdctCoeffs
1-D MDCT lines after TNS, length K.
a_q : MdctCoeffs
Quantized LPC coefficients, shape (p,).
Returns
-------
MdctCoeffs
Reconstructed MDCT lines x_hat, length K.
"""
y = np.asarray(y, dtype=np.float64).reshape(-1)
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
k_count = int(y.shape[0])
x_hat = np.zeros(k_count, dtype=np.float64)
for k in range(k_count):
acc = y[k]
for l in range(1, p + 1):
if k - l >= 0:
acc += a_q[l - 1] * x_hat[k - l]
x_hat[k] = acc
return x_hat
def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]:
    """
    TNS for a single MDCT vector (one long frame or one short subframe).

    Pipeline:
      1) Smoothed Bark-band envelope Sw(k).
      2) Envelope normalization Xw(k) = X(k) / Sw(k) (zero where Sw ~ 0).
      3) LPC of order PRED_ORDER on Xw.
      4) Quantization (step QUANT_STEP) + stabilization of the inverse.
      5) Forward FIR on the ORIGINAL X(k) using the quantized coefficients.

    Parameters
    ----------
    x : MdctCoeffs
        1-D MDCT vector.

    Returns
    -------
    y : MdctCoeffs
        TNS-processed MDCT vector (same length).
    a_q : MdctCoeffs
        Quantized LPC coefficients, shape (PRED_ORDER,).
    """
    vec = np.asarray(x, dtype=np.float64).reshape(-1)
    envelope = _compute_sw(vec)
    # Guard against division by a vanishing envelope.
    tiny = 1e-12
    normalized = np.where(envelope > tiny, vec / envelope, 0.0)
    raw = _lpc_coeffs(normalized, PRED_ORDER)
    quantized = _quantize_coeffs(raw)
    # Inverse-filter stability is an assignment requirement.
    quantized = _stabilize_quantized_coeffs(quantized)
    return _apply_tns_fir(vec, quantized), quantized
# -----------------------------------------------------------------------------
# Public Functions (Level 2)
# -----------------------------------------------------------------------------
def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]:
    """
    Temporal Noise Shaping (TNS) for ONE channel.

    Parameters
    ----------
    frame_F_in : FrameChannelF
        Per-channel MDCT coefficients.
        Expected shapes:
          - If frame_type == "ESH": (128, 8)
          - Else: (1024, 1) or (1024,)
    frame_type : FrameType
        Frame type code ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    frame_F_out : FrameChannelF
        Per-channel MDCT coefficients after applying TNS.
        Same shape convention as input.
    tns_coeffs : TnsCoeffs
        Quantized TNS predictor coefficients:
          - If frame_type == "ESH": (PRED_ORDER, 8)
          - Else: (PRED_ORDER, 1)
    """
    data = np.asarray(frame_F_in, dtype=np.float64)
    if frame_type == "ESH":
        if data.shape != (128, 8):
            raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
        # Each of the 8 short subframes is processed independently.
        shaped = np.empty_like(data, dtype=np.float64)
        coeffs = np.empty((PRED_ORDER, 8), dtype=np.float64)
        for sub in range(8):
            shaped[:, sub], coeffs[:, sub] = _tns_one_vector(data[:, sub])
        return shaped, coeffs
    # Long frames: accept both column-vector and flat conventions.
    if data.shape not in ((1024,), (1024, 1)):
        raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
    shaped_vec, quantized = _tns_one_vector(data.reshape(-1))
    coeffs_col = quantized.reshape(PRED_ORDER, 1)
    if data.ndim == 1:
        return shaped_vec, coeffs_col
    return shaped_vec.reshape(1024, 1), coeffs_col
def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF:
    """
    Inverse Temporal Noise Shaping (iTNS) for ONE channel.

    Parameters
    ----------
    frame_F_in : FrameChannelF
        Per-channel MDCT coefficients after TNS.
        Expected shapes:
          - If frame_type == "ESH": (128, 8)
          - Else: (1024, 1) or (1024,)
    frame_type : FrameType
        Frame type code ("OLS", "LSS", "ESH", "LPS").
    tns_coeffs : TnsCoeffs
        Quantized TNS predictor coefficients:
          - If frame_type == "ESH": (PRED_ORDER, 8)
          - Else: (PRED_ORDER, 1)

    Returns
    -------
    FrameChannelF
        Per-channel MDCT coefficients after inverse TNS.
        Same shape convention as input frame_F_in.
    """
    data = np.asarray(frame_F_in, dtype=np.float64)
    coeffs = np.asarray(tns_coeffs, dtype=np.float64)
    if frame_type == "ESH":
        if data.shape != (128, 8):
            raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
        if coeffs.shape != (PRED_ORDER, 8):
            raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).")
        # Undo TNS subframe by subframe with the matching coefficient column.
        restored = np.empty_like(data, dtype=np.float64)
        for sub in range(8):
            restored[:, sub] = _apply_itns_iir(data[:, sub], coeffs[:, sub])
        return restored
    if coeffs.shape != (PRED_ORDER, 1):
        raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).")
    if data.shape not in ((1024,), (1024, 1)):
        raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
    restored_vec = _apply_itns_iir(data.reshape(-1), coeffs[:, 0])
    # Preserve the caller's shape convention.
    return restored_vec if data.ndim == 1 else restored_vec.reshape(1024, 1)

View File

@ -10,7 +10,6 @@
# #
# Description: # Description:
# This module implements Public Type aliases # This module implements Public Type aliases
#
# ------------------------------------------------------------ # ------------------------------------------------------------
from __future__ import annotations from __future__ import annotations
@ -39,7 +38,7 @@ Window type codes (AAC):
""" """
ChannelKey: TypeAlias = Literal["chl", "chr"] ChannelKey: TypeAlias = Literal["chl", "chr"]
"""Channel dictionary keys used in Level 1 payloads.""" """Channel dictionary keys used in Level payloads."""
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -105,6 +104,40 @@ Examples:
dtype: float64 dtype: float64
""" """
MdctFrameChannel: TypeAlias = FloatArray
"""
Per-channel MDCT container used in Level-1/2 sequences.
Typical shapes:
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
dtype: float64
Notes
-----
Some parts of the assignment store long-frame coefficients as a column vector
(1024, 1) to match MATLAB conventions. Internally you may also use (1024,)
when convenient, but the semantic meaning is identical.
"""
TnsCoeffs: TypeAlias = FloatArray
"""
Quantized TNS predictor coefficients (one channel).
Typical shapes (Level 2):
- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes)
- Else: (4, 1) (order p=4 for the long frame)
dtype: float64
Notes
-----
The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1.
We store the quantized coefficient values as float64 (typically multiples of 0.1)
to keep the pipeline simple and readable.
"""
FrameT: TypeAlias = FloatArray FrameT: TypeAlias = FloatArray
""" """
@ -142,17 +175,23 @@ Rationale for ESH (128, 16):
dtype: float64 dtype: float64
""" """
FrameChannelF: TypeAlias = FloatArray FrameChannelF: TypeAlias = MdctFrameChannel
""" """
Frequency-domain single-channel frame (MDCT coefficients). Frequency-domain single-channel MDCT coefficients.
Typical shapes (Level 1): Typical shapes (Level 1/2):
- If frame_type in {"OLS","LSS","LPS"}: (1024,) - If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) - If frame_type == "ESH": (128, 8)
dtype: float64 dtype: float64
""" """
# Inclusive [start, end] MDCT-line index pairs, one per Bark band.
BandRanges: TypeAlias = list[tuple[int, int]]
"""
Bark-band index ranges [start, end] (inclusive) for MDCT lines.
Used by TNS to map MDCT indices k to Bark bands.
"""
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Level 1 AAC sequence payload types # Level 1 AAC sequence payload types
@ -168,7 +207,7 @@ class AACChannelFrameF(TypedDict):
The MDCT coefficients for ONE channel. The MDCT coefficients for ONE channel.
Typical shapes: Typical shapes:
- ESH: (128, 8) (8 short subframes) - ESH: (128, 8) (8 short subframes)
- else: (1024, ) - else: (1024, 1) or (1024,)
""" """
frame_F: FrameChannelF frame_F: FrameChannelF
@ -191,3 +230,53 @@ List of length K (K = number of frames).
Each element is a dict with keys: Each element is a dict with keys:
- "frame_type", "win_type", "chl", "chr" - "frame_type", "win_type", "chl", "chr"
""" """
# -----------------------------------------------------------------------------
# Level 2 AAC sequence payload types (TNS)
# -----------------------------------------------------------------------------
class AACChannelFrameF2(TypedDict):
    """
    Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2).

    Keys
    ----
    frame_F:
        The TNS-processed MDCT coefficients for ONE channel.
        Typical shapes:
          - ESH: (128, 8)
          - else: (1024, 1) or (1024,)
    tns_coeffs:
        Quantized TNS predictor coefficients for ONE channel.
        Typical shapes:
          - ESH: (PRED_ORDER, 8)
          - else: (PRED_ORDER, 1)
    """
    frame_F: FrameChannelF
    tns_coeffs: TnsCoeffs


class AACSeq2Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_2 (Level 2).
    Same keys as the Level-1 frame, with per-channel TNS payloads.
    """
    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF2
    chr: AACChannelFrameF2


AACSeq2: TypeAlias = List[AACSeq2Frame]
"""
AAC sequence for Level 2:
List of length K (K = number of frames).
Each element is a dict with keys:
  - "frame_type", "win_type", "chl", "chr"
Level 2 adds:
  - per-channel "tns_coeffs"
and stores:
  - per-channel "frame_F" after applying TNS.
"""

View File

@ -19,59 +19,15 @@ import numpy as np
import pytest import pytest
import soundfile as sf import soundfile as sf
from core.aac_coder import aac_coder_1 from core.aac_coder import aac_coder_1, aac_coder_2, aac_read_wav_stereo_48k
from core.aac_decoder import aac_decoder_1 from core.aac_decoder import aac_decoder_1, aac_decoder_2, aac_remove_padding
from core.aac_types import * from core.aac_types import *
from core.aac_snr_db import snr_db
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1 # Helper "fixtures" for aac_coder_1 / i_aac_coder_1
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
"""
Compute overall SNR (dB) over all samples and channels after aligning lengths.
Parameters
----------
x_ref : StereoSignal
Reference signal, shape (N, 2) typical.
x_hat : StereoSignal
Reconstructed signal, shape (M, 2) typical.
Returns
-------
float
SNR in dB.
- Returns +inf if noise power is zero.
- Returns -inf if signal power is zero.
"""
x_ref = np.asarray(x_ref, dtype=np.float64)
x_hat = np.asarray(x_hat, dtype=np.float64)
# Be conservative: align lengths and common channels.
if x_ref.ndim == 1:
x_ref = x_ref.reshape(-1, 1)
if x_hat.ndim == 1:
x_hat = x_hat.reshape(-1, 1)
n = min(x_ref.shape[0], x_hat.shape[0])
c = min(x_ref.shape[1], x_hat.shape[1])
x_ref = x_ref[:n, :c]
x_hat = x_hat[:n, :c]
err = x_ref - x_hat
ps = float(np.sum(x_ref * x_ref))
pn = float(np.sum(err * err))
if pn <= 0.0:
return float("inf")
if ps <= 0.0:
return float("-inf")
return float(10.0 * np.log10(ps / pn))
@pytest.fixture() @pytest.fixture()
def tmp_stereo_wav(tmp_path: Path) -> Path: def tmp_stereo_wav(tmp_path: Path) -> Path:
""" """
@ -89,6 +45,56 @@ def tmp_stereo_wav(tmp_path: Path) -> Path:
return wav_path return wav_path
# -----------------------------------------------------------------------------
# Helper-function tests
# -----------------------------------------------------------------------------
def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None:
    """
    Contract test for aac_read_wav_stereo_48k():
      - Reads stereo WAV
      - Returns float64 array with shape (N,2)
      - Returns fs = 48000
    """
    x, fs = aac_read_wav_stereo_48k(tmp_stereo_wav)
    assert int(fs) == 48000
    assert isinstance(x, np.ndarray)
    assert x.dtype == np.float64
    assert x.ndim == 2
    assert x.shape[1] == 2
    assert x.shape[0] > 0


def test_aac_remove_padding_removes_hop_from_both_ends() -> None:
    """
    Contract test for aac_remove_padding():
      - Removes 'hop' samples from start and end.
    """
    hop = 1024
    n = 10000
    y_pad: StereoSignal = np.zeros((n, 2), dtype=np.float64)
    y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
    assert y.shape == (n - 2 * hop, 2)
    assert y.dtype == np.float64


def test_aac_remove_padding_errors_on_too_short_input() -> None:
    """
    aac_remove_padding must raise if y_pad is shorter than 2*hop.
    """
    hop = 1024
    # One sample short of the minimum length 2*hop.
    y_pad: StereoSignal = np.zeros((2 * hop - 1, 2), dtype=np.float64)
    with pytest.raises(ValueError):
        _ = aac_remove_padding(y_pad, hop=hop)
# -----------------------------------------------------------------------------
# Level 1 tests
# -----------------------------------------------------------------------------
def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
""" """
Module-level contract test: Module-level contract test:
@ -152,5 +158,68 @@ def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: P
assert int(fs_hat) == 48000 assert int(fs_hat) == 48000
# SNR against returned array (file should match closely, but we do not require it here). # SNR against returned array (file should match closely, but we do not require it here).
snr = _snr_db(x_ref, x_hat) snr = snr_db(x_ref, x_hat)
assert snr > 80.0 assert snr > 80.0
# -----------------------------------------------------------------------------
# Level 2 tests (new)
# -----------------------------------------------------------------------------
def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
    """
    Module-level contract test (Level 2):
    Ensure aac_seq_2 follows the expected schema and per-frame shapes, including tns_coeffs.
    """
    aac_seq: AACSeq2 = aac_coder_2(tmp_stereo_wav)
    assert isinstance(aac_seq, list)
    assert len(aac_seq) > 0
    for fr in aac_seq:
        assert "frame_type" in fr
        assert "win_type" in fr
        assert "chl" in fr
        assert "chr" in fr
        frame_type: FrameType = fr["frame_type"]
        assert frame_type in ("OLS", "LSS", "ESH", "LPS")
        for ch_key in ("chl", "chr"):
            ch = fr[ch_key]
            assert "frame_F" in ch
            assert "tns_coeffs" in ch
            frame_f = np.asarray(ch["frame_F"], dtype=np.float64)
            coeffs = np.asarray(ch["tns_coeffs"], dtype=np.float64)
            # Shapes per the Level-2 schema: (128, 8)/(4, 8) for ESH,
            # (1024, 1)/(4, 1) otherwise.
            if frame_type == "ESH":
                assert frame_f.shape == (128, 8)
                assert coeffs.shape[0] == 4
                assert coeffs.shape[1] == 8
            else:
                assert frame_f.shape == (1024, 1)
                assert coeffs.shape == (4, 1)


def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
    """
    End-to-end test (Level 2):
    Encode + decode and check SNR remains very high.

    Level 2 is still floating-point (TNS is reversible), so reconstruction
    should remain numerical-noise only.
    """
    x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
    x_ref = np.asarray(x_ref, dtype=np.float64)
    assert int(fs) == 48000
    out_wav = tmp_path / "out_l2.wav"
    aac_seq = aac_coder_2(tmp_stereo_wav)
    x_hat: StereoSignal = aac_decoder_2(aac_seq, out_wav)
    assert out_wav.exists()
    _, fs_hat = sf.read(str(out_wav), always_2d=True)
    assert int(fs_hat) == 48000
    snr = snr_db(x_ref, x_hat)
    assert snr > 75.0

View File

@ -17,6 +17,7 @@ from typing import Sequence
import pytest import pytest
from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank
from core.aac_snr_db import snr_db
from core.aac_types import * from core.aac_types import *
# Helper fixtures for filterbank # Helper fixtures for filterbank
@ -56,20 +57,6 @@ def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type
return y return y
def _snr_db(x: StereoSignal, y: StereoSignal) -> float:
"""
Compute SNR in dB over all samples/channels.
"""
err = x - y
ps = float(np.sum(x * x))
pn = float(np.sum(err * err))
if pn <= 0.0:
return float("inf")
if ps <= 0.0:
return float("-inf")
return 10.0 * float(np.log10(ps / pn))
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Forward filterbank tests # Forward filterbank tests
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -223,7 +210,7 @@ def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None:
a = 1024 a = 1024
b = N - 1024 b = N - 1024
snr = _snr_db(x[a:b, :], y[a:b, :]) snr = snr_db(x[a:b, :], y[a:b, :])
assert snr > 50.0 assert snr > 50.0
@ -244,7 +231,7 @@ def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None:
a = 1024 a = 1024
b = N - 1024 b = N - 1024
snr = _snr_db(x[a:b, :], y[a:b, :]) snr = snr_db(x[a:b, :], y[a:b, :])
assert snr > 45.0 assert snr > 45.0
@ -265,5 +252,5 @@ def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None:
a = 1024 a = 1024
b = N - 1024 b = N - 1024
snr = _snr_db(x[a:b, :], y[a:b, :]) snr = snr_db(x[a:b, :], y[a:b, :])
assert snr > 40.0 assert snr > 40.0

View File

@ -0,0 +1,98 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - SNR dB Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Basic tests for SNR calculation utility.
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
import pytest
from core.aac_snr_db import snr_db
from core.aac_types import StereoSignal
def test_snr_perfect_reconstruction_returns_inf() -> None:
    """
    A bit-exact reconstruction has zero noise power, so the SNR must be +inf.
    """
    rng = np.random.default_rng(0)
    signal: StereoSignal = rng.normal(size=(1024, 2)).astype(np.float64)
    assert snr_db(signal, signal) == float("inf")
def test_snr_zero_reference_returns_minus_inf() -> None:
    """
    A reference that is identically zero has zero signal power, so the
    SNR must be -inf (unless the noise is also zero, a degenerate case).
    """
    ref: StereoSignal = np.zeros((1024, 2), dtype=np.float64)
    hat: StereoSignal = np.ones((1024, 2), dtype=np.float64)
    assert snr_db(ref, hat) == float("-inf")
def test_snr_known_noise_level_matches_expected_value() -> None:
    """
    Deterministic check against the closed-form value

        10 * log10(Ps / Pn)

    using a constant reference (ones) and a constant additive noise term
    of known variance sigma^2.
    """
    n = 1000
    sigma = 0.1
    ref: StereoSignal = np.ones((n, 2), dtype=np.float64)
    noise = sigma * np.ones((n, 2), dtype=np.float64)
    hat: StereoSignal = ref + noise

    signal_power = float(np.sum(ref * ref))
    noise_power = float(np.sum(noise * noise))
    expected = 10.0 * np.log10(signal_power / noise_power)
    assert np.isclose(snr_db(ref, hat), expected, rtol=1e-12, atol=1e-12)
def test_snr_aligns_different_lengths_and_channels() -> None:
    """
    Mismatched lengths and channel counts must be aligned to the common
    minimum instead of raising.
    """
    rng = np.random.default_rng(1)
    ref: StereoSignal = rng.normal(size=(1000, 2)).astype(np.float64)
    hat: StereoSignal = rng.normal(size=(800, 1)).astype(np.float64)
    assert np.isfinite(snr_db(ref, hat))
def test_snr_accepts_1d_inputs() -> None:
    """
    1-D arrays are valid inputs and are treated as single-channel signals.
    """
    rng = np.random.default_rng(2)
    ref = rng.normal(size=1024).astype(np.float64)
    hat = ref + 0.01 * rng.normal(size=1024).astype(np.float64)
    assert np.isfinite(snr_db(ref, hat))

View File

@ -0,0 +1,196 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - TNS Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Temporal Noise Shaping (TNS) module (Level 2).
#
# Validates:
# - I/O shapes for long and ESH modes
# - Quantization grid and clamping of predictor coefficients
# - Inverse-filter stability (all poles inside unit circle)
# - Functional correctness: iTNS(TNS(X)) ≈ X
# ------------------------------------------------------------
from __future__ import annotations
import pytest
from core.aac_configuration import PRED_ORDER, QUANT_MAX, QUANT_STEP
from core.aac_tns import aac_tns, aac_i_tns
from core.aac_types import *
# -----------------------------------------------------------------------------
# Helper utilities
# -----------------------------------------------------------------------------
def _is_inverse_stable_from_coeffs(a_q: MdctCoeffs) -> bool:
"""
Check stability of the inverse TNS filter H_TNS^{-1}.
Poles are roots of:
z^p - a1 z^{p-1} - ... - ap = 0
Stability condition:
|root| < 1 for all roots.
"""
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
poly = np.empty(p + 1, dtype=np.float64)
poly[0] = 1.0
poly[1:] = -a_q
roots = np.roots(poly)
margin = 1e-12
return bool(np.all(np.abs(roots) < (1.0 - margin)))
def _assert_quantized_and_clamped(a_q: MdctCoeffs) -> None:
    """
    Assert the coefficients are clamped to [-QUANT_MAX, +QUANT_MAX] and
    sit exactly on the QUANT_STEP quantization grid.
    """
    values = np.asarray(a_q, dtype=np.float64)
    assert np.max(np.abs(values)) <= (QUANT_MAX + 1e-12)
    # Distance to the nearest grid point, measured in step units.
    steps = values / float(QUANT_STEP)
    assert np.max(np.abs(steps - np.round(steps))) < 1e-12
# -----------------------------------------------------------------------------
# Shape / contract tests
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_tns_shapes_long_sequences(frame_type: FrameType) -> None:
    """
    Contract test for long frames: a (1024, 1) input keeps its shape and
    yields predictor coefficients of shape (PRED_ORDER, 1).
    """
    rng = np.random.default_rng(0)
    spectrum: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64)
    shaped, coeffs = aac_tns(spectrum, frame_type)
    assert shaped.shape == spectrum.shape
    assert coeffs.shape == (PRED_ORDER, 1)
def test_tns_shapes_esh() -> None:
    """
    Contract test for ESH: a (128, 8) input keeps its shape and yields
    predictor coefficients of shape (PRED_ORDER, 8).
    """
    rng = np.random.default_rng(1)
    spectrum: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64)
    shaped, coeffs = aac_tns(spectrum, "ESH")
    assert shaped.shape == (128, 8)
    assert coeffs.shape == (PRED_ORDER, 8)
# -----------------------------------------------------------------------------
# Coefficient properties
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_tns_coeffs_quantized_clamped_and_stable_long(frame_type: FrameType) -> None:
    """
    Long-frame predictor coefficients must be on the QUANT_STEP grid,
    clamped to [-QUANT_MAX, +QUANT_MAX], and stable for inverse filtering.
    """
    rng = np.random.default_rng(2)
    spectrum: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64)
    _, coeffs = aac_tns(spectrum, frame_type)
    column: MdctCoeffs = coeffs[:, 0]
    _assert_quantized_and_clamped(column)
    assert _is_inverse_stable_from_coeffs(column)
def test_tns_coeffs_quantized_clamped_and_stable_esh() -> None:
    """
    Each of the 8 ESH subframes must independently satisfy quantization,
    clamping, and inverse-filter stability.
    """
    rng = np.random.default_rng(3)
    spectrum: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64)
    _, coeffs = aac_tns(spectrum, "ESH")
    for sub in range(8):
        column: MdctCoeffs = coeffs[:, sub]
        _assert_quantized_and_clamped(column)
        assert _is_inverse_stable_from_coeffs(column)
# -----------------------------------------------------------------------------
# Functional correctness (round-trip)
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_tns_roundtrip_long_is_close(frame_type: FrameType) -> None:
    """
    Round-trip test for long frames: iTNS(TNS(X)) must reproduce X.
    """
    rng = np.random.default_rng(4)
    original: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64)
    shaped, coeffs = aac_tns(original, frame_type)
    restored = aac_i_tns(shaped, frame_type, coeffs)
    np.testing.assert_allclose(restored, original, rtol=1e-9, atol=1e-9)
def test_tns_roundtrip_esh_is_close() -> None:
    """
    Round-trip test for ESH frames: iTNS(TNS(X)) must reproduce X across
    all 8 independent subframes.
    """
    rng = np.random.default_rng(5)
    original: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64)
    shaped, coeffs = aac_tns(original, "ESH")
    restored = aac_i_tns(shaped, "ESH", coeffs)
    np.testing.assert_allclose(restored, original, rtol=1e-9, atol=1e-9)
# -----------------------------------------------------------------------------
# Sanity
# -----------------------------------------------------------------------------
def test_tns_outputs_are_finite() -> None:
    """
    Sanity check: TNS must never emit NaN or inf, in either long (OLS)
    or short (ESH) mode.
    """
    rng = np.random.default_rng(6)
    # Same draw order as before: long frame first, then ESH.
    for shape, frame_type in (((1024, 1), "OLS"), ((128, 8), "ESH")):
        spectrum: FrameChannelF = rng.normal(size=shape).astype(np.float64)
        shaped, coeffs = aac_tns(spectrum, frame_type)
        assert np.isfinite(shaped).all()
        assert np.isfinite(coeffs).all()

View File

@ -30,6 +30,7 @@ import soundfile as sf
from core.aac_configuration import WIN_TYPE from core.aac_configuration import WIN_TYPE
from core.aac_filterbank import aac_filter_bank from core.aac_filterbank import aac_filter_bank
from core.aac_ssc import aac_SSC from core.aac_ssc import aac_SSC
from core.aac_tns import aac_tns
from core.aac_types import * from core.aac_types import *
@ -144,8 +145,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
AACSeq1 AACSeq1
List of encoded frames (Level 1 schema). List of encoded frames (Level 1 schema).
""" """
x, fs = aac_read_wav_stereo_48k(filename_in) x, _ = aac_read_wav_stereo_48k(filename_in)
_ = fs # kept for clarity; The assignment assumes 48 kHz # The assignment assumes 48 kHz
hop = 1024 hop = 1024
win = 2048 win = 2048
@ -196,3 +197,88 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
prev_frame_type = frame_type prev_frame_type = frame_type
return aac_seq return aac_seq
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
    """
    Level-2 AAC encoder (Level-1 analysis pipeline + TNS).

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).

    Returns
    -------
    AACSeq2
        Encoded AAC sequence (Level 2 payload schema).
        For each frame i:
          - "frame_type": FrameType
          - "win_type":   WinType
          - "chl"/"chr":
              - "frame_F":    FrameChannelF (after TNS)
              - "tns_coeffs": TnsCoeffs

    Raises
    ------
    ValueError
        If the input is too short to form a single frame, or an internal
        framing inconsistency is detected.
    """
    filename_in = Path(filename_in)
    x, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment assumes 48 kHz

    hop = 1024
    win = 2048

    # One hop of zero padding on each side (same framing as Level 1).
    x_pad = np.vstack(
        [
            np.zeros((hop, 2), dtype=np.float64),
            x,
            np.zeros((hop, 2), dtype=np.float64),
        ]
    )

    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq2 = []
    prev_frame_type: FrameType = "OLS"

    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start : start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Look-ahead frame for SSC; zero-extend at the stream tail.
        next_t = x_pad[start + hop : start + hop + win, :]
        missing = win - next_t.shape[0]
        if missing > 0:
            next_t = np.vstack([next_t, np.zeros((missing, 2), dtype=np.float64)])

        frame_type = aac_SSC(frame_t, next_t, prev_frame_type)

        # Level-1 analysis (packed stereo MDCT container).
        frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)

        # Unpack the stereo container into per-channel matrices.
        if frame_type == "ESH":
            # Interleaved columns: 2j -> left, 2j+1 -> right, j = 0..7.
            chl_f = np.empty((128, 8), dtype=np.float64)
            chr_f = np.empty((128, 8), dtype=np.float64)
            for sub in range(8):
                chl_f[:, sub] = frame_f_stereo[:, 2 * sub]
                chr_f[:, sub] = frame_f_stereo[:, 2 * sub + 1]
        else:
            chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
            chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)

        # Level 2: temporal noise shaping per channel.
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)

        aac_seq.append(
            {
                "frame_type": frame_type,
                "win_type": WIN_TYPE,
                "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
                "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
            }
        )
        prev_frame_type = frame_type

    return aac_seq

View File

@ -17,6 +17,15 @@ from __future__ import annotations
# Imports # Imports
from core.aac_types import WinType from core.aac_types import WinType
# Filterbank
# ------------------------------------------------------------
# Window type # Window type
# Options: "SIN", "KBD" # Options: "SIN", "KBD"
WIN_TYPE: WinType = "SIN" WIN_TYPE: WinType = "SIN"
# TNS
# ------------------------------------------------------------
# TNS linear-predictor order p (coefficients per channel / ESH subframe).
PRED_ORDER: int = 4
# Uniform quantizer step size for the TNS predictor coefficients.
QUANT_STEP: float = 0.1
QUANT_MAX: float = 0.7  # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7]

View File

@ -28,6 +28,7 @@ from typing import Union
import soundfile as sf import soundfile as sf
from core.aac_filterbank import aac_i_filter_bank from core.aac_filterbank import aac_i_filter_bank
from core.aac_tns import aac_i_tns
from core.aac_types import * from core.aac_types import *
@ -164,3 +165,93 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-2 AAC decoder (inverse of aac_coder_2).

    Behavior matches the Level 1 decoder pipeline, with an additional
    iTNS stage:
    - Per frame/channel: inverse TNS using the stored coefficients
    - Re-pack to stereo frame_F
    - IMDCT + windowing (via aac_i_filter_bank)
    - Overlap-add over frames (hop = 1024)
    - Remove Level-1 padding (hop samples start/end)
    - Write output WAV (48 kHz)

    Parameters
    ----------
    aac_seq_2 : AACSeq2
        Encoded sequence as produced by aac_coder_2().
    filename_out : Union[str, Path]
        Output WAV filename.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If the sequence is empty or a channel's frame_F has an
        unexpected shape for its frame type.
    """
    filename_out = Path(filename_out)
    hop = 1024
    win = 2048
    K = len(aac_seq_2)
    if K <= 0:
        raise ValueError("aac_seq_2 must contain at least one frame.")
    # Padded output length: K frames of `win` samples overlapping by `hop`.
    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)
    for i, fr in enumerate(aac_seq_2):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]
        chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
        chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
        chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)
        # Inverse TNS per channel (undoes the encoder's spectral filtering).
        chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
        chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)
        # Re-pack to the stereo container expected by aac_i_filter_bank
        if frame_type == "ESH":
            if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
                raise ValueError("ESH channel frame_F must have shape (128, 8).")
            # Interleave subframes: column 2j <- left, 2j+1 <- right.
            frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
            for j in range(8):
                frame_f[:, 2 * j + 0] = chl_f[:, j]
                frame_f[:, 2 * j + 1] = chr_f[:, j]
        else:
            # Accept either (1024,1) or (1024,) from your internal convention.
            if chl_f.shape == (1024,):
                chl_col = chl_f.reshape(1024, 1)
            elif chl_f.shape == (1024, 1):
                chl_col = chl_f
            else:
                raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")
            if chr_f.shape == (1024,):
                chr_col = chr_f.reshape(1024, 1)
            elif chr_f.shape == (1024, 1):
                chr_col = chr_f
            else:
                raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")
            frame_f = np.empty((1024, 2), dtype=np.float64)
            frame_f[:, 0] = chl_col[:, 0]
            frame_f[:, 1] = chr_col[:, 0]
        # IMDCT + windowing, then overlap-add into the padded output buffer.
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
        start = i * hop
        y_pad[start : start + win, :] += frame_t_hat
    # Strip the encoder's hop-sample padding on both ends.
    y = aac_remove_padding(y_pad, hop=hop)
    sf.write(str(filename_out), y, 48000)
    return y

View File

@ -0,0 +1,60 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - SNR dB calculator
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements SNR calculation in dB
# ------------------------------------------------------------
from __future__ import annotations
from core.aac_types import StereoSignal
import numpy as np
def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
    """
    Overall SNR (dB) across all samples and channels, after aligning the
    two signals to their common length and channel count.

    Parameters
    ----------
    x_ref : StereoSignal
        Reference stereo stream.
    x_hat : StereoSignal
        Reconstructed stereo stream.

    Returns
    -------
    float
        SNR in dB:
          - +inf when the noise power is zero (perfect reconstruction),
          - -inf when the signal power is zero.
    """
    ref = np.asarray(x_ref, dtype=np.float64)
    hat = np.asarray(x_hat, dtype=np.float64)

    # Treat 1-D inputs as single-channel column vectors.
    if ref.ndim == 1:
        ref = ref.reshape(-1, 1)
    if hat.ndim == 1:
        hat = hat.reshape(-1, 1)

    # Align to the common length and channel count.
    rows = min(ref.shape[0], hat.shape[0])
    cols = min(ref.shape[1], hat.shape[1])
    ref = ref[:rows, :cols]
    hat = hat[:rows, :cols]

    signal_power = float(np.sum(ref * ref))
    noise_power = float(np.sum((ref - hat) ** 2))
    if noise_power <= 0.0:
        return float("inf")
    if signal_power <= 0.0:
        return float("-inf")
    return float(10.0 * np.log10(signal_power / noise_power))

View File

@ -10,7 +10,6 @@
# #
# Description: # Description:
# This module implements Public Type aliases # This module implements Public Type aliases
#
# ------------------------------------------------------------ # ------------------------------------------------------------
from __future__ import annotations from __future__ import annotations
@ -39,7 +38,7 @@ Window type codes (AAC):
""" """
ChannelKey: TypeAlias = Literal["chl", "chr"] ChannelKey: TypeAlias = Literal["chl", "chr"]
"""Channel dictionary keys used in Level 1 payloads.""" """Channel dictionary keys used in Level payloads."""
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -105,6 +104,40 @@ Examples:
dtype: float64 dtype: float64
""" """
MdctFrameChannel: TypeAlias = FloatArray
"""
Per-channel MDCT container used in Level-1/2 sequences.
Typical shapes:
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
dtype: float64
Notes
-----
Some parts of the assignment store long-frame coefficients as a column vector
(1024, 1) to match MATLAB conventions. Internally you may also use (1024,)
when convenient, but the semantic meaning is identical.
"""
TnsCoeffs: TypeAlias = FloatArray
"""
Quantized TNS predictor coefficients (one channel).
Typical shapes (Level 2):
- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes)
- Else: (4, 1) (order p=4 for the long frame)
dtype: float64
Notes
-----
The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1.
We store the quantized coefficient values as float64 (typically multiples of 0.1)
to keep the pipeline simple and readable.
"""
FrameT: TypeAlias = FloatArray FrameT: TypeAlias = FloatArray
""" """
@ -142,17 +175,23 @@ Rationale for ESH (128, 16):
dtype: float64 dtype: float64
""" """
FrameChannelF: TypeAlias = FloatArray FrameChannelF: TypeAlias = MdctFrameChannel
""" """
Frequency-domain single-channel frame (MDCT coefficients). Frequency-domain single-channel MDCT coefficients.
Typical shapes (Level 1): Typical shapes (Level 1/2):
- If frame_type in {"OLS","LSS","LPS"}: (1024,) - If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) - If frame_type == "ESH": (128, 8)
dtype: float64 dtype: float64
""" """
BandRanges: TypeAlias = list[tuple[int, int]]
"""
Bark-band index ranges [start, end] (inclusive) for MDCT lines.
Used by TNS to map MDCT indices k to Bark bands.
"""
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Level 1 AAC sequence payload types # Level 1 AAC sequence payload types
@ -168,7 +207,7 @@ class AACChannelFrameF(TypedDict):
The MDCT coefficients for ONE channel. The MDCT coefficients for ONE channel.
Typical shapes: Typical shapes:
- ESH: (128, 8) (8 short subframes) - ESH: (128, 8) (8 short subframes)
- else: (1024, ) - else: (1024, 1) or (1024,)
""" """
frame_F: FrameChannelF frame_F: FrameChannelF
@ -191,3 +230,53 @@ List of length K (K = number of frames).
Each element is a dict with keys: Each element is a dict with keys:
- "frame_type", "win_type", "chl", "chr" - "frame_type", "win_type", "chl", "chr"
""" """
# -----------------------------------------------------------------------------
# Level 2 AAC sequence payload types (TNS)
# -----------------------------------------------------------------------------
class AACChannelFrameF2(TypedDict):
    """
    Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2).

    Keys
    ----
    frame_F:
        The TNS-processed MDCT coefficients for ONE channel.
        Typical shapes:
          - ESH:  (128, 8)
          - else: (1024, 1) or (1024,)
    tns_coeffs:
        Quantized TNS predictor coefficients for ONE channel.
        Typical shapes:
          - ESH:  (PRED_ORDER, 8)
          - else: (PRED_ORDER, 1)
    """
    # MDCT spectrum after the forward TNS filter has been applied.
    frame_F: FrameChannelF
    # Quantized predictor coefficients needed by the decoder's inverse TNS.
    tns_coeffs: TnsCoeffs
class AACSeq2Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_2 (Level 2).

    Same layout as the Level-1 frame dict, but the per-channel payloads
    additionally carry "tns_coeffs" (and "frame_F" is post-TNS).
    """
    frame_type: FrameType  # one of "OLS" | "LSS" | "ESH" | "LPS"
    win_type: WinType  # one of "SIN" | "KBD"
    chl: AACChannelFrameF2  # left-channel payload
    chr: AACChannelFrameF2  # right-channel payload
AACSeq2: TypeAlias = List[AACSeq2Frame]
"""
AAC sequence for Level 2:
List of length K (K = number of frames).
Each element is a dict with keys:
- "frame_type", "win_type", "chl", "chr"
Level 2 adds:
- per-channel "tns_coeffs"
and stores:
- per-channel "frame_F" after applying TNS.
"""

View File

@ -1,234 +0,0 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Sequence Segmentation Control Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Sequence Segmentation Control module (SSC).
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
from core.aac_ssc import aac_SSC
from core.aac_types import FrameT
# -----------------------------------------------------------------------------
# Helper fixtures for SSC
# -----------------------------------------------------------------------------
def _next_frame_no_attack() -> FrameT:
"""
Build a next_frame_T that must NOT trigger ESH detection.
Uses exact zeros so all segment energies are zero and the condition
s[l] > 1e-3 cannot hold for any l.
"""
return np.zeros((2048, 2), dtype=np.float64)
def _next_frame_strong_attack(
*,
attack_left: bool,
attack_right: bool,
segment_l: int = 4,
baseline: float = 1e-6,
burst_amp: float = 1.0,
) -> FrameT:
"""
Build a next_frame_T (2048x2) that should trigger ESH detection on selected channels.
Attack criterion (spec):
Attack exists if there exists l in {1..7} such that:
s[l] > 1e-3 and ds[l] > 10,
where s[l] is the energy of segment l (length 128) after high-pass filtering,
and ds[l] = s[l] / s[l-1].
Construction:
- A small baseline is added everywhere to avoid relying on the epsilon guard in ds,
keeping ds behavior stable/reproducible.
- A strong burst is added inside a chosen segment l in 1..7.
"""
if not (1 <= segment_l <= 7):
raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")
x = np.full((2048, 2), baseline, dtype=np.float64)
a = segment_l * 128
b = (segment_l + 1) * 128
if attack_left:
x[a:b, 0] += burst_amp
if attack_right:
x[a:b, 1] += burst_amp
return x
def _next_frame_below_s_threshold(
*,
left: bool,
right: bool,
segment_l: int = 4,
impulse_amp: float = 0.01,
) -> FrameT:
"""
Construct a next_frame_T where s[l] is below 1e-3, so ESH must NOT be triggered,
even if the ratio ds[l] could be large.
We place a single impulse of amplitude 'impulse_amp' inside one segment.
Approx. segment energy: s[l] ~= impulse_amp^2.
Example:
impulse_amp = 0.01 => s[l] ~= 1e-4 < 1e-3
"""
if not (1 <= segment_l <= 7):
raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")
x = np.zeros((2048, 2), dtype=np.float64)
idx = segment_l * 128 + 10 # inside segment l
if left:
x[idx, 0] = impulse_amp
if right:
x[idx, 1] = impulse_amp
return x
# -----------------------------------------------------------------------------
# 1) Fixed/mandatory cases (prev frame type forces current type)
# -----------------------------------------------------------------------------
def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
"""
Spec:
- If prev was LSS => current MUST be ESH
- If prev was LPS => current MUST be OLS
independent of attack detection on (i+1).
"""
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True)
out1 = aac_SSC(frame_t, next_attack, "LSS")
assert out1 == "ESH"
out2 = aac_SSC(frame_t, next_attack, "LPS")
assert out2 == "OLS"
# -----------------------------------------------------------------------------
# 2) Cases requiring next-frame ESH prediction (attack computation)
# -----------------------------------------------------------------------------
def test_prev_ols_next_not_esh_returns_ols() -> None:
"""
If prev=OLS, current is:
- LSS iff (i+1) is predicted ESH
- else OLS
Here: no attack => expect OLS.
"""
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
next_t = _next_frame_no_attack()
out = aac_SSC(frame_t, next_t, "OLS")
assert out == "OLS"
def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
"""
prev=OLS and next predicted ESH for both channels:
per-channel: LSS, LSS
merged: LSS
"""
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
next_t = _next_frame_strong_attack(attack_left=True, attack_right=True)
out = aac_SSC(frame_t, next_t, "OLS")
assert out == "LSS"
def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
"""
prev=OLS:
- one channel predicts ESH => LSS
- other channel predicts not ESH => OLS
Merge table: OLS + LSS => LSS (either side).
"""
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
out1 = aac_SSC(frame_t, next1_t, "OLS")
assert out1 == "LSS"
next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
out2 = aac_SSC(frame_t, next2_t, "OLS")
assert out2 == "LSS"
def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
"""
prev=ESH and next predicted ESH for both channels:
per-channel: ESH, ESH
merged: ESH
"""
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
next_t = _next_frame_strong_attack(attack_left=True, attack_right=True)
out = aac_SSC(frame_t, next_t, "ESH")
assert out == "ESH"
def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
"""
prev=ESH and next not predicted ESH for both channels:
per-channel: LPS, LPS
merged: LPS
"""
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
next_t = _next_frame_no_attack()
out = aac_SSC(frame_t, next_t, "ESH")
assert out == "LPS"
def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
"""
prev=ESH:
- one channel predicts ESH => ESH
- other channel predicts not ESH => LPS
Merge table: ESH + LPS => ESH (either side).
"""
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
out1 = aac_SSC(frame_t, next1_t, "ESH")
assert out1 == "ESH"
next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
out2 = aac_SSC(frame_t, next2_t, "ESH")
assert out2 == "ESH"
def test_threshold_s_must_exceed_1e_3() -> None:
"""
Spec: next frame is predicted ESH only if:
s[l] > 1e-3 AND ds[l] > 10
for some l in 1..7.
This test checks the necessity of the s[l] threshold:
- Create a frame with s[l] ~= 1e-4 < 1e-3 (single impulse with amp 0.01).
- Expect: not classified as ESH -> for prev=OLS return OLS.
"""
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
next_t = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01)
out = aac_SSC(frame_t, next_t, "OLS")
assert out == "OLS"

View File

@ -1,156 +0,0 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Coder/DecoderTests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for AAC Coder/Decoder module.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
import numpy as np
import pytest
import soundfile as sf
from core.aac_coder import aac_coder_1
from core.aac_decoder import aac_decoder_1
from core.aac_types import *
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
# -----------------------------------------------------------------------------
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
"""
Compute overall SNR (dB) over all samples and channels after aligning lengths.
Parameters
----------
x_ref : StereoSignal
Reference signal, shape (N, 2) typical.
x_hat : StereoSignal
Reconstructed signal, shape (M, 2) typical.
Returns
-------
float
SNR in dB.
- Returns +inf if noise power is zero.
- Returns -inf if signal power is zero.
"""
x_ref = np.asarray(x_ref, dtype=np.float64)
x_hat = np.asarray(x_hat, dtype=np.float64)
# Be conservative: align lengths and common channels.
if x_ref.ndim == 1:
x_ref = x_ref.reshape(-1, 1)
if x_hat.ndim == 1:
x_hat = x_hat.reshape(-1, 1)
n = min(x_ref.shape[0], x_hat.shape[0])
c = min(x_ref.shape[1], x_hat.shape[1])
x_ref = x_ref[:n, :c]
x_hat = x_hat[:n, :c]
err = x_ref - x_hat
ps = float(np.sum(x_ref * x_ref))
pn = float(np.sum(err * err))
if pn <= 0.0:
return float("inf")
if ps <= 0.0:
return float("-inf")
return float(10.0 * np.log10(ps / pn))
@pytest.fixture()
def tmp_stereo_wav(tmp_path: Path) -> Path:
"""
Create a temporary 48 kHz stereo WAV with random samples.
"""
rng = np.random.default_rng(123)
fs = 48000
# ~1 second of audio (kept small for test speed).
n = fs
x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64)
wav_path = tmp_path / "in.wav"
sf.write(str(wav_path), x, fs)
return wav_path
def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
"""
Module-level contract test:
Ensure aac_seq_1 follows the expected schema and per-frame shapes.
"""
aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav)
assert isinstance(aac_seq, list)
assert len(aac_seq) > 0
for fr in aac_seq:
assert isinstance(fr, dict)
# Required keys
assert "frame_type" in fr
assert "win_type" in fr
assert "chl" in fr
assert "chr" in fr
frame_type = fr["frame_type"]
win_type = fr["win_type"]
assert frame_type in ("OLS", "LSS", "ESH", "LPS")
assert win_type in ("SIN", "KBD")
assert isinstance(fr["chl"], dict)
assert isinstance(fr["chr"], dict)
assert "frame_F" in fr["chl"]
assert "frame_F" in fr["chr"]
chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
if frame_type == "ESH":
assert chl_f.shape == (128, 8)
assert chr_f.shape == (128, 8)
else:
assert chl_f.shape == (1024, 1)
assert chr_f.shape == (1024, 1)
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
    """
    End-to-end test: encode + decode and require very high SNR
    (numerical noise only). The threshold is intentionally loose to
    avoid fragility across platforms/BLAS.
    """
    reference, rate = sf.read(str(tmp_stereo_wav), always_2d=True)
    reference = np.asarray(reference, dtype=np.float64)
    assert int(rate) == 48000

    out_wav = tmp_path / "out.wav"
    decoded: StereoSignal = aac_decoder_1(aac_coder_1(tmp_stereo_wav), out_wav)

    # Basic sanity: the decoder must also produce a readable 48 kHz file.
    assert out_wav.exists()
    _, rate_out = sf.read(str(out_wav), always_2d=True)
    assert int(rate_out) == 48000

    # SNR against the returned array (file contents are not compared here).
    assert _snr_db(reference, decoded) > 80.0

View File

@ -1,269 +0,0 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Filterbank module.
# ------------------------------------------------------------
from __future__ import annotations
from typing import Sequence
import pytest
from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank
from core.aac_types import *
# Helper fixtures for filterbank
# -----------------------------------------------------------------------------
def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal:
    """
    Analyze + synthesize each frame and overlap-add with hop=1024.

    Parameters
    ----------
    x : StereoSignal
        Input stereo stream of shape (N, 2).
    frame_types : Sequence[FrameType]
        Frame type for each of the K frames starting at i*1024.
    win_type : WinType
        Window type ("SIN" or "KBD").

    Returns
    -------
    StereoSignal
        Overlap-added reconstruction, same shape as x.
    """
    hop, win = 1024, 2048
    out: StereoSignal = np.zeros_like(x, dtype=np.float64)
    for index, ftype in enumerate(frame_types):
        begin = index * hop
        segment: FrameT = x[begin:begin + win, :]
        coeffs: FrameF = aac_filter_bank(segment, ftype, win_type)
        out[begin:begin + win, :] += aac_i_filter_bank(coeffs, ftype, win_type)
    return out
def _snr_db(x: StereoSignal, y: StereoSignal) -> float:
"""
Compute SNR in dB over all samples/channels.
"""
err = x - y
ps = float(np.sum(x * x))
pn = float(np.sum(err * err))
if pn <= 0.0:
return float("inf")
if ps <= 0.0:
return float("-inf")
return 10.0 * float(np.log10(ps / pn))
# -----------------------------------------------------------------------------
# Forward filterbank tests
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None:
    """
    Contract test: every long-sequence frame type yields a (1024, 2) spectrum.
    """
    silent: FrameT = np.zeros((2048, 2), dtype=np.float64)
    assert aac_filter_bank(silent, frame_type, win_type).shape == (1024, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: ESH analysis returns the interleaved (128, 16) layout.
    """
    silent: FrameT = np.zeros((2048, 2), dtype=np.float64)
    assert aac_filter_bank(silent, "ESH", win_type).shape == (128, 16)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None:
    """
    Behavior test (OLS as representative long window): channels must not mix.
    A silent right channel must stay numerically silent in the spectrum.
    """
    gen = np.random.default_rng(0)
    frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    frame[:, 0] = gen.normal(size=2048)
    spectrum = aac_filter_bank(frame, "OLS", win_type)
    assert np.max(np.abs(spectrum[:, 1])) < 1e-9
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_channel_isolation_esh(win_type: WinType) -> None:
    """
    Behavior test: in ESH the channels stay independent.
    With a silent right channel, every odd (right) column must be near zero.
    """
    gen = np.random.default_rng(1)
    frame: FrameT = np.zeros((2048, 2), dtype=np.float64)
    frame[:, 0] = gen.normal(size=2048)
    spectrum = aac_filter_bank(frame, "ESH", win_type)
    # Odd columns 1,3,...,15 carry the right channel's sub-blocks.
    assert np.max(np.abs(spectrum[:, 1::2])) < 1e-9
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None:
    """
    Spec-driven behavior test:
    ESH only uses the central region [448, 1600), split into 8 overlapping
    256-sample windows with 50% overlap, so samples outside that region
    must not influence the output.
    """
    gen = np.random.default_rng(2)
    core = gen.normal(size=(1152, 2))

    clean: FrameT = np.zeros((2048, 2), dtype=np.float64)
    clean[448:1600, :] = core

    noisy: FrameT = np.zeros((2048, 2), dtype=np.float64)
    noisy[448:1600, :] = core
    noisy[0:448, :] = gen.normal(size=(448, 2))
    noisy[1600:2048, :] = gen.normal(size=(448, 2))

    # Tiny tolerance guards against floating-point minutiae.
    np.testing.assert_allclose(
        aac_filter_bank(clean, "ESH", win_type),
        aac_filter_bank(noisy, "ESH", win_type),
        rtol=0.0, atol=1e-12,
    )
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_filterbank_output_is_finite(win_type: WinType) -> None:
    """
    Sanity test: no NaN/inf for any frame type on random input.
    """
    gen = np.random.default_rng(3)
    frame: FrameT = gen.normal(size=(2048, 2)).astype(np.float64)
    for ft in ("OLS", "LSS", "ESH", "LPS"):
        assert np.isfinite(aac_filter_bank(frame, ft, win_type)).all()
# -----------------------------------------------------------------------------
# Reverse i_filterbank tests
# -----------------------------------------------------------------------------
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None:
    """
    Contract test: synthesis of any long-sequence type yields (2048, 2).
    """
    spectrum: FrameF = np.zeros((1024, 2), dtype=np.float64)
    for ft in ("OLS", "LSS", "LPS"):
        assert aac_i_filter_bank(spectrum, ft, win_type).shape == (2048, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ifilterbank_shapes_esh(win_type: WinType) -> None:
    """
    Contract test: ESH synthesis also yields a full (2048, 2) time frame.
    """
    spectrum: FrameF = np.zeros((128, 16), dtype=np.float64)
    assert aac_i_filter_bank(spectrum, "ESH", win_type).shape == (2048, 2)
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None:
    """
    Sanity test: analysis followed by synthesis must stay finite.
    """
    gen = np.random.default_rng(0)
    frame: FrameT = gen.normal(size=(2048, 2)).astype(np.float64)
    for ft in ("OLS", "LSS", "ESH", "LPS"):
        rebuilt = aac_i_filter_bank(aac_filter_bank(frame, ft, win_type), ft, win_type)
        assert np.isfinite(rebuilt).all()
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None:
    """
    Module-level test: OLS analysis+synthesis with hop=1024 must
    reconstruct the steady-state region with high SNR.
    """
    gen = np.random.default_rng(1)
    frames = 6
    total = 1024 * (frames + 1)
    signal: StereoSignal = gen.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, ["OLS"] * frames, win_type)
    lo, hi = 1024, total - 1024
    assert _snr_db(signal[lo:hi, :], rebuilt[lo:hi, :]) > 50.0
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None:
    """
    Module-level test: ESH analysis+synthesis with hop=1024 must
    reconstruct the steady-state region with high SNR.
    """
    gen = np.random.default_rng(2)
    frames = 6
    total = 1024 * (frames + 1)
    signal: StereoSignal = gen.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, ["ESH"] * frames, win_type)
    lo, hi = 1024, total - 1024
    assert _snr_db(signal[lo:hi, :], rebuilt[lo:hi, :]) > 45.0
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None:
    """
    Transition sequence matching the windowing logic:
    OLS -> LSS -> ESH -> LPS -> OLS -> OLS.
    """
    gen = np.random.default_rng(3)
    sequence: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"]
    total = 1024 * (len(sequence) + 1)
    signal: StereoSignal = gen.normal(size=(total, 2)).astype(np.float64)
    rebuilt = _ola_reconstruct(signal, sequence, win_type)
    lo, hi = 1024, total - 1024
    assert _snr_db(signal[lo:hi, :], rebuilt[lo:hi, :]) > 40.0

View File

@ -1,117 +0,0 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank internal (mdct) Tests
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Tests for Filterbank internal MDCT/IMDCT functionality.
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
import pytest
from core.aac_filterbank import _imdct, _mdct
from core.aac_types import FloatArray, TimeSignal, MdctCoeffs
def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None:
"""
Helper for consistent tolerances across tests.
"""
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float:
"""
Estimate scalar gain g such that y ~= g*x in least-squares sense.
"""
denom = float(np.dot(x, x))
if denom == 0.0:
return 0.0
return float(np.dot(y, x) / denom)
# Shared absolute/relative tolerance for the MDCT identity/linearity checks.
tolerance = 1e-10
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None:
    """
    Coefficient-domain consistency: mdct(imdct(X)) ~= g * X.
    For the chosen (non-orthonormal) scaling, g is expected to be ~2.
    """
    gen = np.random.default_rng(0)
    coeffs: MdctCoeffs = gen.normal(size=N // 2).astype(np.float64)
    time_sig: TimeSignal = _imdct(coeffs)
    roundtrip: MdctCoeffs = _mdct(time_sig)
    gain = _estimate_gain(roundtrip, coeffs)
    _assert_allclose(roundtrip, gain * coeffs, rtol=tolerance, atol=tolerance)
    _assert_allclose(np.array([gain], dtype=np.float64),
                     np.array([2.0], dtype=np.float64),
                     rtol=tolerance, atol=tolerance)
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_linearity(N: int) -> None:
    """
    MDCT is linear: mdct(a*x + b*y) == a*mdct(x) + b*mdct(y).
    """
    gen = np.random.default_rng(1)
    u: TimeSignal = gen.normal(size=N).astype(np.float64)
    v: TimeSignal = gen.normal(size=N).astype(np.float64)
    ca, cb = 0.37, -1.12
    _assert_allclose(_mdct(ca * u + cb * v),
                     ca * _mdct(u) + cb * _mdct(v),
                     rtol=tolerance, atol=tolerance)
@pytest.mark.parametrize("N", [256, 2048])
def test_imdct_linearity(N: int) -> None:
    """
    IMDCT is linear: imdct(a*X + b*Y) == a*imdct(X) + b*imdct(Y).
    """
    gen = np.random.default_rng(2)
    half = N // 2
    U: MdctCoeffs = gen.normal(size=half).astype(np.float64)
    V: MdctCoeffs = gen.normal(size=half).astype(np.float64)
    ca, cb = -0.5, 2.0
    _assert_allclose(_imdct(ca * U + cb * V),
                     ca * _imdct(U) + cb * _imdct(V),
                     rtol=tolerance, atol=tolerance)
@pytest.mark.parametrize("N", [256, 2048])
def test_mdct_imdct_outputs_are_finite(N: int) -> None:
    """
    Sanity: neither transform produces NaN/inf on random inputs.
    """
    gen = np.random.default_rng(3)
    time_sig: TimeSignal = gen.normal(size=N).astype(np.float64)
    coeffs: MdctCoeffs = gen.normal(size=N // 2).astype(np.float64)
    assert np.isfinite(_mdct(time_sig)).all()
    assert np.isfinite(_imdct(coeffs)).all()

View File

@ -22,13 +22,13 @@ from __future__ import annotations
from pathlib import Path from pathlib import Path
from typing import Union from typing import Union
import numpy as np
import soundfile as sf import soundfile as sf
from core.aac_types import AACSeq1, StereoSignal from core.aac_types import AACSeq1, StereoSignal
from core.aac_coder import aac_coder_1 as core_aac_coder_1 from core.aac_coder import aac_coder_1 as core_aac_coder_1
from core.aac_coder import aac_read_wav_stereo_48k from core.aac_coder import aac_read_wav_stereo_48k
from core.aac_decoder import aac_decoder_1 as core_aac_decoder_1 from core.aac_decoder import aac_decoder_1 as core_aac_decoder_1
from core.aac_snr_db import snr_db
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -80,50 +80,6 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
# Demo (Level 1) # Demo (Level 1)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
"""
Compute overall SNR (dB) over all samples and channels after aligning lengths.
Parameters
----------
x_ref : StereoSignal
Reference stereo stream.
x_hat : StereoSignal
Reconstructed stereo stream.
Returns
-------
float
SNR in dB.
- Returns +inf if noise power is zero.
- Returns -inf if signal power is zero.
"""
x_ref = np.asarray(x_ref, dtype=np.float64)
x_hat = np.asarray(x_hat, dtype=np.float64)
if x_ref.ndim == 1:
x_ref = x_ref.reshape(-1, 1)
if x_hat.ndim == 1:
x_hat = x_hat.reshape(-1, 1)
n = min(x_ref.shape[0], x_hat.shape[0])
c = min(x_ref.shape[1], x_hat.shape[1])
x_ref = x_ref[:n, :c]
x_hat = x_hat[:n, :c]
err = x_ref - x_hat
ps = float(np.sum(x_ref * x_ref))
pn = float(np.sum(err * err))
if pn <= 0.0:
return float("inf")
if ps <= 0.0:
return float("-inf")
return float(10.0 * np.log10(ps / pn))
def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float: def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float:
""" """
Demonstration for the Level-1 codec. Demonstration for the Level-1 codec.
@ -158,12 +114,11 @@ def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) ->
x_hat = aac_decoder_1(aac_seq_1, filename_out) x_hat = aac_decoder_1(aac_seq_1, filename_out)
# Optional sanity: ensure output file exists and is readable # Optional sanity: ensure output file exists and is readable
x_hat_file, fs_hat = sf.read(str(filename_out), always_2d=True) _, fs_hat = sf.read(str(filename_out), always_2d=True)
_ = x_hat_file
if int(fs_hat) != 48000: if int(fs_hat) != 48000:
raise ValueError("Decoded output sampling rate must be 48 kHz.") raise ValueError("Decoded output sampling rate must be 48 kHz.")
return _snr_db(x_ref, x_hat) return snr_db(x_ref, x_hat)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -172,11 +127,14 @@ def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) ->
if __name__ == "__main__": if __name__ == "__main__":
# Example: # Example:
# python -m level_1.level_1 input.wav output.wav # cd level_1
# python -m level_1 input.wav output.wav
# or
# python -m level_1 material/LicorDeCalandraca.wav LicorDeCalandraca_out.wav
import sys import sys
if len(sys.argv) != 3: if len(sys.argv) != 3:
raise SystemExit("Usage: python -m level_1.level_1 <input.wav> <output.wav>") raise SystemExit("Usage: python -m level_1 <input.wav> <output.wav>")
in_wav = Path(sys.argv[1]) in_wav = Path(sys.argv[1])
out_wav = Path(sys.argv[2]) out_wav = Path(sys.argv[2])

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,400 @@
import numpy as np
import scipy.io as sio
import os
# ------------------ LOAD LUT ------------------
def load_LUT(mat_filename=None):
    """
    Load the AAC Huffman codebooks (LUTs) from a .mat file.

    Parameters
    ----------
    mat_filename : str or None
        Path to 'huffCodebooks.mat'. Defaults to the file living next to
        this module.

    Returns
    -------
    huffLUT : list
        12-entry list; index 0 is unused, indices 1..11 each hold a dict
        with keys 'LUT', 'invTable', 'codebook', 'nTupleSize',
        'maxAbsCodeVal', 'signedValues'.
    """
    if mat_filename is None:
        here = os.path.dirname(os.path.abspath(__file__))
        mat_filename = os.path.join(here, "huffCodebooks.mat")

    raw = sio.loadmat(mat_filename)['huffCodebooks'].squeeze()
    books = [np.array(raw[i]) for i in range(11)]

    # Inverse (decoding) tables: binary codeword strings -> trie per codebook.
    inv_tables = []
    for book in books:
        values = book[:, 2].astype(int)   # column 3: codeword value
        lengths = book[:, 1].astype(int)  # column 2: codeword bit length
        words = [format(v, f'0{l}b') for v, l in zip(values, lengths)]
        inv_tables.append(vlc_table(words))

    # (nTupleSize, maxAbsCodeVal, signedValues) for codebooks 1..11.
    params = [
        (4, 1, True),
        (4, 1, True),
        (4, 2, False),
        (4, 2, False),
        (2, 4, True),
        (2, 4, True),
        (2, 7, False),
        (2, 7, False),
        (2, 12, False),
        (2, 12, False),
        (2, 16, False),
    ]
    huffLUT = [None]  # index 0 intentionally unused
    for i, (n_tuple, max_abs, signed) in enumerate(params, start=1):
        huffLUT.append({
            'LUT': books[i - 1],
            'invTable': inv_tables[i - 1],
            'codebook': i,
            'nTupleSize': n_tuple,
            'maxAbsCodeVal': max_abs,
            'signedValues': signed,
        })
    return huffLUT
def vlc_table(code_array):
    """
    Build a binary-trie decoding table from a list of Huffman codewords.

    Parameters
    ----------
    code_array : list of str
        Codewords as '0'/'1' strings; the symbol index is the 1-based
        position in this list.

    Returns
    -------
    h : np.ndarray, shape (num_nodes, 3), dtype int
        Row r describes trie node r as [child-on-0, child-on-1, symbol],
        where child 0 means "absent" (row 0 is the root) and symbol is
        the 1-based codeword index at leaves (0 on internal nodes).
    """
    table = np.zeros((1, 3), dtype=int)
    for symbol, code in enumerate(code_array, start=1):
        node = 0
        for ch in code:
            branch = int(ch)
            child = table[node, branch]
            if child == 0:
                # Grow the trie by one node and link it under `branch`.
                table = np.vstack([table, [0, 0, 0]])
                child = table.shape[0] - 1
                table[node, branch] = child
            node = child
        table[node, 2] = symbol
    return table
# ------------------ ENCODE ------------------
def encode_huff(coeff_sec, huff_LUT_list, force_codebook=None):
    """
    Huffman-encode a sequence of quantized coefficients.

    Selects the appropriate Huffman codebook based on the maximum absolute
    value of the input coefficients, encodes the coefficients into a binary
    Huffman bitstream, and returns both the bitstream and the selected
    codebook index.  Python equivalent of the MATLAB `encodeHuff.m`; the
    input is grouped into fixed-size n-tuples as defined by the chosen LUT
    (zero-padding may be applied internally).

    Parameters
    ----------
    coeff_sec : array_like of int
        1-D array of quantized integer coefficients to encode
        (typically one section / scale-factor band).
    huff_LUT_list : list
        List of Huffman lookup-table dicts as returned by `load_LUT()`.
        Indices 1..11 are valid codebooks; index 0 is unused.
    force_codebook : int or None, optional
        If given, encode with that codebook unconditionally.
        NOTE: historical quirk kept for compatibility — in this case only
        the bitstream string is returned (no codebook index).

    Returns
    -------
    huffSec : str
        Huffman-encoded bitstream as a string of '0'/'1' characters.
    huffCodebook : int
        Index (1..11) of the codebook used; 0 marks an all-zero section.
    """
    if force_codebook is not None:
        return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec)

    max_abs = np.max(np.abs(coeff_sec))

    if max_abs == 0:
        # Special all-zero section: no bits at all, codebook 0.
        return huff_LUT_code_0(), 0
    if max_abs > 15:
        # Values above 15 require codebook 11 with escape sequences.
        return huff_LUT_code_ESC(huff_LUT_list[11], coeff_sec), 11
    if max_abs in (13, 14, 15):
        return huff_LUT_code_1(huff_LUT_list[11], coeff_sec), 11

    # Codebooks come in (unsigned, signed-variant) pairs per magnitude range;
    # encode with both candidates and keep the shorter bitstream.
    if max_abs == 1:
        candidates = (1, 2)
    elif max_abs == 2:
        candidates = (3, 4)
    elif max_abs in (3, 4):
        candidates = (5, 6)
    elif max_abs in (5, 6, 7):
        candidates = (7, 8)
    else:  # 8..12
        candidates = (9, 10)

    first, second = candidates
    sec_a = huff_LUT_code_1(huff_LUT_list[first], coeff_sec)
    sec_b = huff_LUT_code_1(huff_LUT_list[second], coeff_sec)
    if len(sec_a) <= len(sec_b):
        return sec_a, first
    return sec_b, second
def huff_LUT_code_1(huff_LUT, coeff_sec):
    """
    Encode coefficients with a single (non-ESC) Huffman codebook.

    Coefficients are grouped into n-tuples (zero-padded at the tail); each
    tuple is mapped to a LUT row index by mixed-radix packing, and the
    codeword bits are emitted.  For unsigned codebooks one sign bit per
    coefficient is appended after each codeword.
    """
    lut = huff_LUT['LUT']
    tuple_size = huff_LUT['nTupleSize']
    max_abs = huff_LUT['maxAbsCodeVal']
    signed = huff_LUT['signedValues']

    n_tuples = int(np.ceil(len(coeff_sec) / tuple_size))
    if signed:
        shifted = coeff_sec + max_abs       # shift into [0, 2*max_abs]
        radix = 2 * max_abs + 1
    else:
        shifted = coeff_sec
        radix = max_abs + 1

    padded = np.zeros(n_tuples * tuple_size, dtype=int)
    padded[:len(shifted)] = shifted

    weights = radix ** np.arange(tuple_size - 1, -1, -1)
    pieces = []
    for t in range(n_tuples):
        group = padded[t * tuple_size:(t + 1) * tuple_size]
        row = int(np.abs(group) @ weights)
        word = format(int(lut[row, 2]), f'0{int(lut[row, 1])}b')
        if signed:
            pieces.append(word)
        else:
            sign_bits = ''.join('1' if v < 0 else '0' for v in group)
            pieces.append(word + sign_bits)
    return ''.join(pieces)
def huff_LUT_code_0():
    """Codebook 0 (all-zero section): contributes no bits at all."""
    return ''
def huff_LUT_code_ESC(huff_LUT, coeff_sec):
    """
    Encode coefficients with codebook 11 using escape sequences.

    Magnitudes above 15 are clamped to 16 for the table lookup; the actual
    value is transmitted as an escape sequence
    '1'*N + '0' + (N+4)-bit remainder, with N = floor(log2(|v|)) - 4.
    Per tuple the layout is: codeword, one sign bit per coefficient, then
    the escape payloads.
    """
    lut = huff_LUT['LUT']
    tuple_size = huff_LUT['nTupleSize']
    max_abs = huff_LUT['maxAbsCodeVal']

    n_tuples = int(np.ceil(len(coeff_sec) / tuple_size))
    radix = max_abs + 1
    padded = np.zeros(n_tuples * tuple_size, dtype=int)
    padded[:len(coeff_sec)] = coeff_sec

    weights = radix ** np.arange(tuple_size - 1, -1, -1)
    pieces = []
    for t in range(n_tuples):
        group = padded[t * tuple_size:(t + 1) * tuple_size]
        # Replace zeros by eps so log2 stays finite (result unused for them).
        safe = group.astype(float)
        safe[safe == 0] = np.finfo(float).eps
        n4 = np.maximum(0, np.floor(np.log2(np.abs(safe))).astype(int))
        n_ones = np.maximum(0, n4 - 4)
        needs_esc = np.abs(group) > 15
        clamped = group.copy()
        clamped[needs_esc] = np.sign(clamped[needs_esc]) * 16
        row = int(np.abs(clamped) @ weights)
        word = format(int(lut[row, 2]), f'0{int(lut[row, 1])}b')
        esc_bits = ''
        for k in range(tuple_size):
            if needs_esc[k]:
                esc_bits += '1' * n_ones[k] + '0'
                esc_bits += format(abs(group[k]) - (1 << n4[k]), f'0{n4[k]}b')
        sign_bits = ''.join('1' if v < 0 else '0' for v in group)
        pieces.append(word + sign_bits + esc_bits)
    return ''.join(pieces)
# ------------------ DECODE ------------------
def decode_huff(huff_sec, huff_LUT):
    """
    Decode a Huffman-encoded stream.

    Parameters
    ----------
    huff_sec : array-like of int or str
        Huffman encoded stream as a sequence of 0 and 1 (string or list/array).
    huff_LUT : dict
        Huffman lookup table with keys:
        - 'invTable': inverse trie table (numpy array, see vlc_table)
        - 'codebook': codebook number (1..11)
        - 'nTupleSize': tuple size
        - 'maxAbsCodeVal': maximum absolute code value
        - 'signedValues': True/False

    Returns
    -------
    decCoeffs : list
        Decoded quantized coefficients.
    """
    h = huff_LUT['invTable']
    huffCodebook = huff_LUT['codebook']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    signedValues = huff_LUT['signedValues']

    # Convert string to array of ints
    if isinstance(huff_sec, str):
        huff_sec = np.array([int(b) for b in huff_sec])

    eos = False
    decCoeffs = []
    streamIndex = 0
    while not eos:
        # --- Walk the inverse trie until a leaf is reached ---------------
        wordbit = 0
        r = 0  # start at root
        while True:
            b = huff_sec[streamIndex + wordbit]
            wordbit += 1
            r = h[r, b]
            if h[r, 0] == 0 and h[r, 1] == 0:
                symbolIndex = h[r, 2] - 1  # zero-based symbol
                streamIndex += wordbit
                break

        # --- Expand the symbol into an n-tuple (mixed-radix digits) ------
        if signedValues:
            base = 2 * maxAbsCodeVal + 1
            offset = maxAbsCodeVal  # signed books embed the sign in the symbol
        else:
            base = maxAbsCodeVal + 1
            offset = 0
        nTupleDec = []
        tmp = symbolIndex
        for p in reversed(range(nTupleSize)):
            nTupleDec.append(tmp // (base ** p) - offset)
            tmp = tmp % (base ** p)
        nTupleDec = np.array(nTupleDec)

        # --- Sign bits ----------------------------------------------------
        # BUG FIX: only unsigned codebooks transmit sign bits (mirrors
        # huff_LUT_code_1, which emits none for signedValues=True); the old
        # code consumed them unconditionally and desynced on signed books.
        if signedValues:
            nTupleSign = np.ones(nTupleSize)
        else:
            nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize]
            nTupleSign = -(np.sign(nTupleSignBits - 0.5))
            streamIndex += nTupleSize
        nTupleDec = nTupleDec * nTupleSign

        # --- Escape sequences (codebook 11, |value| == 16) ----------------
        escIndex = np.where(np.abs(nTupleDec) == 16)[0]
        if huffCodebook == 11 and escIndex.size > 0:
            for idx in escIndex:
                # Count the run of '1' bits; it ends with a '0' terminator.
                N = 0
                b = huff_sec[streamIndex]
                while b:
                    N += 1
                    b = huff_sec[streamIndex + N]
                streamIndex += N
                N4 = N + 4
                # BUG FIX: skip the '0' terminator before reading the N4
                # magnitude bits (the old code started at the terminator,
                # dropping the least-significant magnitude bit).
                escape_word = huff_sec[streamIndex + 1:streamIndex + 1 + N4]
                escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2)
                nTupleDec[idx] = escape_value
                streamIndex += N4 + 1
            # Re-apply signs to the escaped (magnitude-only) entries.
            nTupleDec[escIndex] *= nTupleSign[escIndex]

        decCoeffs.extend(nTupleDec.tolist())
        if streamIndex >= len(huff_sec):
            eos = True
    return decCoeffs

View File

@ -0,0 +1,284 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 1 AAC encoder orchestration.
# Keeps the same functional behavior as the original level_1 implementation:
# - Reads WAV via soundfile
# - Validates stereo and 48 kHz
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
# - SSC decision uses next-frame attack detection
# - Filterbank analysis (MDCT)
# - Stores per-channel spectra in AACSeq1 schema:
# * ESH: (128, 8)
# * else: (1024, 1)
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from core.aac_configuration import WIN_TYPE
from core.aac_filterbank import aac_filter_bank
from core.aac_ssc import aac_SSC
from core.aac_tns import aac_tns
from core.aac_types import *
# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
    """
    Read a WAV file and enforce the Level-1 input contract.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.

    Returns
    -------
    x : StereoSignal
        float64 samples with shape (N, 2).
    fs : int
        Sampling rate in Hz (always 48000 on success).

    Raises
    ------
    ValueError
        If the file is not stereo or not sampled at 48 kHz.
    """
    samples, rate = sf.read(str(Path(filename_in)), always_2d=True)
    samples = np.asarray(samples, dtype=np.float64)
    if samples.shape[1] != 2:
        raise ValueError("Input must be stereo (2 channels).")
    if int(rate) != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")
    return samples, int(rate)
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split the stereo FrameF produced by aac_filter_bank() into the
    per-channel arrays required by the Level-1 AACSeq1 schema.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        - ESH: shape (128, 16), interleaved as [L0 R0 L1 R1 ... L7 R7]
        - otherwise: shape (1024, 2) with columns [L, R]

    Returns
    -------
    (chl_f, chr_f) : tuple[FrameChannelF, FrameChannelF]
        ESH: each (128, 8); otherwise each (1024, 1).

    Raises
    ------
    ValueError
        If frame_f does not have the shape implied by frame_type.
    """
    if frame_type == "ESH":
        if frame_f.shape != (128, 16):
            raise ValueError("For ESH, frame_f must have shape (128, 16).")
        # Even columns hold the left sub-blocks, odd columns the right ones;
        # astype() yields independent contiguous copies.
        chl_f = frame_f[:, 0::2].astype(np.float64)
        chr_f = frame_f[:, 1::2].astype(np.float64)
        return chl_f, chr_f

    # Non-ESH: store as (1024, 1) as required by the Level-1 schema.
    if frame_f.shape != (1024, 2):
        raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")
    return (
        frame_f[:, 0:1].astype(np.float64, copy=False),
        frame_f[:, 1:2].astype(np.float64, copy=False),
    )
# -----------------------------------------------------------------------------
# Level 1 encoder
# -----------------------------------------------------------------------------
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
    """
    Level-1 AAC encoder.

    Pipeline (behavior identical to the original level_1 implementation):
    - read a stereo 48 kHz WAV,
    - zero-pad `hop` samples at both ends,
    - frame with win=2048, hop=1024,
    - choose the frame type via SSC with one-frame lookahead,
    - run MDCT filterbank analysis,
    - store per-channel coefficients in the AACSeq1 schema.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename. Assumption: stereo audio at 48 kHz.

    Returns
    -------
    AACSeq1
        List of encoded frames (Level 1 schema).
    """
    samples, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment assumes 48 kHz.

    hop, win = 1024, 2048

    # One hop of zeros at the head (first overlap region) and one at the
    # tail; the lookahead frame is additionally padded on the fly below.
    zeros_hop = np.zeros((hop, 2), dtype=np.float64)
    padded = np.vstack([zeros_hop, samples, zeros_hop])

    # Frame count such that the current frame always fits fully.
    n_frames = int((padded.shape[0] - win) // hop + 1)
    if n_frames <= 0:
        raise ValueError("Input too short for framing.")

    seq: AACSeq1 = []
    prev_type: FrameType = "OLS"
    win_type: WinType = WIN_TYPE

    for k in range(n_frames):
        offset = k * hop
        frame_t: FrameT = padded[offset:offset + win, :]
        if frame_t.shape != (win, 2):
            # Unreachable given n_frames, but kept as an explicit guard.
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Lookahead frame for the SSC decision, zero-padded at the tail.
        lookahead = padded[offset + hop:offset + hop + win, :]
        missing = win - lookahead.shape[0]
        if missing > 0:
            lookahead = np.vstack([lookahead, np.zeros((missing, 2), dtype=np.float64)])

        frame_type = aac_SSC(frame_t, lookahead, prev_type)
        frame_f = aac_filter_bank(frame_t, frame_type, win_type)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)

        seq.append({
            "frame_type": frame_type,
            "win_type": win_type,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        })
        prev_type = frame_type

    return seq
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
    """
    Level-2 AAC encoder (Level 1 + TNS).

    Pipeline:
    - Level-1 framing, SSC and filterbank analysis (identical to aac_coder_1).
    - Per-channel TNS applied to the MDCT coefficients.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).

    Returns
    -------
    AACSeq2
        Encoded AAC sequence (Level 2 payload schema).
        For each frame i:
        - "frame_type": FrameType
        - "win_type":   WinType
        - "chl"/"chr":
            - "frame_F":    FrameChannelF (after TNS)
            - "tns_coeffs": TnsCoeffs
    """
    filename_in = Path(filename_in)
    x, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment assumes 48 kHz

    hop = 1024
    win = 2048

    # Boundary padding, identical to Level 1.
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])

    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq2 = []
    prev_frame_type: FrameType = "OLS"

    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        next_t = x_pad[start + hop:start + hop + win, :]
        if next_t.shape[0] < win:
            # Zero-pad the lookahead tail so SSC always sees (2048, 2).
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_SSC(frame_t, next_t, prev_frame_type)

        # Level 1 analysis (packed stereo container).
        frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)

        # Reuse the Level-1 helper instead of duplicating the channel
        # de-interleaving logic inline (same (128, 8) / (1024, 1) shapes).
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)

        # Level 2: apply TNS per channel.
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)

        aac_seq.append(
            {
                "frame_type": frame_type,
                "win_type": WIN_TYPE,
                "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
                "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
            }
        )
        prev_frame_type = frame_type

    return aac_seq

View File

@ -0,0 +1,31 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Configuration
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module contains the global configurations
#
# ------------------------------------------------------------
from __future__ import annotations
# Imports
from core.aac_types import WinType
# Filterbank
# ------------------------------------------------------------
# Window type
# Options: "SIN", "KBD"
WIN_TYPE: WinType = "SIN"
# TNS
# ------------------------------------------------------------
PRED_ORDER = 4
QUANT_STEP = 0.1
QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7]

View File

@ -0,0 +1,257 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Inverse AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
# Keeps the same functional behavior as the original level_1 implementation:
# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank()
# - IMDCT synthesis per frame
# - Overlap-add with hop=1024
# - Remove encoder boundary padding: hop at start and hop at end
#
# Note:
# This core module returns the reconstructed samples. Writing to disk is kept
# in level_x demos.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from core.aac_filterbank import aac_i_filter_bank
from core.aac_tns import aac_i_tns
from core.aac_types import *
# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the
    stereo FrameF container expected by aac_i_filter_bank().

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    chl_f : FrameChannelF
        Left channel coefficients: (128, 8) for ESH, else (1024, 1).
    chr_f : FrameChannelF
        Right channel coefficients: (128, 8) for ESH, else (1024, 1).

    Returns
    -------
    FrameF
        Stereo coefficients:
        - ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
        - else: (1024, 2)

    Raises
    ------
    ValueError
        If channel shapes do not match the expected Level-1 layout.
    """
    if frame_type == "ESH":
        if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
            raise ValueError("ESH channel frame_F must have shape (128, 8).")
        # Interleave L/R subwindows column-wise via strided assignment.
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = chl_f
        packed[:, 1::2] = chr_f
        return packed

    # Non-ESH: expected (1024, 1) per channel in the Level-1 schema.
    if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
        raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")
    return np.hstack([chl_f, chr_f]).astype(np.float64, copy=False)
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Strip the boundary padding added by the Level-1 encoder
    (hop samples at the start and hop samples at the end).

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024).

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded stream, shape (N_pad - 2*hop, 2).

    Raises
    ------
    ValueError
        If y_pad is too short to unpad.
    """
    total = y_pad.shape[0]
    if total < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")
    return y_pad[hop:total - hop, :]
# -----------------------------------------------------------------------------
# Level 1 decoder (core)
# -----------------------------------------------------------------------------
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    Steps (behavior identical to the original level_1 implementation):
    - Re-pack per-channel spectra and synthesize each frame via IMDCT.
    - Overlap-add all frames into the padded stream (hop=1024).
    - Remove the encoder's hop padding at both ends.
    - Write the reconstructed stereo WAV file (48 kHz) and return the samples.

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded audio, stereo, shape (N, 2), dtype float64.
    """
    out_path = Path(filename_out)
    hop, win = 1024, 2048
    n_frames = len(aac_seq_1)

    # The padded stream covers the full OLA span:
    # last frame starts at (n_frames-1)*hop and spans win samples.
    total = (n_frames - 1) * hop + win
    stream: StereoSignal = np.zeros((total, 2), dtype=np.float64)

    for idx, frame in enumerate(aac_seq_1):
        ftype: FrameType = frame["frame_type"]
        wtype: WinType = frame["win_type"]
        left = np.asarray(frame["chl"]["frame_F"], dtype=np.float64)
        right = np.asarray(frame["chr"]["frame_F"], dtype=np.float64)

        spec: FrameF = aac_unpack_seq_channels_to_frame_f(ftype, left, right)
        synth: FrameT = aac_i_filter_bank(spec, ftype, wtype)  # (2048, 2)

        pos = idx * hop
        stream[pos:pos + win, :] += synth

    audio: StereoSignal = aac_remove_padding(stream, hop=hop)

    # Level 1 assumption: 48 kHz output.
    sf.write(str(out_path), audio, 48000)
    return audio
def _non_esh_column(ch_f: np.ndarray, side: str) -> np.ndarray:
    """Normalize a non-ESH channel spectrum to a (1024, 1) column vector."""
    if ch_f.shape == (1024,):
        return ch_f.reshape(1024, 1)
    if ch_f.shape == (1024, 1):
        return ch_f
    raise ValueError(f"Non-ESH {side} channel frame_F must be shape (1024,) or (1024, 1).")


def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-2 AAC decoder (inverse of aac_coder_2).

    Behavior matches the Level-1 decoder pipeline, with an additional
    inverse-TNS stage:
    - Per frame/channel: inverse TNS using the stored coefficients
    - Re-pack to stereo frame_F (delegates to aac_unpack_seq_channels_to_frame_f)
    - IMDCT + windowing
    - Overlap-add over frames
    - Remove Level-1 padding (hop samples start/end)
    - Write output WAV (48 kHz)

    Parameters
    ----------
    aac_seq_2 : AACSeq2
        Encoded sequence as produced by aac_coder_2().
    filename_out : Union[str, Path]
        Output WAV filename.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
    """
    filename_out = Path(filename_out)
    hop = 1024
    win = 2048
    K = len(aac_seq_2)
    if K <= 0:
        raise ValueError("aac_seq_2 must contain at least one frame.")

    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)

    for i, fr in enumerate(aac_seq_2):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]
        chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
        chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
        chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)

        # Inverse TNS per channel.
        chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
        chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)

        # Accept either (1024,) or (1024, 1) from the internal convention,
        # then reuse the Level-1 re-pack helper instead of duplicating it.
        if frame_type != "ESH":
            chl_f = _non_esh_column(chl_f, "left")
            chr_f = _non_esh_column(chr_f, "right")
        frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f)

        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
        start = i * hop
        y_pad[start:start + win, :] += frame_t_hat

    y = aac_remove_padding(y_pad, hop=hop)
    sf.write(str(filename_out), y, 48000)
    return y

View File

@ -0,0 +1,454 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank module
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking
#
# ------------------------------------------------------------
from __future__ import annotations
from core.aac_types import *
from scipy.signal.windows import kaiser
# Private helpers for Filterbank
# ------------------------------------------------------------
def _sin_window(N: int) -> Window:
"""
Build a sinusoidal (SIN) window of length N.
The AAC sinusoid window is:
w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N
Parameters
----------
N : int
Window length in samples.
Returns
-------
Window
1-D array of shape (N, ) with dtype float64.
"""
n = np.arange(N, dtype=np.float64)
return np.sin((np.pi / N) * (n + 0.5))
def _kbd_window(N: int, alpha: float) -> Window:
"""
Build a Kaiser-Bessel-Derived (KBD) window of length N.
This follows the standard KBD construction used in AAC:
1) Build a Kaiser kernel of length (N/2 + 1).
2) Form the left half by cumulative summation, normalization, and sqrt.
3) Mirror the left half to form the right half (symmetric full-length window).
Notes
-----
- N must be even (AAC uses N=2048 for long and N=256 for short).
- The assignment specifies alpha=6 for long windows and alpha=4 for short windows.
- The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context.
Parameters
----------
N : int
Window length in samples (must be even).
alpha : float
KBD alpha parameter.
Returns
-------
Window
1-D array of shape (N,) with dtype float64.
"""
half = N // 2
# Kaiser kernel length: half + 1 samples (0 .. half)
# beta = pi * alpha per the usual correspondence with the ISO definition
kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
csum = np.cumsum(kernel)
denom = csum[-1]
w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1
w_right = w_left[::-1] # mirror for second half
return np.concatenate([w_left, w_right])
def _long_window(win_type: WinType) -> Window:
    """
    Return the long AAC window (length 2048) for the selected family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (2048,).
    """
    if win_type == "KBD":
        # alpha = 6 for long windows (assignment-specific).
        return _kbd_window(2048, alpha=6.0)
    if win_type == "SIN":
        return _sin_window(2048)
    raise ValueError(f"Invalid win_type: {win_type!r}")
def _short_window(win_type: WinType) -> Window:
    """
    Return the short AAC window (length 256) for the selected family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (256,).
    """
    if win_type == "KBD":
        # alpha = 4 for short windows (assignment-specific).
        return _kbd_window(256, alpha=4.0)
    if win_type == "SIN":
        return _sin_window(256)
    raise ValueError(f"Invalid win_type: {win_type!r}")
def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window:
    """
    Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS.

    A single window family is used globally (no mixed KBD/SIN halves),
    so both the long and short windows come from the same family.

    Layouts:
    - "OLS": the long window Wl (2048).
    - "LSS": [Wl_left(1024), ones(448), Ws_right(128), zeros(448)].
    - "LPS": [zeros(448), Ws_left(128), ones(448), Wl_right(1024)].

    Parameters
    ----------
    frame_type : FrameType
        One of "OLS", "LSS", "LPS".
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (2048,).
    """
    long_w = _long_window(win_type)   # (2048,)
    if frame_type == "OLS":
        return long_w

    short_w = _short_window(win_type)  # (256,)
    seq = np.zeros(2048, dtype=np.float64)

    if frame_type == "LSS":
        # left long half | flat | right short half | silent tail
        seq[:1024] = long_w[:1024]
        seq[1024:1472] = 1.0
        seq[1472:1600] = short_w[128:]
        return seq

    if frame_type == "LPS":
        # silent head | left short half | flat | right long half
        seq[448:576] = short_w[:128]
        seq[576:1024] = 1.0
        seq[1024:] = long_w[1024:]
        return seq

    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
def _mdct(s: TimeSignal) -> MdctCoeffs:
"""
MDCT (direct form) as specified in the assignment.
Parameters
----------
s : TimeSignal
Windowed time samples, 1-D array of length N (N = 2048 or 256).
Returns
-------
MdctCoeffs
MDCT coefficients, 1-D array of length N/2.
Definition
----------
X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
where n0 = (N/2 + 1)/2.
"""
s = np.asarray(s, dtype=np.float64).reshape(-1)
N = int(s.shape[0])
if N not in (2048, 256):
raise ValueError("MDCT input length must be 2048 or 256.")
n0 = (N / 2.0 + 1.0) / 2.0
n = np.arange(N, dtype=np.float64) + n0
k = np.arange(N // 2, dtype=np.float64) + 0.5
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2)
X = 2.0 * (s @ C) # (N/2,)
return X
def _imdct(X: MdctCoeffs) -> TimeSignal:
"""
IMDCT (direct form) as specified in the assignment.
Parameters
----------
X : MdctCoeffs
MDCT coefficients, 1-D array of length K (K = 1024 or 128).
Returns
-------
TimeSignal
Reconstructed time samples, 1-D array of length N = 2K.
Definition
----------
s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
where n0 = (N/2 + 1)/2.
"""
X = np.asarray(X, dtype=np.float64).reshape(-1)
K = int(X.shape[0])
if K not in (1024, 128):
raise ValueError("IMDCT input length must be 1024 or 128.")
N = 2 * K
n0 = (N / 2.0 + 1.0) / 2.0
n = np.arange(N, dtype=np.float64) + n0
k = np.arange(K, dtype=np.float64) + 0.5
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K)
s = (2.0 / N) * (C @ X) # (N,)
return s
def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF:
    """
    ESH analysis for one channel.

    Eight 256-sample subwindows are taken from the central region of the
    frame (starts 448 + 128*j, j = 0..7, i.e. 50% overlap), windowed with
    the short window and transformed with the MDCT.

    Parameters
    ----------
    x_ch : FrameChannelT
        Time-domain channel frame (expected shape: (2048,)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelF
        Array (128, 8); column j holds the MDCT of the j-th short window.
    """
    short_w = _short_window(win_type)  # (256,)
    spectra = np.empty((128, 8), dtype=np.float64)
    for col in range(8):
        off = 448 + 128 * col
        spectra[:, col] = _mdct(x_ch[off:off + 256] * short_w)
    return spectra
def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
"""
Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).
Parameters
----------
frame_F : FrameF
Packed ESH spectrum (expected shape: (128, 16)).
Returns
-------
left : FrameChannelF
Left channel spectrum, shape (128, 8).
right : FrameChannelF
Right channel spectrum, shape (128, 8).
Notes
-----
Inverse mapping of the packing used in aac_filter_bank():
packed[:, 2*j] = left[:, j]
packed[:, 2*j+1] = right[:, j]
"""
if frame_F.shape != (128, 16):
raise ValueError("ESH frame_F must have shape (128, 16).")
left = np.empty((128, 8), dtype=np.float64)
right = np.empty((128, 8), dtype=np.float64)
for j in range(8):
left[:, j] = frame_F[:, 2 * j + 0]
right[:, j] = frame_F[:, 2 * j + 1]
return left, right
def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT:
    """
    ESH synthesis for one channel.

    Each of the 8 short IMDCTs yields 256 samples, which are windowed and
    overlap-added at starts 448 + 128*j (50% overlap). The result is ready
    for frame-level OLA at the caller.

    Parameters
    ----------
    X_esh : FrameChannelF
        MDCT coefficients for 8 short windows (expected shape: (128, 8)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelT
        Time-domain channel contribution, shape (2048,).
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")
    short_w = _short_window(win_type)  # (256,)
    acc = np.zeros(2048, dtype=np.float64)
    for col in range(8):
        off = 448 + 128 * col
        acc[off:off + 256] += _imdct(X_esh[:, col]) * short_w
    return acc
# -----------------------------------------------------------------------------
# Public Function prototypes (Level 1)
# -----------------------------------------------------------------------------
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis).

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    win_type : WinType
        Window type ("KBD" or "SIN") for the current frame.

    Returns
    -------
    frame_F : FrameF
        - OLS/LSS/LPS: (1024, 2) MDCT coefficients for both channels.
        - ESH: (128, 16), 8 subframes of (128, 2) interleaved column-wise.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    left: FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
    right: FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type == "ESH":
        # Interleave the per-channel (128, 8) spectra into (128, 16).
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = _filter_bank_esh_channel(left, win_type)
        packed[:, 1::2] = _filter_bank_esh_channel(right, win_type)
        return packed

    if frame_type in ("OLS", "LSS", "LPS"):
        w = _window_sequence(frame_type, win_type)  # (2048,)
        return np.column_stack([_mdct(left * w), _mdct(right * w)])  # (1024, 2)

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis).

    Parameters
    ----------
    frame_F : FrameF
        Frequency-domain MDCT coefficients as produced by aac_filter_bank().
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Reconstructed time-domain frame, stereo, shape (2048, 2).
    """
    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")
        Xl, Xr = _unpack_esh(frame_F)
        return np.column_stack([
            _i_filter_bank_esh_channel(Xl, win_type),
            _i_filter_bank_esh_channel(Xr, win_type),
        ])

    if frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")
        w = _window_sequence(frame_type, win_type)
        return np.column_stack([
            _imdct(frame_F[:, 0]) * w,
            _imdct(frame_F[:, 1]) * w,
        ])

    raise ValueError(f"Invalid frame_type: {frame_type!r}")

View File

@ -0,0 +1,60 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - SNR dB calculator
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements SNR calculation in dB
# ------------------------------------------------------------
from __future__ import annotations
from core.aac_types import StereoSignal
import numpy as np
def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
    """
    Overall SNR (dB) over all samples and channels after aligning shapes.

    Inputs are truncated to the common number of rows and columns before
    the comparison; 1-D inputs are treated as single-channel columns.

    Parameters
    ----------
    x_ref : StereoSignal
        Reference stereo stream.
    x_hat : StereoSignal
        Reconstructed stereo stream.

    Returns
    -------
    float
        SNR in dB.
        - +inf if the noise power is zero (checked first, so two all-zero
          inputs also yield +inf).
        - -inf if the signal power is zero.
    """
    ref = np.asarray(x_ref, dtype=np.float64)
    hat = np.asarray(x_hat, dtype=np.float64)
    if ref.ndim == 1:
        ref = ref[:, None]
    if hat.ndim == 1:
        hat = hat[:, None]

    rows = min(ref.shape[0], hat.shape[0])
    cols = min(ref.shape[1], hat.shape[1])
    ref = ref[:rows, :cols]
    hat = hat[:rows, :cols]

    p_signal = float(np.sum(ref * ref))
    p_noise = float(np.sum((ref - hat) ** 2))
    if p_noise <= 0.0:
        return float("inf")
    if p_signal <= 0.0:
        return float("-inf")
    return float(10.0 * np.log10(p_signal / p_noise))

View File

@ -0,0 +1,217 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Sequence Segmentation Control module
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Sequence Segmentation Control module (SSC).
# Selects and returns the frame type based on input parameters.
# ------------------------------------------------------------
from __future__ import annotations
from typing import Dict, Tuple
from core.aac_types import FrameType, FrameT, FrameChannelT
import numpy as np
# -----------------------------------------------------------------------------
# Private helpers for SSC
# -----------------------------------------------------------------------------
# See Table 1 in mm-2025-hw-v0.1.pdf
# Stereo merge table: maps the (left, right) per-channel decisions to one
# common frame type. As the entries show: "ESH" on either channel forces
# "ESH"; the mixed transition pairs ("LSS","LPS")/("LPS","LSS") also resolve
# to "ESH"; otherwise a transition type ("LSS"/"LPS") wins over "OLS".
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
"""
Detect whether the *next* frame (single channel) implies an attack, i.e. ESH
according to the assignment's criterion.
Parameters
----------
next_frame_channel : FrameChannelT
One channel of next_frame_T (expected shape: (2048,)).
Returns
-------
bool
True if an attack is detected (=> next frame predicted ESH), else False.
Notes
-----
The criterion is implemented as described in the spec:
1) Apply the high-pass filter:
H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
implemented in the time domain as:
y[n] = x[n] - x[n-1] + 0.5*y[n-1]
2) Split y into 16 segments of length 128 and compute segment energies s[l].
3) Compute the ratio:
ds[l] = s[l] / s[l-1]
4) An attack exists if there exists l in {1..7} such that:
s[l] > 1e-3 and ds[l] > 10
"""
# Local alias; expected to be a 1-D array of length 2048.
x = next_frame_channel
# High-pass filter reference implementation (scalar recurrence).
y = np.zeros_like(x)
prev_x = 0.0
prev_y = 0.0
for n in range(x.shape[0]):
xn = float(x[n])
yn = (xn - prev_x) + 0.5 * prev_y
y[n] = yn
prev_x = xn
prev_y = yn
# Segment energies over 16 blocks of 128 samples.
s = np.empty(16, dtype=np.float64)
for l in range(16):
a = l * 128
b = (l + 1) * 128
seg = y[a:b]
s[l] = float(np.sum(seg * seg))
# ds[l] for l>=1. For l=0 not defined, keep 0.
ds = np.zeros(16, dtype=np.float64)
eps = 1e-12 # Avoid division by zero without materially changing the logic.
for l in range(1, 16):
ds[l] = s[l] / max(s[l - 1], eps)
# Spec: check l in {1..7}.
for l in range(1, 8):
if (s[l] > 1e-3) and (ds[l] > 10.0):
return True
return False
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
"""
Decide the current frame type for a single channel based on the previous
frame type and whether the next frame is predicted to be ESH.
Rules (spec):
- If prev is "LSS" => current is "ESH"
- If prev is "LPS" => current is "OLS"
- If prev is "OLS" => current is "LSS" if attack else "OLS"
- If prev is "ESH" => current is "ESH" if attack else "LPS"
Parameters
----------
prev_frame_type : FrameType
Previous frame type (one of "OLS", "LSS", "ESH", "LPS").
attack : bool
True if the next frame is predicted ESH for this channel.
Returns
-------
FrameType
The per-channel decision for the current frame.
"""
if prev_frame_type == "LSS":
return "ESH"
if prev_frame_type == "LPS":
return "OLS"
if prev_frame_type == "OLS":
return "LSS" if attack else "OLS"
if prev_frame_type == "ESH":
return "ESH" if attack else "LPS"
raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Merge the per-channel frame type decisions into one common frame type
    via the stereo merge table from the spec.

    Parameters
    ----------
    ft_l : FrameType
        Frame type decision for the left channel.
    ft_r : FrameType
        Frame type decision for the right channel.

    Returns
    -------
    FrameType
        The merged common frame type.
    """
    merged = STEREO_MERGE_TABLE.get((ft_l, ft_r))
    if merged is None:
        raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}")
    return merged
# -----------------------------------------------------------------------------
# Public Function prototypes (Level 1)
# -----------------------------------------------------------------------------
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).

    Selects the frame type for the current frame (i) from:
    - the current time-domain stereo frame,
    - the next stereo frame (attack detection / lookahead),
    - the previous frame's type.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i (expected shape: (2048, 2)).
    next_frame_T : FrameT
        Next time-domain frame (i+1), used for attack detection
        (expected shape: (2048, 2)).
    prev_frame_type : FrameType
        Frame type chosen for frame (i-1).

    Returns
    -------
    FrameType
        One of: "OLS", "LSS", "ESH", "LPS".
    """
    for name, arr in (("frame_T", frame_T), ("next_frame_T", next_frame_T)):
        if arr.shape != (2048, 2):
            raise ValueError(f"{name} must have shape (2048, 2).")

    # Per-channel attack detection on the lookahead frame, then per-channel
    # decision from the shared previous type, finally the stereo merge.
    decisions = [
        _decide_frame_type(prev_frame_type, _detect_attack(next_frame_T[:, ch]))
        for ch in (0, 1)
    ]
    return _stereo_merge(decisions[0], decisions[1])

View File

@ -0,0 +1,549 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Temporal Noise Shaping (TNS)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Temporal Noise Shaping (TNS) module (Level 2).
#
# Public API:
# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type)
# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs)
#
# Notes (per assignment):
# - TNS is applied per channel (not stereo).
# - For ESH, TNS is applied independently to each of the 8 short subframes.
# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short)
# provided in TableB219.mat.
# - Predictor order is fixed to p = 4.
# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1.
# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Tuple
import numpy as np
from scipy.io import loadmat
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
from core.aac_types import *
# -----------------------------------------------------------------------------
# Private helpers
# -----------------------------------------------------------------------------
# Process-wide cache for the loaded Bark band tables (lazy, filled on first use).
_B219_CACHE: dict[str, FloatArray] | None = None


def _load_b219_tables() -> dict[str, FloatArray]:
    """
    Load TableB219.mat once and cache the contents.

    The project layout guarantees that a 'material' directory is discoverable
    from the current working directory (tests and level_123 entrypoints).

    Returns
    -------
    dict[str, FloatArray]
        Keys:
        - "B219a": long bands table (K = 1024 MDCT lines)
        - "B219b": short bands table (K = 128 MDCT lines)

    Raises
    ------
    FileNotFoundError
        If material/TableB219.mat cannot be found.
    ValueError
        If the .mat file lacks the required variables.
    """
    global _B219_CACHE
    if _B219_CACHE is None:
        mat_path = Path("material") / "TableB219.mat"
        if not mat_path.exists():
            raise FileNotFoundError("Could not locate material/TableB219.mat in the current working directory.")
        data = loadmat(str(mat_path))
        if "B219a" not in data or "B219b" not in data:
            raise ValueError("TableB219.mat missing required variables B219a and/or B219b.")
        _B219_CACHE = {
            "B219a": np.asarray(data["B219a"], dtype=np.float64),
            "B219b": np.asarray(data["B219b"], dtype=np.float64),
        }
    return _B219_CACHE
def _band_ranges_for_kcount(k_count: int) -> BandRanges:
    """
    Return Bark band index ranges [start, end] (inclusive) for the given
    MDCT line count.

    Parameters
    ----------
    k_count : int
        Number of MDCT lines: 1024 (long) or 128 (short / ESH subframe).

    Returns
    -------
    BandRanges (list[tuple[int, int]])
        Each tuple is (start_k, end_k), inclusive.

    Raises
    ------
    ValueError
        For unsupported k_count values or out-of-range table entries.
    """
    tables = _load_b219_tables()
    if k_count == 1024:
        table = tables["B219a"]
    elif k_count == 128:
        table = tables["B219b"]
    else:
        raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).")

    ranges: list[tuple[int, int]] = []
    for row in table:
        # Columns 1 and 2 hold the band's first and last MDCT line index.
        s, e = int(row[1]), int(row[2])
        if s < 0 or e < s or e >= k_count:
            raise ValueError("Invalid band table ranges for given k_count.")
        ranges.append((s, e))
    return ranges
# -----------------------------------------------------------------------------
# Core DSP helpers
# -----------------------------------------------------------------------------
def _smooth_sw_inplace(sw: MdctCoeffs) -> None:
"""
Smooth Sw(k) to reduce discontinuities between adjacent Bark bands.
The assignment applies two passes:
- Backward: Sw(k) = (Sw(k) + Sw(k+1))/2
- Forward: Sw(k) = (Sw(k) + Sw(k-1))/2
Parameters
----------
sw : MdctCoeffs
1-D array of length K (float64). Modified in-place.
"""
k_count = int(sw.shape[0])
for k in range(k_count - 2, -1, -1):
sw[k] = 0.5 * (sw[k] + sw[k + 1])
for k in range(1, k_count):
sw[k] = 0.5 * (sw[k] + sw[k - 1])
def _compute_sw(x: MdctCoeffs) -> MdctCoeffs:
    """
    Compute the spectral envelope Sw(k) from Bark-band energies and smooth it.

    Every MDCT line of band j gets sqrt(P(j)) where P(j) is the band energy;
    the result is then boundary-smoothed with _smooth_sw_inplace.

    Parameters
    ----------
    x : MdctCoeffs
        1-D MDCT line array, length K.

    Returns
    -------
    MdctCoeffs
        Sw(k), 1-D float64 array of length K.
    """
    vec = np.asarray(x, dtype=np.float64).reshape(-1)
    sw = np.zeros(vec.shape[0], dtype=np.float64)
    for lo, hi in _band_ranges_for_kcount(vec.shape[0]):
        band = vec[lo : hi + 1]
        # Whole band shares a single value: sqrt of the band energy P(j).
        sw[lo : hi + 1] = float(np.sqrt(float(np.sum(band * band))))
    _smooth_sw_inplace(sw)
    return sw
def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs:
"""
Autocorrelation r(m) for m=0..p.
Parameters
----------
x : MdctCoeffs
1-D signal.
p : int
Maximum lag.
Returns
-------
MdctCoeffs
r, shape (p+1,), float64.
"""
x = np.asarray(x, dtype=np.float64).reshape(-1)
n = int(x.shape[0])
r = np.zeros(p + 1, dtype=np.float64)
for m in range(p + 1):
r[m] = float(np.dot(x[m:], x[: n - m]))
return r
def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs:
    """
    Solve the Yule-Walker normal equations for an order-p linear predictor.

    Parameters
    ----------
    xw : MdctCoeffs
        1-D normalized sequence Xw(k).
    p : int
        Predictor order.

    Returns
    -------
    MdctCoeffs
        LPC coefficients a[0..p-1], shape (p,), float64.
    """
    r = _autocorr(xw, p)
    # The system matrix is symmetric Toeplitz: R[i, j] = r(|i - j|).
    lag_idx = np.arange(p)
    R = r[np.abs(lag_idx[:, None] - lag_idx[None, :])]
    # Tiny diagonal regularization keeps the solve well-posed when the
    # autocorrelation matrix is (near-)singular.
    return np.linalg.solve(R + 1e-12 * np.eye(p, dtype=np.float64), r[1 : p + 1])
def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs:
    """
    Snap LPC coefficients onto the uniform symmetric quantizer grid and clamp.

    Grid step is QUANT_STEP; values are clipped to [-QUANT_MAX, QUANT_MAX].

    Parameters
    ----------
    a : MdctCoeffs
        LPC coefficient array, shape (p,).

    Returns
    -------
    MdctCoeffs
        Quantized coefficients, shape (p,), float64.
    """
    flat = np.asarray(a, dtype=np.float64).reshape(-1)
    snapped = np.round(flat / QUANT_STEP) * QUANT_STEP
    clamped = np.clip(snapped, -QUANT_MAX, QUANT_MAX)
    return clamped.astype(np.float64, copy=False)
def _is_inverse_stable(a_q: MdctCoeffs) -> bool:
"""
Check stability of the inverse TNS filter H_TNS^{-1}.
Forward filter:
H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
Inverse filter poles are roots of:
A(z) = 1 - a1 z^-1 - ... - ap z^-p
Multiply by z^p:
z^p - a1 z^{p-1} - ... - ap = 0
Stability condition:
all roots satisfy |z| < 1.
Parameters
----------
a_q : MdctCoeffs
Quantized predictor coefficients, shape (p,).
Returns
-------
bool
True if stable, else False.
"""
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
# Polynomial in z: z^p - a1 z^{p-1} - ... - ap
poly = np.empty(p + 1, dtype=np.float64)
poly[0] = 1.0
poly[1:] = -a_q
roots = np.roots(poly)
# Strictly inside unit circle for stability. Add tiny margin for numeric safety.
margin = 1e-12
return bool(np.all(np.abs(roots) < (1.0 - margin)))
def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs:
    """
    Return a stable variant of the quantized predictor coefficients.

    Policy:
      1. already stable      -> returned untouched
      2. otherwise           -> shrink by gamma in {0.9, 0.8, ..., 0.1},
                                re-quantize onto the 0.1 grid, keep the first
                                stable candidate
      3. still unstable      -> all-zero coefficients (TNS disabled)

    Parameters
    ----------
    a_q : MdctCoeffs
        Quantized predictor coefficients, shape (p,).

    Returns
    -------
    MdctCoeffs
        Stable quantized coefficients, shape (p,).
    """
    coeffs = np.asarray(a_q, dtype=np.float64).reshape(-1)
    if _is_inverse_stable(coeffs):
        return coeffs
    for tenths in range(9, 0, -1):
        candidate = _quantize_coeffs((tenths / 10.0) * coeffs)
        if _is_inverse_stable(candidate):
            return candidate
    # Last resort: disable TNS for this vector.
    return np.zeros_like(coeffs, dtype=np.float64)
def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
"""
Apply forward TNS FIR filter:
y[k] = x[k] - sum_{l=1..p} a_l * x[k-l]
Parameters
----------
x : MdctCoeffs
1-D MDCT lines, length K.
a_q : MdctCoeffs
Quantized LPC coefficients, shape (p,).
Returns
-------
MdctCoeffs
Filtered MDCT lines y, length K.
"""
x = np.asarray(x, dtype=np.float64).reshape(-1)
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
k_count = int(x.shape[0])
y = np.zeros(k_count, dtype=np.float64)
for k in range(k_count):
acc = x[k]
for l in range(1, p + 1):
if k - l >= 0:
acc -= a_q[l - 1] * x[k - l]
y[k] = acc
return y
def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
"""
Apply inverse TNS IIR filter:
x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l]
Parameters
----------
y : MdctCoeffs
1-D MDCT lines after TNS, length K.
a_q : MdctCoeffs
Quantized LPC coefficients, shape (p,).
Returns
-------
MdctCoeffs
Reconstructed MDCT lines x_hat, length K.
"""
y = np.asarray(y, dtype=np.float64).reshape(-1)
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
k_count = int(y.shape[0])
x_hat = np.zeros(k_count, dtype=np.float64)
for k in range(k_count):
acc = y[k]
for l in range(1, p + 1):
if k - l >= 0:
acc += a_q[l - 1] * x_hat[k - l]
x_hat[k] = acc
return x_hat
def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]:
    """
    Run the full TNS chain on a single MDCT vector (one long frame or one
    short ESH subframe).

    Pipeline:
      1) Sw(k) envelope from Bark-band energies (smoothed).
      2) Normalize: Xw(k) = X(k) / Sw(k) (0 where the envelope is ~0).
      3) LPC fit of order PRED_ORDER on Xw.
      4) Quantize (4-bit symmetric grid, step QUANT_STEP) and enforce
         inverse-filter stability.
      5) FIR-filter the ORIGINAL (un-normalized) X(k) with the quantized
         coefficients.

    Returns
    -------
    y : MdctCoeffs
        TNS-processed MDCT vector (same length as x).
    a_q : MdctCoeffs
        Stable quantized LPC coefficients, shape (PRED_ORDER,).
    """
    vec = np.asarray(x, dtype=np.float64).reshape(-1)
    envelope = _compute_sw(vec)
    normalized = np.where(envelope > 1e-12, vec / envelope, 0.0)
    raw = _lpc_coeffs(normalized, PRED_ORDER)
    # Stability enforcement is an assignment requirement.
    quantized = _stabilize_quantized_coeffs(_quantize_coeffs(raw))
    return _apply_tns_fir(vec, quantized), quantized
# -----------------------------------------------------------------------------
# Public Functions (Level 2)
# -----------------------------------------------------------------------------
def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]:
    """
    Temporal Noise Shaping (TNS) for ONE channel.

    Parameters
    ----------
    frame_F_in : FrameChannelF
        Per-channel MDCT coefficients:
          - "ESH": (128, 8)
          - else:  (1024,) or (1024, 1)
    frame_type : FrameType
        Frame type code ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    frame_F_out : FrameChannelF
        TNS-processed coefficients, same shape convention as the input.
    tns_coeffs : TnsCoeffs
        Quantized predictor coefficients:
          - "ESH": (PRED_ORDER, 8)
          - else:  (PRED_ORDER, 1)
    """
    spectrum = np.asarray(frame_F_in, dtype=np.float64)

    if frame_type == "ESH":
        if spectrum.shape != (128, 8):
            raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
        filtered = np.empty_like(spectrum, dtype=np.float64)
        coeffs = np.empty((PRED_ORDER, 8), dtype=np.float64)
        # One independent TNS run per short subframe (column).
        for sub in range(8):
            filtered[:, sub], coeffs[:, sub] = _tns_one_vector(spectrum[:, sub])
        return filtered, coeffs

    if spectrum.shape not in ((1024,), (1024, 1)):
        raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
    filtered_vec, quantized = _tns_one_vector(spectrum.reshape(-1))
    tns_out = quantized.reshape(PRED_ORDER, 1)
    # Preserve the caller's shape convention: 1-D in -> 1-D out.
    if spectrum.ndim == 1:
        return filtered_vec, tns_out
    return filtered_vec.reshape(1024, 1), tns_out
def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF:
    """
    Inverse Temporal Noise Shaping (iTNS) for ONE channel.

    Parameters
    ----------
    frame_F_in : FrameChannelF
        Per-channel MDCT coefficients after TNS:
          - "ESH": (128, 8)
          - else:  (1024,) or (1024, 1)
    frame_type : FrameType
        Frame type code ("OLS", "LSS", "ESH", "LPS").
    tns_coeffs : TnsCoeffs
        Quantized predictor coefficients:
          - "ESH": (PRED_ORDER, 8)
          - else:  (PRED_ORDER, 1)

    Returns
    -------
    FrameChannelF
        Reconstructed MDCT coefficients, same shape convention as frame_F_in.
    """
    spectrum = np.asarray(frame_F_in, dtype=np.float64)
    predictors = np.asarray(tns_coeffs, dtype=np.float64)

    if frame_type == "ESH":
        if spectrum.shape != (128, 8):
            raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
        if predictors.shape != (PRED_ORDER, 8):
            raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).")
        restored = np.empty_like(spectrum, dtype=np.float64)
        for sub in range(8):
            restored[:, sub] = _apply_itns_iir(spectrum[:, sub], predictors[:, sub])
        return restored

    if predictors.shape != (PRED_ORDER, 1):
        raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).")
    if spectrum.shape == (1024,):
        return _apply_itns_iir(spectrum, predictors[:, 0])
    if spectrum.shape == (1024, 1):
        return _apply_itns_iir(spectrum[:, 0], predictors[:, 0]).reshape(1024, 1)
    raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')

View File

@ -0,0 +1,282 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Public Type Aliases
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements Public Type aliases
# ------------------------------------------------------------
from __future__ import annotations
from typing import List, Literal, TypeAlias, TypedDict
import numpy as np
from numpy.typing import NDArray
# -----------------------------------------------------------------------------
# Code enums (for readability; not intended to enforce shapes/lengths)
# -----------------------------------------------------------------------------
FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"]
"""
Frame type codes (AAC):
- "OLS": ONLY_LONG_SEQUENCE
- "LSS": LONG_START_SEQUENCE
- "ESH": EIGHT_SHORT_SEQUENCE
- "LPS": LONG_STOP_SEQUENCE
"""
WinType: TypeAlias = Literal["KBD", "SIN"]
"""
Window type codes (AAC):
- "KBD": Kaiser-Bessel-Derived
- "SIN": sinusoid
"""
ChannelKey: TypeAlias = Literal["chl", "chr"]
"""Channel dictionary keys used in Level payloads."""
# -----------------------------------------------------------------------------
# Array “semantic” aliases
#
# Goal: communicate meaning (time/frequency/window, stereo/channel) without
# forcing strict shapes in the type system.
# -----------------------------------------------------------------------------
FloatArray: TypeAlias = NDArray[np.float64]
"""
Generic float64 NumPy array.
Note:
- We standardize internal numeric computations to float64 for stability and
reproducibility. External I/O can still be float32, but we convert at the
boundaries.
"""
Window: TypeAlias = FloatArray
"""
Time-domain window (weighting sequence), 1-D.
Typical lengths in this assignment:
- Long: 2048
- Short: 256
- Window sequences for LSS/LPS are also 2048
Expected shape: (N,)
dtype: float64
"""
TimeSignal: TypeAlias = FloatArray
"""
Time-domain signal samples, typically 1-D.
Examples:
- Windowed MDCT input: shape (N,)
- IMDCT output: shape (N,)
dtype: float64
"""
StereoSignal: TypeAlias = FloatArray
"""
Time-domain stereo signal stream.
Expected (typical) shape: (N, 2)
- axis 0: time samples
- axis 1: channels [L, R]
dtype: float64
"""
MdctCoeffs: TypeAlias = FloatArray
"""
MDCT coefficient vector, typically 1-D.
Examples:
- Long: shape (1024,)
- Short: shape (128,)
dtype: float64
"""
MdctFrameChannel: TypeAlias = FloatArray
"""
Per-channel MDCT container used in Level-1/2 sequences.
Typical shapes:
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
dtype: float64
Notes
-----
Some parts of the assignment store long-frame coefficients as a column vector
(1024, 1) to match MATLAB conventions. Internally you may also use (1024,)
when convenient, but the semantic meaning is identical.
"""
TnsCoeffs: TypeAlias = FloatArray
"""
Quantized TNS predictor coefficients (one channel).
Typical shapes (Level 2):
- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes)
- Else: (4, 1) (order p=4 for the long frame)
dtype: float64
Notes
-----
The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1.
We store the quantized coefficient values as float64 (typically multiples of 0.1)
to keep the pipeline simple and readable.
"""
FrameT: TypeAlias = FloatArray
"""
Time-domain frame (stereo), as used by the filterbank input/output.
Expected (typical) shape for stereo: (2048, 2)
- axis 0: time samples
- axis 1: channels [L, R]
dtype: float64
"""
FrameChannelT: TypeAlias = FloatArray
"""
Time-domain single-channel frame.
Expected (typical) shape: (2048,)
dtype: float64
"""
FrameF: TypeAlias = FloatArray
"""
Frequency-domain frame (MDCT coefficients), stereo container.
Typical shapes (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: (1024, 2)
- If frame_type == "ESH": (128, 16)
Rationale for ESH (128, 16):
- 8 short subframes per channel => 8 * 2 = 16 columns total
- Each short subframe per stereo is (128, 2), flattened into columns
in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R]
dtype: float64
"""
FrameChannelF: TypeAlias = MdctFrameChannel
"""
Frequency-domain single-channel MDCT coefficients.
Typical shapes (Level 1/2):
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
- If frame_type == "ESH": (128, 8)
dtype: float64
"""
BandRanges: TypeAlias = list[tuple[int, int]]
"""
Bark-band index ranges [start, end] (inclusive) for MDCT lines.
Used by TNS to map MDCT indices k to Bark bands.
"""
# -----------------------------------------------------------------------------
# Level 1 AAC sequence payload types
# -----------------------------------------------------------------------------
class AACChannelFrameF(TypedDict):
    """
    Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).

    Keys
    ----
    frame_F:
        The MDCT coefficients for ONE channel.
        Typical shapes:
          - ESH: (128, 8) (8 short subframes)
          - else: (1024, 1) or (1024,)
    """
    frame_F: FrameChannelF
class AACSeq1Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_1 (Level 1).

    Keys: "frame_type", "win_type", and the per-channel payloads
    "chl" (left) / "chr" (right).
    """
    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF
    chr: AACChannelFrameF
AACSeq1: TypeAlias = List[AACSeq1Frame]
"""
AAC sequence for Level 1:
List of length K (K = number of frames).
Each element is a dict with keys:
  - "frame_type", "win_type", "chl", "chr"
"""
# -----------------------------------------------------------------------------
# Level 2 AAC sequence payload types (TNS)
# -----------------------------------------------------------------------------
class AACChannelFrameF2(TypedDict):
    """
    Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2).

    Keys
    ----
    frame_F:
        The TNS-processed MDCT coefficients for ONE channel.
        Typical shapes:
          - ESH: (128, 8)
          - else: (1024, 1) or (1024,)
    tns_coeffs:
        Quantized TNS predictor coefficients for ONE channel.
        Typical shapes:
          - ESH: (PRED_ORDER, 8)
          - else: (PRED_ORDER, 1)
    """
    frame_F: FrameChannelF
    tns_coeffs: TnsCoeffs
class AACSeq2Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_2 (Level 2).

    Same layout as AACSeq1Frame, with per-channel TNS data added
    (see AACChannelFrameF2).
    """
    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF2
    chr: AACChannelFrameF2
AACSeq2: TypeAlias = List[AACSeq2Frame]
"""
AAC sequence for Level 2:
List of length K (K = number of frames).
Each element is a dict with keys:
  - "frame_type", "win_type", "chl", "chr"
Level 2 adds:
  - per-channel "tns_coeffs"
and stores:
  - per-channel "frame_F" after applying TNS.
"""

View File

@ -19,3 +19,125 @@
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from core.aac_types import AACSeq2, StereoSignal
from core.aac_coder import aac_coder_2 as core_aac_coder_2
from core.aac_coder import aac_read_wav_stereo_48k
from core.aac_decoder import aac_decoder_2 as core_aac_decoder_2
from core.aac_snr_db import snr_db
# -----------------------------------------------------------------------------
# Public Level 2 API (wrappers)
# -----------------------------------------------------------------------------
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
    """
    Level-2 AAC encoder (thin wrapper).

    Simply delegates to the core implementation.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename; the assignment assumes stereo audio at 48 kHz.

    Returns
    -------
    AACSeq2
        List of encoded frames (Level 2 schema).
    """
    encoded_sequence = core_aac_coder_2(filename_in)
    return encoded_sequence
def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-2 AAC decoder (thin wrapper).

    Simply delegates to the core implementation.

    Parameters
    ----------
    aac_seq_2 : AACSeq2
        Encoded sequence as produced by aac_coder_2().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded time-domain audio, stereo, shape (N, 2), dtype float64.
    """
    decoded_signal = core_aac_decoder_2(aac_seq_2, filename_out)
    return decoded_signal
# -----------------------------------------------------------------------------
# Demo (Level 2)
# -----------------------------------------------------------------------------
def demo_aac_2(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float:
    """
    End-to-end demonstration of the Level-2 codec.

    Encodes filename_in, decodes the result to filename_out, and returns
    the overall SNR (dB) between the original and the decoded audio.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    filename_out : Union[str, Path]
        Output WAV filename (stereo, 48 kHz).

    Returns
    -------
    float
        Overall SNR in dB.
    """
    in_path = Path(filename_in)
    out_path = Path(filename_out)

    # Reference signal, read with the same validation the codec applies.
    reference, fs_ref = aac_read_wav_stereo_48k(in_path)
    if int(fs_ref) != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    # Encode, then decode straight back.
    decoded = i_aac_coder_2(aac_coder_2(in_path), out_path)

    # Sanity check: the decoder must have produced a readable 48 kHz file.
    _, fs_decoded = sf.read(str(out_path), always_2d=True)
    if int(fs_decoded) != 48000:
        raise ValueError("Decoded output sampling rate must be 48 kHz.")

    return snr_db(reference, decoded)
# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Usage examples:
    #   cd level_2
    #   python -m level_2 input.wav output.wav
    #   python -m level_2 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l2.wav
    import sys

    if len(sys.argv) != 3:
        raise SystemExit("Usage: python -m level_2 <input.wav> <output.wav>")
    wav_in, wav_out = Path(sys.argv[1]), Path(sys.argv[2])
    print(f"Encoding/Decoding {wav_in} to {wav_out}")
    print(f"SNR = {demo_aac_2(wav_in, wav_out):.3f} dB")

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,400 @@
import numpy as np
import scipy.io as sio
import os
# ------------------ LOAD LUT ------------------
def load_LUT(mat_filename=None):
    """
    Load the Huffman codebooks (LUTs) from huffCodebooks.mat and build the
    matching inverse (decoding) tables.

    Parameters
    ----------
    mat_filename : str, optional
        Path to huffCodebooks.mat; defaults to the file next to this module.

    Returns
    -------
    huffLUT : list
        Indices 1..11 hold codebook dicts; index 0 is unused.
    """
    if mat_filename is None:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        mat_filename = os.path.join(current_dir, "huffCodebooks.mat")
    raw_books = sio.loadmat(mat_filename)['huffCodebooks'].squeeze()
    books = [np.array(raw_books[i]) for i in range(11)]

    # Inverse VLC tables: column 2 holds codeword lengths, column 3 the
    # codeword values; render each as a '0'/'1' string and build the trie.
    inv_tables = []
    for book in books:
        lengths = book[:, 1].astype(int)
        values = book[:, 2].astype(int)
        words = [format(v, f'0{l}b') for v, l in zip(values, lengths)]
        inv_tables.append(vlc_table(words))

    # (nTupleSize, maxAbsCodeVal, signedValues) per AAC codebook 1..11.
    params = [
        (4, 1, True),
        (4, 1, True),
        (4, 2, False),
        (4, 2, False),
        (2, 4, True),
        (2, 4, True),
        (2, 7, False),
        (2, 7, False),
        (2, 12, False),
        (2, 12, False),
        (2, 16, False),
    ]
    huffLUT = [None] * 12  # index 0 unused
    for cb, (tuple_len, max_abs, is_signed) in enumerate(params, start=1):
        huffLUT[cb] = {
            'LUT': books[cb - 1],
            'invTable': inv_tables[cb - 1],
            'codebook': cb,
            'nTupleSize': tuple_len,
            'maxAbsCodeVal': max_abs,
            'signedValues': is_signed,
        }
    return huffLUT
def vlc_table(code_array):
    """
    Build an inverse VLC (prefix code) lookup table.

    Parameters
    ----------
    code_array : list of str
        Huffman codewords as '0'/'1' strings; the symbol index is the
        1-based position in this list.

    Returns
    -------
    h : NumPy array of shape (num_nodes, 3), int
        columns: [next_if_0, next_if_1, symbol_index]
        Row 0 is the root; a 0 in a child column means "no child"
        (valid because no edge ever points back to the root).
    """
    # Accumulate rows in a Python list and convert once at the end:
    # the original np.vstack per new node copies the whole table each
    # time, which is quadratic in the number of trie nodes.
    nodes = [[0, 0, 0]]
    for code_index, code in enumerate(code_array, start=1):
        node = 0
        for bit_char in code:
            branch = int(bit_char)
            child = nodes[node][branch]
            if child == 0:
                nodes.append([0, 0, 0])
                child = len(nodes) - 1
                nodes[node][branch] = child
            node = child
        nodes[node][2] = code_index
    return np.array(nodes, dtype=int)
# ------------------ ENCODE ------------------
def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None):
    """
    Huffman-encode a sequence of quantized coefficients.

    Selects the Huffman codebook from the maximum absolute coefficient value
    (trying both codebooks of each AAC pair and keeping the shorter
    bitstream), encodes the coefficients, and returns the bitstream together
    with the chosen codebook index. Python equivalent of MATLAB
    `encodeHuff.m`; the input is grouped into fixed-size tuples as defined by
    the chosen LUT, with zero-padding applied internally.

    Parameters
    ----------
    coeff_sec : array_like of int
        1-D array of quantized integer coefficients to encode
        (typically one section / scale-factor band).
    huff_LUT_list : list
        Huffman LUT dicts as returned by `load_LUT()`; indices 1..11 are
        valid codebooks, index 0 is unused.
    force_codebook : int, optional
        If given, encode with this codebook unconditionally.
        NOTE(review): this path returns ONLY the bitstream string (no
        codebook index), unlike the normal path; kept as-is for backward
        compatibility, but callers should be aware of the asymmetry.

    Returns
    -------
    huffSec : str
        Huffman-encoded bitstream as a string of '0'/'1' characters.
    huffCodebook : int
        Codebook index (1..11) used for encoding; 0 marks a special
        all-zero section.
    """
    if force_codebook is not None:
        return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec)

    maxAbsVal = np.max(np.abs(coeff_sec))
    if maxAbsVal == 0:
        # Special all-zero section: empty bitstream, codebook 0.
        return huff_LUT_code_0(), 0
    if maxAbsVal > 15:
        # Magnitudes above 15 require codebook 11 with escape sequences.
        return huff_LUT_code_ESC(huff_LUT_list[11], coeff_sec), 11

    # AAC codebook pairs keyed by the maximum absolute value;
    # 13..15 fit only in codebook 11 (without escapes).
    if maxAbsVal == 1:
        candidates = (1, 2)
    elif maxAbsVal == 2:
        candidates = (3, 4)
    elif maxAbsVal <= 4:
        candidates = (5, 6)
    elif maxAbsVal <= 7:
        candidates = (7, 8)
    elif maxAbsVal <= 12:
        candidates = (9, 10)
    else:
        candidates = (11,)

    # Encode with each candidate and keep the shortest bitstream; on a tie
    # the lower-numbered codebook wins (matches the original behavior).
    huffSec, huffCodebook = None, None
    for cb in candidates:
        sec = huff_LUT_code_1(huff_LUT_list[cb], coeff_sec)
        if huffSec is None or len(sec) < len(huffSec):
            huffSec, huffCodebook = sec, cb
    return huffSec, huffCodebook
def huff_LUT_code_1(huff_LUT, coeff_sec):
    """
    Encode a coefficient section with one specific Huffman codebook.

    Coefficients are grouped into n-tuples (zero-padded at the end); each
    tuple is mapped to a LUT row index via a mixed-radix expansion and the
    matching codeword is emitted. For unsigned codebooks, one sign bit per
    tuple element ('1' = negative) follows each codeword; signed codebooks
    embed the sign in the codeword itself.
    """
    table = huff_LUT['LUT']
    tuple_len = huff_LUT['nTupleSize']
    max_abs = huff_LUT['maxAbsCodeVal']
    is_signed = huff_LUT['signedValues']

    if is_signed:
        # Shift [-max_abs, max_abs] onto [0, 2*max_abs] before indexing.
        shifted = coeff_sec + max_abs
        radix = 2 * max_abs + 1
    else:
        shifted = coeff_sec
        radix = max_abs + 1

    n_tuples = int(np.ceil(len(coeff_sec) / tuple_len))
    padded = np.zeros(n_tuples * tuple_len, dtype=int)
    padded[:len(shifted)] = shifted

    weights = radix ** np.arange(tuple_len - 1, -1, -1)
    pieces = []
    for group in padded.reshape(n_tuples, tuple_len):
        row = int(np.abs(group) @ weights)
        word = format(int(table[row, 2]), f'0{int(table[row, 1])}b')
        if is_signed:
            pieces.append(word)
        else:
            sign_bits = ''.join('1' if v < 0 else '0' for v in group)
            pieces.append(word + sign_bits)
    return ''.join(pieces)
def huff_LUT_code_0():
    """Codebook 0 (all-zero section): nothing is emitted."""
    return ""
def huff_LUT_code_ESC(huff_LUT, coeff_sec):
    """
    Encode with codebook 11 using escape sequences for magnitudes > 15.

    Each n-tuple is coded with its magnitudes clamped to 16 (the ESC symbol),
    followed by one sign bit per element, then one escape sequence per
    clamped value: N ones, a terminating zero, and N4 = floor(log2(|v|))
    bits holding |v| - 2^N4 (with N = N4 - 4).
    """
    table = huff_LUT['LUT']
    tuple_len = huff_LUT['nTupleSize']
    radix = huff_LUT['maxAbsCodeVal'] + 1

    n_tuples = int(np.ceil(len(coeff_sec) / tuple_len))
    padded = np.zeros(n_tuples * tuple_len, dtype=int)
    padded[:len(coeff_sec)] = coeff_sec

    weights = radix ** np.arange(tuple_len - 1, -1, -1)
    pieces = []
    for group in padded.reshape(n_tuples, tuple_len):
        # eps stand-in for zeros keeps log2 defined; max(0, .) clamps it away.
        safe = group.astype(float)
        safe[safe == 0] = np.finfo(float).eps
        n4 = np.maximum(0, np.floor(np.log2(np.abs(safe))).astype(int))
        needs_esc = np.abs(group) > 15
        clamped = group.copy()
        clamped[needs_esc] = np.sign(clamped[needs_esc]) * 16

        row = int(np.abs(clamped) @ weights)
        word = format(int(table[row, 2]), f'0{int(table[row, 1])}b')
        sign_bits = ''.join('1' if v < 0 else '0' for v in group)

        esc_bits = ''
        for k in range(tuple_len):
            if needs_esc[k]:
                esc_bits += '1' * max(0, n4[k] - 4)
                esc_bits += '0'
                esc_bits += format(abs(group[k]) - (1 << n4[k]), f'0{n4[k]}b')
        pieces.append(word + sign_bits + esc_bits)
    return ''.join(pieces)
# ------------------ DECODE ------------------
def decode_huff(huff_sec, huff_LUT):
    """
    Decode a Huffman-encoded stream.

    Parameters
    ----------
    huff_sec : array-like of int or str
        Huffman encoded stream as a sequence of 0 and 1 (string or list/array).
    huff_LUT : dict
        Huffman lookup table with keys:
        - 'invTable': inverse table (numpy array, see vlc_table)
        - 'codebook': codebook number
        - 'nTupleSize': tuple size
        - 'maxAbsCodeVal': maximum absolute code value
        - 'signedValues': True/False

    Returns
    -------
    decCoeffs : list of int
        Decoded quantized coefficients.

    Notes
    -----
    Two fixes versus the previous implementation, both required for
    consistency with this module's encoder (huff_LUT_code_1 /
    huff_LUT_code_ESC):
    - Sign bits are consumed only for UNSIGNED codebooks; signed codebooks
      (1, 2, 5, 6) embed the sign in the codeword and the encoder emits
      no sign bits for them.
    - The escape payload is read AFTER the '0' that terminates the unary
      prefix (previously the terminator was included and the last payload
      bit dropped).
    """
    h = huff_LUT['invTable']
    huffCodebook = huff_LUT['codebook']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    signedValues = huff_LUT['signedValues']

    # Convert string to array of ints
    if isinstance(huff_sec, str):
        huff_sec = np.array([int(b) for b in huff_sec])

    base = (2 * maxAbsCodeVal + 1) if signedValues else (maxAbsCodeVal + 1)
    decCoeffs = []
    streamIndex = 0
    while streamIndex < len(huff_sec):
        # Walk the inverse table until a leaf (both children 0) is reached.
        r = 0
        while True:
            r = h[r, huff_sec[streamIndex]]
            streamIndex += 1
            if h[r, 0] == 0 and h[r, 1] == 0:
                symbolIndex = h[r, 2] - 1  # zero-based
                break

        # Split the symbol index into the n-tuple of values (base-`base` digits).
        nTupleDec = []
        tmp = symbolIndex
        for p in reversed(range(nTupleSize)):
            digit = tmp // (base ** p)
            tmp = tmp % (base ** p)
            # Signed codebooks store value + maxAbsCodeVal in the digit.
            nTupleDec.append(digit - maxAbsCodeVal if signedValues else digit)
        nTupleDec = np.array(nTupleDec)

        if not signedValues:
            # Unsigned codebooks: one sign bit per element follows the codeword.
            signBits = huff_sec[streamIndex:streamIndex + nTupleSize]
            streamIndex += nTupleSize
            nTupleSign = np.where(signBits == 1, -1, 1)
            nTupleDec = nTupleDec * nTupleSign

            # Codebook 11: magnitude 16 marks an escape sequence.
            if huffCodebook == 11:
                for idx in np.where(np.abs(nTupleDec) == 16)[0]:
                    # Unary prefix: N ones terminated by a single zero.
                    N = 0
                    while huff_sec[streamIndex + N]:
                        N += 1
                    N4 = N + 4
                    # Payload starts after the '0' terminator.
                    payload = huff_sec[streamIndex + N + 1: streamIndex + N + 1 + N4]
                    escape_value = 2 ** N4 + int("".join(map(str, payload)), 2)
                    nTupleDec[idx] = escape_value * nTupleSign[idx]
                    streamIndex += N + 1 + N4

        decCoeffs.extend(nTupleDec.tolist())
    return decCoeffs