Level 2: Core functionality and level_2 script wrappers added
This commit is contained in:
parent
399abebd2c
commit
9931e3830a
@ -30,6 +30,7 @@ import soundfile as sf
|
||||
from core.aac_configuration import WIN_TYPE
|
||||
from core.aac_filterbank import aac_filter_bank
|
||||
from core.aac_ssc import aac_SSC
|
||||
from core.aac_tns import aac_tns
|
||||
from core.aac_types import *
|
||||
|
||||
|
||||
@ -144,8 +145,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
||||
AACSeq1
|
||||
List of encoded frames (Level 1 schema).
|
||||
"""
|
||||
x, fs = aac_read_wav_stereo_48k(filename_in)
|
||||
_ = fs # kept for clarity; The assignment assumes 48 kHz
|
||||
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||
# The assignment assumes 48 kHz
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
@ -196,3 +197,88 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
||||
prev_frame_type = frame_type
|
||||
|
||||
return aac_seq
|
||||
|
||||
|
||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
||||
"""
|
||||
Level-2 AAC encoder (Level 1 + TNS).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename_in : Union[str, Path]
|
||||
Input WAV filename (stereo, 48 kHz).
|
||||
|
||||
Returns
|
||||
-------
|
||||
AACSeq2
|
||||
Encoded AAC sequence (Level 2 payload schema).
|
||||
For each frame i:
|
||||
- "frame_type": FrameType
|
||||
- "win_type": WinType
|
||||
- "chl"/"chr":
|
||||
- "frame_F": FrameChannelF (after TNS)
|
||||
- "tns_coeffs": TnsCoeffs
|
||||
"""
|
||||
filename_in = Path(filename_in)
|
||||
|
||||
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||
# The assignment assumes 48 kHz
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
|
||||
pad_pre = np.zeros((hop, 2), dtype=np.float64)
|
||||
pad_post = np.zeros((hop, 2), dtype=np.float64)
|
||||
x_pad = np.vstack([pad_pre, x, pad_post])
|
||||
|
||||
K = int((x_pad.shape[0] - win) // hop + 1)
|
||||
if K <= 0:
|
||||
raise ValueError("Input too short for framing.")
|
||||
|
||||
aac_seq: AACSeq2 = []
|
||||
prev_frame_type: FrameType = "OLS"
|
||||
|
||||
for i in range(K):
|
||||
start = i * hop
|
||||
|
||||
frame_t: FrameT = x_pad[start : start + win, :]
|
||||
if frame_t.shape != (win, 2):
|
||||
raise ValueError("Internal framing error: frame_t has wrong shape.")
|
||||
|
||||
next_t = x_pad[start + hop : start + hop + win, :]
|
||||
if next_t.shape[0] < win:
|
||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||
next_t = np.vstack([next_t, tail])
|
||||
|
||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
||||
|
||||
# Level 1 analysis (packed stereo container)
|
||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||
|
||||
# Unpack to per-channel (as you already do in Level 1)
|
||||
if frame_type == "ESH":
|
||||
chl_f = np.empty((128, 8), dtype=np.float64)
|
||||
chr_f = np.empty((128, 8), dtype=np.float64)
|
||||
for j in range(8):
|
||||
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
|
||||
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
|
||||
else:
|
||||
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
|
||||
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
|
||||
|
||||
# Level 2: apply TNS per channel
|
||||
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||
chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)
|
||||
|
||||
aac_seq.append(
|
||||
{
|
||||
"frame_type": frame_type,
|
||||
"win_type": WIN_TYPE,
|
||||
"chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
|
||||
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
||||
}
|
||||
)
|
||||
|
||||
prev_frame_type = frame_type
|
||||
|
||||
return aac_seq
|
||||
@ -17,6 +17,15 @@ from __future__ import annotations
|
||||
# Imports
|
||||
from core.aac_types import WinType
|
||||
|
||||
# Filterbank
|
||||
# ------------------------------------------------------------
|
||||
# Window type
|
||||
# Options: "SIN", "KBD"
|
||||
WIN_TYPE: WinType = "SIN"
|
||||
WIN_TYPE: WinType = "SIN"
|
||||
|
||||
|
||||
# TNS
|
||||
# ------------------------------------------------------------
|
||||
PRED_ORDER = 4
|
||||
QUANT_STEP = 0.1
|
||||
QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7]
|
||||
@ -28,6 +28,7 @@ from typing import Union
|
||||
import soundfile as sf
|
||||
|
||||
from core.aac_filterbank import aac_i_filter_bank
|
||||
from core.aac_tns import aac_i_tns
|
||||
from core.aac_types import *
|
||||
|
||||
|
||||
@ -164,3 +165,93 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
||||
sf.write(str(filename_out), y, 48000)
|
||||
|
||||
return y
|
||||
|
||||
|
||||
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
||||
"""
|
||||
Level-2 AAC decoder (inverse of aac_coder_2).
|
||||
|
||||
Behavior matches Level 1 decoder pipeline, with additional iTNS stage:
|
||||
- Per frame/channel: inverse TNS using stored coefficients
|
||||
- Re-pack to stereo frame_F
|
||||
- IMDCT + windowing
|
||||
- Overlap-add over frames
|
||||
- Remove Level-1 padding (hop samples start/end)
|
||||
- Write output WAV (48 kHz)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
aac_seq_2 : AACSeq2
|
||||
Encoded sequence as produced by aac_coder_2().
|
||||
filename_out : Union[str, Path]
|
||||
Output WAV filename.
|
||||
|
||||
Returns
|
||||
-------
|
||||
StereoSignal
|
||||
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||
"""
|
||||
filename_out = Path(filename_out)
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
K = len(aac_seq_2)
|
||||
|
||||
if K <= 0:
|
||||
raise ValueError("aac_seq_2 must contain at least one frame.")
|
||||
|
||||
n_pad = (K - 1) * hop + win
|
||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||
|
||||
for i, fr in enumerate(aac_seq_2):
|
||||
frame_type: FrameType = fr["frame_type"]
|
||||
win_type: WinType = fr["win_type"]
|
||||
|
||||
chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
|
||||
chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
|
||||
|
||||
chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
|
||||
chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)
|
||||
|
||||
# Inverse TNS per channel
|
||||
chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
|
||||
chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)
|
||||
|
||||
# Re-pack to the stereo container expected by aac_i_filter_bank
|
||||
if frame_type == "ESH":
|
||||
if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
|
||||
raise ValueError("ESH channel frame_F must have shape (128, 8).")
|
||||
|
||||
frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
|
||||
for j in range(8):
|
||||
frame_f[:, 2 * j + 0] = chl_f[:, j]
|
||||
frame_f[:, 2 * j + 1] = chr_f[:, j]
|
||||
else:
|
||||
# Accept either (1024,1) or (1024,) from your internal convention.
|
||||
if chl_f.shape == (1024,):
|
||||
chl_col = chl_f.reshape(1024, 1)
|
||||
elif chl_f.shape == (1024, 1):
|
||||
chl_col = chl_f
|
||||
else:
|
||||
raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")
|
||||
|
||||
if chr_f.shape == (1024,):
|
||||
chr_col = chr_f.reshape(1024, 1)
|
||||
elif chr_f.shape == (1024, 1):
|
||||
chr_col = chr_f
|
||||
else:
|
||||
raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")
|
||||
|
||||
frame_f = np.empty((1024, 2), dtype=np.float64)
|
||||
frame_f[:, 0] = chl_col[:, 0]
|
||||
frame_f[:, 1] = chr_col[:, 0]
|
||||
|
||||
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
|
||||
|
||||
start = i * hop
|
||||
y_pad[start : start + win, :] += frame_t_hat
|
||||
|
||||
y = aac_remove_padding(y_pad, hop=hop)
|
||||
|
||||
sf.write(str(filename_out), y, 48000)
|
||||
return y
|
||||
60
source/core/aac_snr_db.py
Normal file
60
source/core/aac_snr_db.py
Normal file
@ -0,0 +1,60 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - SNR dB calculator
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# This module implements SNR calculation in dB
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from core.aac_types import StereoSignal
|
||||
import numpy as np
|
||||
|
||||
def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||
"""
|
||||
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x_ref : StereoSignal
|
||||
Reference stereo stream.
|
||||
x_hat : StereoSignal
|
||||
Reconstructed stereo stream.
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
SNR in dB.
|
||||
- Returns +inf if noise power is zero.
|
||||
- Returns -inf if signal power is zero.
|
||||
"""
|
||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||
|
||||
if x_ref.ndim == 1:
|
||||
x_ref = x_ref.reshape(-1, 1)
|
||||
if x_hat.ndim == 1:
|
||||
x_hat = x_hat.reshape(-1, 1)
|
||||
|
||||
n = min(x_ref.shape[0], x_hat.shape[0])
|
||||
c = min(x_ref.shape[1], x_hat.shape[1])
|
||||
|
||||
x_ref = x_ref[:n, :c]
|
||||
x_hat = x_hat[:n, :c]
|
||||
|
||||
err = x_ref - x_hat
|
||||
ps = float(np.sum(x_ref * x_ref))
|
||||
pn = float(np.sum(err * err))
|
||||
|
||||
if pn <= 0.0:
|
||||
return float("inf")
|
||||
if ps <= 0.0:
|
||||
return float("-inf")
|
||||
|
||||
return float(10.0 * np.log10(ps / pn))
|
||||
549
source/core/aac_tns.py
Normal file
549
source/core/aac_tns.py
Normal file
@ -0,0 +1,549 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Temporal Noise Shaping (TNS)
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Temporal Noise Shaping (TNS) module (Level 2).
|
||||
#
|
||||
# Public API:
|
||||
# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||
# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs)
|
||||
#
|
||||
# Notes (per assignment):
|
||||
# - TNS is applied per channel (not stereo).
|
||||
# - For ESH, TNS is applied independently to each of the 8 short subframes.
|
||||
# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short)
|
||||
# provided in TableB219.mat.
|
||||
# - Predictor order is fixed to p = 4.
|
||||
# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1.
|
||||
# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||
# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients.
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
from scipy.io import loadmat
|
||||
|
||||
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
|
||||
from core.aac_types import *
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
_B219_CACHE: dict[str, FloatArray] | None = None
|
||||
|
||||
|
||||
def _load_b219_tables() -> dict[str, FloatArray]:
|
||||
"""
|
||||
Load TableB219.mat and cache the contents.
|
||||
|
||||
The project layout guarantees that a 'material' directory is discoverable
|
||||
from the current working directory (tests and level_123 entrypoints).
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, FloatArray]
|
||||
Keys:
|
||||
- "B219a": long bands table (for K=1024 MDCT lines)
|
||||
- "B219b": short bands table (for K=128 MDCT lines)
|
||||
"""
|
||||
global _B219_CACHE
|
||||
if _B219_CACHE is not None:
|
||||
return _B219_CACHE
|
||||
|
||||
mat_path = Path("material") / "TableB219.mat"
|
||||
if not mat_path.exists():
|
||||
raise FileNotFoundError("Could not locate material/TableB219.mat in the current working directory.")
|
||||
|
||||
d = loadmat(str(mat_path))
|
||||
if "B219a" not in d or "B219b" not in d:
|
||||
raise ValueError("TableB219.mat missing required variables B219a and/or B219b.")
|
||||
|
||||
_B219_CACHE = {
|
||||
"B219a": np.asarray(d["B219a"], dtype=np.float64),
|
||||
"B219b": np.asarray(d["B219b"], dtype=np.float64),
|
||||
}
|
||||
return _B219_CACHE
|
||||
|
||||
|
||||
def _band_ranges_for_kcount(k_count: int) -> BandRanges:
|
||||
"""
|
||||
Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
k_count : int
|
||||
Number of MDCT lines:
|
||||
- 1024 for long frames
|
||||
- 128 for short subframes (ESH)
|
||||
|
||||
Returns
|
||||
-------
|
||||
BandRanges (list[tuple[int, int]])
|
||||
Each tuple is (start_k, end_k) inclusive.
|
||||
"""
|
||||
tables = _load_b219_tables()
|
||||
if k_count == 1024:
|
||||
tbl = tables["B219a"]
|
||||
elif k_count == 128:
|
||||
tbl = tables["B219b"]
|
||||
else:
|
||||
raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).")
|
||||
|
||||
start = tbl[:, 1].astype(int)
|
||||
end = tbl[:, 2].astype(int)
|
||||
|
||||
ranges: list[tuple[int, int]] = [(int(s), int(e)) for s, e in zip(start, end)]
|
||||
|
||||
for s, e in ranges:
|
||||
if s < 0 or e < s or e >= k_count:
|
||||
raise ValueError("Invalid band table ranges for given k_count.")
|
||||
return ranges
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Core DSP helpers
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def _smooth_sw_inplace(sw: MdctCoeffs) -> None:
|
||||
"""
|
||||
Smooth Sw(k) to reduce discontinuities between adjacent Bark bands.
|
||||
|
||||
The assignment applies two passes:
|
||||
- Backward: Sw(k) = (Sw(k) + Sw(k+1))/2
|
||||
- Forward: Sw(k) = (Sw(k) + Sw(k-1))/2
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sw : MdctCoeffs
|
||||
1-D array of length K (float64). Modified in-place.
|
||||
"""
|
||||
k_count = int(sw.shape[0])
|
||||
|
||||
for k in range(k_count - 2, -1, -1):
|
||||
sw[k] = 0.5 * (sw[k] + sw[k + 1])
|
||||
|
||||
for k in range(1, k_count):
|
||||
sw[k] = 0.5 * (sw[k] + sw[k - 1])
|
||||
|
||||
|
||||
def _compute_sw(x: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Compute Sw(k) from band energies P(j) and apply boundary smoothing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : MdctCoeffs
|
||||
1-D MDCT line array, length K.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Sw(k), 1-D array of length K, float64.
|
||||
"""
|
||||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||
k_count = int(x.shape[0])
|
||||
|
||||
bands = _band_ranges_for_kcount(k_count)
|
||||
sw = np.zeros(k_count, dtype=np.float64)
|
||||
|
||||
for s, e in bands:
|
||||
seg = x[s : e + 1]
|
||||
p_j = float(np.sum(seg * seg))
|
||||
sw_val = float(np.sqrt(p_j))
|
||||
sw[s : e + 1] = sw_val
|
||||
|
||||
_smooth_sw_inplace(sw)
|
||||
return sw
|
||||
|
||||
|
||||
def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs:
|
||||
"""
|
||||
Autocorrelation r(m) for m=0..p.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : MdctCoeffs
|
||||
1-D signal.
|
||||
p : int
|
||||
Maximum lag.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
r, shape (p+1,), float64.
|
||||
"""
|
||||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||
n = int(x.shape[0])
|
||||
|
||||
r = np.zeros(p + 1, dtype=np.float64)
|
||||
for m in range(p + 1):
|
||||
r[m] = float(np.dot(x[m:], x[: n - m]))
|
||||
return r
|
||||
|
||||
|
||||
def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs:
|
||||
"""
|
||||
Solve Yule-Walker normal equations for LPC coefficients of order p.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
xw : MdctCoeffs
|
||||
1-D normalized sequence Xw(k).
|
||||
p : int
|
||||
Predictor order.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
LPC coefficients a[0..p-1], shape (p,), float64.
|
||||
"""
|
||||
r = _autocorr(xw, p)
|
||||
|
||||
R = np.empty((p, p), dtype=np.float64)
|
||||
for i in range(p):
|
||||
for j in range(p):
|
||||
R[i, j] = r[abs(i - j)]
|
||||
|
||||
rhs = r[1 : p + 1].reshape(p)
|
||||
|
||||
reg = 1e-12
|
||||
R_reg = R + reg * np.eye(p, dtype=np.float64)
|
||||
|
||||
a = np.linalg.solve(R_reg, rhs)
|
||||
return a
|
||||
|
||||
|
||||
def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Quantize LPC coefficients with uniform symmetric quantizer and clamp.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a : MdctCoeffs
|
||||
LPC coefficient array, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Quantized coefficients, shape (p,), float64.
|
||||
"""
|
||||
a = np.asarray(a, dtype=np.float64).reshape(-1)
|
||||
q = np.round(a / QUANT_STEP) * QUANT_STEP
|
||||
q = np.clip(q, -QUANT_MAX, QUANT_MAX)
|
||||
return q.astype(np.float64, copy=False)
|
||||
|
||||
|
||||
def _is_inverse_stable(a_q: MdctCoeffs) -> bool:
|
||||
"""
|
||||
Check stability of the inverse TNS filter H_TNS^{-1}.
|
||||
|
||||
Forward filter:
|
||||
H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||
|
||||
Inverse filter poles are roots of:
|
||||
A(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||
Multiply by z^p:
|
||||
z^p - a1 z^{p-1} - ... - ap = 0
|
||||
|
||||
Stability condition:
|
||||
all roots satisfy |z| < 1.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a_q : MdctCoeffs
|
||||
Quantized predictor coefficients, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
True if stable, else False.
|
||||
"""
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
p = int(a_q.shape[0])
|
||||
|
||||
# Polynomial in z: z^p - a1 z^{p-1} - ... - ap
|
||||
poly = np.empty(p + 1, dtype=np.float64)
|
||||
poly[0] = 1.0
|
||||
poly[1:] = -a_q
|
||||
|
||||
roots = np.roots(poly)
|
||||
|
||||
# Strictly inside unit circle for stability. Add tiny margin for numeric safety.
|
||||
margin = 1e-12
|
||||
return bool(np.all(np.abs(roots) < (1.0 - margin)))
|
||||
|
||||
|
||||
def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Make quantized predictor coefficients stable for inverse filtering.
|
||||
|
||||
Policy:
|
||||
- If already stable: return as-is.
|
||||
- Else: iteratively shrink coefficients by gamma and re-quantize to the 0.1 grid.
|
||||
- If still unstable after attempts: fall back to all-zero coefficients (disable TNS).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a_q : MdctCoeffs
|
||||
Quantized predictor coefficients, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Stable quantized coefficients, shape (p,).
|
||||
"""
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
|
||||
if _is_inverse_stable(a_q):
|
||||
return a_q
|
||||
|
||||
# Try a few shrinking factors. Re-quantize after shrinking to keep coefficients on-grid.
|
||||
gammas = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)
|
||||
|
||||
for g in gammas:
|
||||
cand = _quantize_coeffs(g * a_q)
|
||||
if _is_inverse_stable(cand):
|
||||
return cand
|
||||
|
||||
# Last resort: disable TNS for this vector
|
||||
return np.zeros_like(a_q, dtype=np.float64)
|
||||
|
||||
|
||||
def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Apply forward TNS FIR filter:
|
||||
y[k] = x[k] - sum_{l=1..p} a_l * x[k-l]
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : MdctCoeffs
|
||||
1-D MDCT lines, length K.
|
||||
a_q : MdctCoeffs
|
||||
Quantized LPC coefficients, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Filtered MDCT lines y, length K.
|
||||
"""
|
||||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
p = int(a_q.shape[0])
|
||||
k_count = int(x.shape[0])
|
||||
|
||||
y = np.zeros(k_count, dtype=np.float64)
|
||||
for k in range(k_count):
|
||||
acc = x[k]
|
||||
for l in range(1, p + 1):
|
||||
if k - l >= 0:
|
||||
acc -= a_q[l - 1] * x[k - l]
|
||||
y[k] = acc
|
||||
return y
|
||||
|
||||
|
||||
def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Apply inverse TNS IIR filter:
|
||||
x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l]
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y : MdctCoeffs
|
||||
1-D MDCT lines after TNS, length K.
|
||||
a_q : MdctCoeffs
|
||||
Quantized LPC coefficients, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Reconstructed MDCT lines x_hat, length K.
|
||||
"""
|
||||
y = np.asarray(y, dtype=np.float64).reshape(-1)
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
p = int(a_q.shape[0])
|
||||
k_count = int(y.shape[0])
|
||||
|
||||
x_hat = np.zeros(k_count, dtype=np.float64)
|
||||
for k in range(k_count):
|
||||
acc = y[k]
|
||||
for l in range(1, p + 1):
|
||||
if k - l >= 0:
|
||||
acc += a_q[l - 1] * x_hat[k - l]
|
||||
x_hat[k] = acc
|
||||
return x_hat
|
||||
|
||||
|
||||
def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]:
|
||||
"""
|
||||
TNS for a single MDCT vector (one long frame or one short subframe).
|
||||
|
||||
Steps:
|
||||
1) Compute Sw(k) from Bark band energies and smooth it.
|
||||
2) Normalize: Xw(k) = X(k) / Sw(k) (safe when Sw=0).
|
||||
3) Compute LPC coefficients (order p=PRED_ORDER) on Xw.
|
||||
4) Quantize coefficients (4-bit symmetric, step QUANT_STEP).
|
||||
5) Apply FIR filter on original X(k) using quantized coefficients.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : MdctCoeffs
|
||||
1-D MDCT vector.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : MdctCoeffs
|
||||
TNS-processed MDCT vector (same length).
|
||||
a_q : MdctCoeffs
|
||||
Quantized LPC coefficients, shape (PRED_ORDER,).
|
||||
"""
|
||||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||
sw = _compute_sw(x)
|
||||
|
||||
eps = 1e-12
|
||||
xw = np.where(sw > eps, x / sw, 0.0)
|
||||
|
||||
a = _lpc_coeffs(xw, PRED_ORDER)
|
||||
a_q = _quantize_coeffs(a)
|
||||
|
||||
# Ensure inverse stability (assignment requirement)
|
||||
a_q = _stabilize_quantized_coeffs(a_q)
|
||||
|
||||
y = _apply_tns_fir(x, a_q)
|
||||
return y, a_q
|
||||
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Public Functions (Level 2)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]:
|
||||
"""
|
||||
Temporal Noise Shaping (TNS) for ONE channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_F_in : FrameChannelF
|
||||
Per-channel MDCT coefficients.
|
||||
Expected (typical) shapes:
|
||||
- If frame_type == "ESH": (128, 8)
|
||||
- Else: (1024, 1) or (1024,)
|
||||
|
||||
frame_type : FrameType
|
||||
Frame type code ("OLS", "LSS", "ESH", "LPS").
|
||||
|
||||
Returns
|
||||
-------
|
||||
frame_F_out : FrameChannelF
|
||||
Per-channel MDCT coefficients after applying TNS.
|
||||
Same shape convention as input.
|
||||
|
||||
tns_coeffs : TnsCoeffs
|
||||
Quantized TNS predictor coefficients.
|
||||
Expected shapes:
|
||||
- If frame_type == "ESH": (PRED_ORDER, 8)
|
||||
- Else: (PRED_ORDER, 1)
|
||||
"""
|
||||
x = np.asarray(frame_F_in, dtype=np.float64)
|
||||
|
||||
if frame_type == "ESH":
|
||||
if x.shape != (128, 8):
|
||||
raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
|
||||
|
||||
y = np.empty_like(x, dtype=np.float64)
|
||||
a_out = np.empty((PRED_ORDER, 8), dtype=np.float64)
|
||||
|
||||
for j in range(8):
|
||||
y[:, j], a_out[:, j] = _tns_one_vector(x[:, j])
|
||||
|
||||
return y, a_out
|
||||
|
||||
if x.shape == (1024,):
|
||||
x_vec = x
|
||||
out_shape = (1024,)
|
||||
elif x.shape == (1024, 1):
|
||||
x_vec = x[:, 0]
|
||||
out_shape = (1024, 1)
|
||||
else:
|
||||
raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
|
||||
|
||||
y_vec, a_q = _tns_one_vector(x_vec)
|
||||
|
||||
if out_shape == (1024,):
|
||||
y_out = y_vec
|
||||
else:
|
||||
y_out = y_vec.reshape(1024, 1)
|
||||
|
||||
a_out = a_q.reshape(PRED_ORDER, 1)
|
||||
return y_out, a_out
|
||||
|
||||
|
||||
def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF:
|
||||
"""
|
||||
Inverse Temporal Noise Shaping (iTNS) for ONE channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_F_in : FrameChannelF
|
||||
Per-channel MDCT coefficients after TNS.
|
||||
Expected (typical) shapes:
|
||||
- If frame_type == "ESH": (128, 8)
|
||||
- Else: (1024, 1) or (1024,)
|
||||
|
||||
frame_type : FrameType
|
||||
Frame type code ("OLS", "LSS", "ESH", "LPS").
|
||||
|
||||
tns_coeffs : TnsCoeffs
|
||||
Quantized TNS predictor coefficients.
|
||||
Expected shapes:
|
||||
- If frame_type == "ESH": (PRED_ORDER, 8)
|
||||
- Else: (PRED_ORDER, 1)
|
||||
|
||||
Returns
|
||||
-------
|
||||
FrameChannelF
|
||||
Per-channel MDCT coefficients after inverse TNS.
|
||||
Same shape convention as input frame_F_in.
|
||||
"""
|
||||
x = np.asarray(frame_F_in, dtype=np.float64)
|
||||
a = np.asarray(tns_coeffs, dtype=np.float64)
|
||||
|
||||
if frame_type == "ESH":
|
||||
if x.shape != (128, 8):
|
||||
raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
|
||||
if a.shape != (PRED_ORDER, 8):
|
||||
raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).")
|
||||
|
||||
y = np.empty_like(x, dtype=np.float64)
|
||||
for j in range(8):
|
||||
y[:, j] = _apply_itns_iir(x[:, j], a[:, j])
|
||||
return y
|
||||
|
||||
if a.shape != (PRED_ORDER, 1):
|
||||
raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).")
|
||||
|
||||
if x.shape == (1024,):
|
||||
x_vec = x
|
||||
out_shape = (1024,)
|
||||
elif x.shape == (1024, 1):
|
||||
x_vec = x[:, 0]
|
||||
out_shape = (1024, 1)
|
||||
else:
|
||||
raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
|
||||
|
||||
y_vec = _apply_itns_iir(x_vec, a[:, 0])
|
||||
|
||||
if out_shape == (1024,):
|
||||
return y_vec
|
||||
return y_vec.reshape(1024, 1)
|
||||
@ -10,7 +10,6 @@
|
||||
#
|
||||
# Description:
|
||||
# This module implements Public Type aliases
|
||||
#
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
@ -39,7 +38,7 @@ Window type codes (AAC):
|
||||
"""
|
||||
|
||||
ChannelKey: TypeAlias = Literal["chl", "chr"]
|
||||
"""Channel dictionary keys used in Level 1 payloads."""
|
||||
"""Channel dictionary keys used in Level payloads."""
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -105,6 +104,40 @@ Examples:
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
MdctFrameChannel: TypeAlias = FloatArray
|
||||
"""
|
||||
Per-channel MDCT container used in Level-1/2 sequences.
|
||||
|
||||
Typical shapes:
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
|
||||
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
|
||||
|
||||
dtype: float64
|
||||
|
||||
Notes
|
||||
-----
|
||||
Some parts of the assignment store long-frame coefficients as a column vector
|
||||
(1024, 1) to match MATLAB conventions. Internally you may also use (1024,)
|
||||
when convenient, but the semantic meaning is identical.
|
||||
"""
|
||||
|
||||
TnsCoeffs: TypeAlias = FloatArray
|
||||
"""
|
||||
Quantized TNS predictor coefficients (one channel).
|
||||
|
||||
Typical shapes (Level 2):
|
||||
- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes)
|
||||
- Else: (4, 1) (order p=4 for the long frame)
|
||||
|
||||
dtype: float64
|
||||
|
||||
Notes
|
||||
-----
|
||||
The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1.
|
||||
We store the quantized coefficient values as float64 (typically multiples of 0.1)
|
||||
to keep the pipeline simple and readable.
|
||||
"""
|
||||
|
||||
|
||||
FrameT: TypeAlias = FloatArray
|
||||
"""
|
||||
@ -142,17 +175,23 @@ Rationale for ESH (128, 16):
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
FrameChannelF: TypeAlias = FloatArray
|
||||
FrameChannelF: TypeAlias = MdctFrameChannel
|
||||
"""
|
||||
Frequency-domain single-channel frame (MDCT coefficients).
|
||||
Frequency-domain single-channel MDCT coefficients.
|
||||
|
||||
Typical shapes (Level 1):
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024,)
|
||||
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
|
||||
Typical shapes (Level 1/2):
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
|
||||
- If frame_type == "ESH": (128, 8)
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
BandRanges: TypeAlias = list[tuple[int, int]]
|
||||
"""
|
||||
Bark-band index ranges [start, end] (inclusive) for MDCT lines.
|
||||
|
||||
Used by TNS to map MDCT indices k to Bark bands.
|
||||
"""
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 1 AAC sequence payload types
|
||||
@ -168,7 +207,7 @@ class AACChannelFrameF(TypedDict):
|
||||
The MDCT coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (128, 8) (8 short subframes)
|
||||
- else: (1024, )
|
||||
- else: (1024, 1) or (1024,)
|
||||
"""
|
||||
frame_F: FrameChannelF
|
||||
|
||||
@ -191,3 +230,53 @@ List of length K (K = number of frames).
|
||||
Each element is a dict with keys:
|
||||
- "frame_type", "win_type", "chl", "chr"
|
||||
"""
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 2 AAC sequence payload types (TNS)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class AACChannelFrameF2(TypedDict):
|
||||
"""
|
||||
Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2).
|
||||
|
||||
Keys
|
||||
----
|
||||
frame_F:
|
||||
The TNS-processed MDCT coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (128, 8)
|
||||
- else: (1024, 1) or (1024,)
|
||||
tns_coeffs:
|
||||
Quantized TNS predictor coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (PRED_ORDER, 8)
|
||||
- else: (PRED_ORDER, 1)
|
||||
"""
|
||||
frame_F: FrameChannelF
|
||||
tns_coeffs: TnsCoeffs
|
||||
|
||||
|
||||
class AACSeq2Frame(TypedDict):
|
||||
"""
|
||||
One frame dictionary element of aac_seq_2 (Level 2).
|
||||
"""
|
||||
frame_type: FrameType
|
||||
win_type: WinType
|
||||
chl: AACChannelFrameF2
|
||||
chr: AACChannelFrameF2
|
||||
|
||||
|
||||
AACSeq2: TypeAlias = List[AACSeq2Frame]
|
||||
"""
|
||||
AAC sequence for Level 2:
|
||||
List of length K (K = number of frames).
|
||||
|
||||
Each element is a dict with keys:
|
||||
- "frame_type", "win_type", "chl", "chr"
|
||||
|
||||
Level 2 adds:
|
||||
- per-channel "tns_coeffs"
|
||||
and stores:
|
||||
- per-channel "frame_F" after applying TNS.
|
||||
"""
|
||||
|
||||
@ -19,59 +19,15 @@ import numpy as np
|
||||
import pytest
|
||||
import soundfile as sf
|
||||
|
||||
from core.aac_coder import aac_coder_1
|
||||
from core.aac_decoder import aac_decoder_1
|
||||
from core.aac_coder import aac_coder_1, aac_coder_2, aac_read_wav_stereo_48k
|
||||
from core.aac_decoder import aac_decoder_1, aac_decoder_2, aac_remove_padding
|
||||
from core.aac_types import *
|
||||
from core.aac_snr_db import snr_db
|
||||
|
||||
|
||||
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||
"""
|
||||
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x_ref : StereoSignal
|
||||
Reference signal, shape (N, 2) typical.
|
||||
x_hat : StereoSignal
|
||||
Reconstructed signal, shape (M, 2) typical.
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
SNR in dB.
|
||||
- Returns +inf if noise power is zero.
|
||||
- Returns -inf if signal power is zero.
|
||||
"""
|
||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||
|
||||
# Be conservative: align lengths and common channels.
|
||||
if x_ref.ndim == 1:
|
||||
x_ref = x_ref.reshape(-1, 1)
|
||||
if x_hat.ndim == 1:
|
||||
x_hat = x_hat.reshape(-1, 1)
|
||||
|
||||
n = min(x_ref.shape[0], x_hat.shape[0])
|
||||
c = min(x_ref.shape[1], x_hat.shape[1])
|
||||
|
||||
x_ref = x_ref[:n, :c]
|
||||
x_hat = x_hat[:n, :c]
|
||||
|
||||
err = x_ref - x_hat
|
||||
ps = float(np.sum(x_ref * x_ref))
|
||||
pn = float(np.sum(err * err))
|
||||
|
||||
if pn <= 0.0:
|
||||
return float("inf")
|
||||
if ps <= 0.0:
|
||||
return float("-inf")
|
||||
|
||||
return float(10.0 * np.log10(ps / pn))
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tmp_stereo_wav(tmp_path: Path) -> Path:
|
||||
"""
|
||||
@ -89,6 +45,56 @@ def tmp_stereo_wav(tmp_path: Path) -> Path:
|
||||
return wav_path
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Helper-function tests
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None:
|
||||
"""
|
||||
Contract test for aac_read_wav_stereo_48k():
|
||||
- Reads stereo WAV
|
||||
- Returns float64 array with shape (N,2)
|
||||
- Returns fs = 48000
|
||||
"""
|
||||
x, fs = aac_read_wav_stereo_48k(tmp_stereo_wav)
|
||||
|
||||
assert int(fs) == 48000
|
||||
assert isinstance(x, np.ndarray)
|
||||
assert x.dtype == np.float64
|
||||
assert x.ndim == 2
|
||||
assert x.shape[1] == 2
|
||||
assert x.shape[0] > 0
|
||||
|
||||
|
||||
def test_aac_remove_padding_removes_hop_from_both_ends() -> None:
|
||||
"""
|
||||
Contract test for aac_remove_padding():
|
||||
- Removes 'hop' samples from start and end.
|
||||
"""
|
||||
hop = 1024
|
||||
n = 10000
|
||||
|
||||
y_pad: StereoSignal = np.zeros((n, 2), dtype=np.float64)
|
||||
y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
|
||||
|
||||
assert y.shape == (n - 2 * hop, 2)
|
||||
assert y.dtype == np.float64
|
||||
|
||||
|
||||
def test_aac_remove_padding_errors_on_too_short_input() -> None:
|
||||
"""
|
||||
aac_remove_padding must raise if y_pad is shorter than 2*hop.
|
||||
"""
|
||||
hop = 1024
|
||||
y_pad: StereoSignal = np.zeros((2 * hop - 1, 2), dtype=np.float64)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
_ = aac_remove_padding(y_pad, hop=hop)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 1 tests
|
||||
# -----------------------------------------------------------------------------
|
||||
def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
||||
"""
|
||||
Module-level contract test:
|
||||
@ -152,5 +158,68 @@ def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: P
|
||||
assert int(fs_hat) == 48000
|
||||
|
||||
# SNR against returned array (file should match closely, but we do not require it here).
|
||||
snr = _snr_db(x_ref, x_hat)
|
||||
snr = snr_db(x_ref, x_hat)
|
||||
assert snr > 80.0
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 2 tests (new)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
||||
"""
|
||||
Module-level contract test (Level 2):
|
||||
Ensure aac_seq_2 follows the expected schema and per-frame shapes, including tns_coeffs.
|
||||
"""
|
||||
aac_seq: AACSeq2 = aac_coder_2(tmp_stereo_wav)
|
||||
|
||||
assert isinstance(aac_seq, list)
|
||||
assert len(aac_seq) > 0
|
||||
|
||||
for fr in aac_seq:
|
||||
assert "frame_type" in fr
|
||||
assert "win_type" in fr
|
||||
assert "chl" in fr
|
||||
assert "chr" in fr
|
||||
|
||||
frame_type: FrameType = fr["frame_type"]
|
||||
assert frame_type in ("OLS", "LSS", "ESH", "LPS")
|
||||
|
||||
for ch_key in ("chl", "chr"):
|
||||
ch = fr[ch_key]
|
||||
assert "frame_F" in ch
|
||||
assert "tns_coeffs" in ch
|
||||
|
||||
frame_f = np.asarray(ch["frame_F"], dtype=np.float64)
|
||||
coeffs = np.asarray(ch["tns_coeffs"], dtype=np.float64)
|
||||
|
||||
if frame_type == "ESH":
|
||||
assert frame_f.shape == (128, 8)
|
||||
assert coeffs.shape[0] == 4
|
||||
assert coeffs.shape[1] == 8
|
||||
else:
|
||||
assert frame_f.shape == (1024, 1)
|
||||
assert coeffs.shape == (4, 1)
|
||||
|
||||
|
||||
def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
|
||||
"""
|
||||
End-to-end test (Level 2):
|
||||
Encode + decode and check SNR remains very high.
|
||||
|
||||
Level 2 is still floating-point (TNS is reversible), so reconstruction
|
||||
should remain numerical-noise only.
|
||||
"""
|
||||
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
|
||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||
assert int(fs) == 48000
|
||||
|
||||
out_wav = tmp_path / "out_l2.wav"
|
||||
aac_seq = aac_coder_2(tmp_stereo_wav)
|
||||
x_hat: StereoSignal = aac_decoder_2(aac_seq, out_wav)
|
||||
|
||||
assert out_wav.exists()
|
||||
_, fs_hat = sf.read(str(out_wav), always_2d=True)
|
||||
assert int(fs_hat) == 48000
|
||||
|
||||
snr = snr_db(x_ref, x_hat)
|
||||
assert snr > 75.0
|
||||
@ -17,6 +17,7 @@ from typing import Sequence
|
||||
import pytest
|
||||
|
||||
from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank
|
||||
from core.aac_snr_db import snr_db
|
||||
from core.aac_types import *
|
||||
|
||||
# Helper fixtures for filterbank
|
||||
@ -56,20 +57,6 @@ def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type
|
||||
return y
|
||||
|
||||
|
||||
def _snr_db(x: StereoSignal, y: StereoSignal) -> float:
|
||||
"""
|
||||
Compute SNR in dB over all samples/channels.
|
||||
"""
|
||||
err = x - y
|
||||
ps = float(np.sum(x * x))
|
||||
pn = float(np.sum(err * err))
|
||||
if pn <= 0.0:
|
||||
return float("inf")
|
||||
if ps <= 0.0:
|
||||
return float("-inf")
|
||||
return 10.0 * float(np.log10(ps / pn))
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Forward filterbank tests
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -223,7 +210,7 @@ def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None:
|
||||
|
||||
a = 1024
|
||||
b = N - 1024
|
||||
snr = _snr_db(x[a:b, :], y[a:b, :])
|
||||
snr = snr_db(x[a:b, :], y[a:b, :])
|
||||
assert snr > 50.0
|
||||
|
||||
|
||||
@ -244,7 +231,7 @@ def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None:
|
||||
|
||||
a = 1024
|
||||
b = N - 1024
|
||||
snr = _snr_db(x[a:b, :], y[a:b, :])
|
||||
snr = snr_db(x[a:b, :], y[a:b, :])
|
||||
assert snr > 45.0
|
||||
|
||||
|
||||
@ -265,5 +252,5 @@ def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None:
|
||||
|
||||
a = 1024
|
||||
b = N - 1024
|
||||
snr = _snr_db(x[a:b, :], y[a:b, :])
|
||||
snr = snr_db(x[a:b, :], y[a:b, :])
|
||||
assert snr > 40.0
|
||||
|
||||
98
source/core/tests/test_snr_db.py
Normal file
98
source/core/tests/test_snr_db.py
Normal file
@ -0,0 +1,98 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - SNR dB Tests
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Basic tests for SNR calculation utility.
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from core.aac_snr_db import snr_db
|
||||
from core.aac_types import StereoSignal
|
||||
|
||||
|
||||
def test_snr_perfect_reconstruction_returns_inf() -> None:
|
||||
"""
|
||||
If x_hat == x_ref exactly, noise power is zero and SNR must be +inf.
|
||||
"""
|
||||
rng = np.random.default_rng(0)
|
||||
x: StereoSignal = rng.normal(size=(1024, 2)).astype(np.float64)
|
||||
|
||||
snr = snr_db(x, x)
|
||||
assert snr == float("inf")
|
||||
|
||||
|
||||
def test_snr_zero_reference_returns_minus_inf() -> None:
|
||||
"""
|
||||
If reference signal is identically zero, signal power is zero
|
||||
and SNR must be -inf (unless noise is also zero, which is degenerate).
|
||||
"""
|
||||
x_ref: StereoSignal = np.zeros((1024, 2), dtype=np.float64)
|
||||
x_hat: StereoSignal = np.ones((1024, 2), dtype=np.float64)
|
||||
|
||||
snr = snr_db(x_ref, x_hat)
|
||||
assert snr == float("-inf")
|
||||
|
||||
|
||||
def test_snr_known_noise_level_matches_expected_value() -> None:
|
||||
"""
|
||||
Deterministic test with known signal and noise power.
|
||||
|
||||
Let:
|
||||
x_ref = ones
|
||||
x_hat = ones + noise
|
||||
|
||||
With noise variance sigma^2, expected SNR:
|
||||
10 * log10(Ps / Pn)
|
||||
"""
|
||||
n = 1000
|
||||
sigma = 0.1
|
||||
|
||||
x_ref: StereoSignal = np.ones((n, 2), dtype=np.float64)
|
||||
noise = sigma * np.ones((n, 2), dtype=np.float64)
|
||||
x_hat: StereoSignal = x_ref + noise
|
||||
|
||||
ps = float(np.sum(x_ref * x_ref))
|
||||
pn = float(np.sum(noise * noise))
|
||||
expected = 10.0 * np.log10(ps / pn)
|
||||
|
||||
snr = snr_db(x_ref, x_hat)
|
||||
assert np.isclose(snr, expected, rtol=1e-12, atol=1e-12)
|
||||
|
||||
|
||||
def test_snr_aligns_different_lengths_and_channels() -> None:
|
||||
"""
|
||||
The function must:
|
||||
- align to minimum length
|
||||
- align to minimum channel count
|
||||
without crashing.
|
||||
"""
|
||||
rng = np.random.default_rng(1)
|
||||
|
||||
x_ref: StereoSignal = rng.normal(size=(1000, 2)).astype(np.float64)
|
||||
x_hat: StereoSignal = rng.normal(size=(800, 1)).astype(np.float64)
|
||||
|
||||
snr = snr_db(x_ref, x_hat)
|
||||
assert np.isfinite(snr)
|
||||
|
||||
|
||||
def test_snr_accepts_1d_inputs() -> None:
|
||||
"""
|
||||
1-D inputs must be accepted and treated as single-channel signals.
|
||||
"""
|
||||
rng = np.random.default_rng(2)
|
||||
|
||||
x_ref = rng.normal(size=1024).astype(np.float64)
|
||||
x_hat = x_ref + 0.01 * rng.normal(size=1024).astype(np.float64)
|
||||
|
||||
snr = snr_db(x_ref, x_hat)
|
||||
assert np.isfinite(snr)
|
||||
196
source/core/tests/test_tns.py
Normal file
196
source/core/tests/test_tns.py
Normal file
@ -0,0 +1,196 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - TNS Tests
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Tests for Temporal Noise Shaping (TNS) module (Level 2).
|
||||
#
|
||||
# Validates:
|
||||
# - I/O shapes for long and ESH modes
|
||||
# - Quantization grid and clamping of predictor coefficients
|
||||
# - Inverse-filter stability (all poles inside unit circle)
|
||||
# - Functional correctness: iTNS(TNS(X)) ≈ X
|
||||
# ------------------------------------------------------------
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from core.aac_configuration import PRED_ORDER, QUANT_MAX, QUANT_STEP
|
||||
from core.aac_tns import aac_tns, aac_i_tns
|
||||
from core.aac_types import *
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Helper utilities
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def _is_inverse_stable_from_coeffs(a_q: MdctCoeffs) -> bool:
|
||||
"""
|
||||
Check stability of the inverse TNS filter H_TNS^{-1}.
|
||||
|
||||
Poles are roots of:
|
||||
z^p - a1 z^{p-1} - ... - ap = 0
|
||||
|
||||
Stability condition:
|
||||
|root| < 1 for all roots.
|
||||
"""
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
p = int(a_q.shape[0])
|
||||
|
||||
poly = np.empty(p + 1, dtype=np.float64)
|
||||
poly[0] = 1.0
|
||||
poly[1:] = -a_q
|
||||
|
||||
roots = np.roots(poly)
|
||||
margin = 1e-12
|
||||
return bool(np.all(np.abs(roots) < (1.0 - margin)))
|
||||
|
||||
|
||||
def _assert_quantized_and_clamped(a_q: MdctCoeffs) -> None:
|
||||
"""
|
||||
Assert that coefficients:
|
||||
- lie on the QUANT_STEP grid
|
||||
- are clamped to [-QUANT_MAX, +QUANT_MAX]
|
||||
"""
|
||||
a_q = np.asarray(a_q, dtype=np.float64)
|
||||
|
||||
assert np.max(np.abs(a_q)) <= (QUANT_MAX + 1e-12)
|
||||
|
||||
grid = a_q / float(QUANT_STEP)
|
||||
assert np.max(np.abs(grid - np.round(grid))) < 1e-12
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Shape / contract tests
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||
def test_tns_shapes_long_sequences(frame_type: FrameType) -> None:
|
||||
"""
|
||||
Contract test (long frames):
|
||||
- Input shape: (1024, 1)
|
||||
- Output shape preserved
|
||||
- Predictor coeffs shape: (PRED_ORDER, 1)
|
||||
"""
|
||||
rng = np.random.default_rng(0)
|
||||
frame_F_in: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64)
|
||||
|
||||
frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||
|
||||
assert frame_F_out.shape == frame_F_in.shape
|
||||
assert tns_coeffs.shape == (PRED_ORDER, 1)
|
||||
|
||||
|
||||
def test_tns_shapes_esh() -> None:
|
||||
"""
|
||||
Contract test (ESH):
|
||||
- Input shape: (128, 8)
|
||||
- Output shape preserved
|
||||
- Predictor coeffs shape: (PRED_ORDER, 8)
|
||||
"""
|
||||
rng = np.random.default_rng(1)
|
||||
frame_F_in: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64)
|
||||
|
||||
frame_F_out, tns_coeffs = aac_tns(frame_F_in, "ESH")
|
||||
|
||||
assert frame_F_out.shape == (128, 8)
|
||||
assert tns_coeffs.shape == (PRED_ORDER, 8)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Coefficient properties
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||
def test_tns_coeffs_quantized_clamped_and_stable_long(frame_type: FrameType) -> None:
|
||||
"""
|
||||
Long-frame predictor coefficients must be:
|
||||
- quantized on QUANT_STEP grid
|
||||
- clamped to [-QUANT_MAX, +QUANT_MAX]
|
||||
- stable for inverse filtering
|
||||
"""
|
||||
rng = np.random.default_rng(2)
|
||||
frame_F_in: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64)
|
||||
|
||||
_, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||
|
||||
a_q: MdctCoeffs = tns_coeffs[:, 0]
|
||||
_assert_quantized_and_clamped(a_q)
|
||||
assert _is_inverse_stable_from_coeffs(a_q)
|
||||
|
||||
|
||||
def test_tns_coeffs_quantized_clamped_and_stable_esh() -> None:
|
||||
"""
|
||||
ESH predictor coefficients must satisfy quantization and stability
|
||||
independently for each of the 8 short subframes.
|
||||
"""
|
||||
rng = np.random.default_rng(3)
|
||||
frame_F_in: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64)
|
||||
|
||||
_, tns_coeffs = aac_tns(frame_F_in, "ESH")
|
||||
|
||||
for j in range(8):
|
||||
a_q: MdctCoeffs = tns_coeffs[:, j]
|
||||
_assert_quantized_and_clamped(a_q)
|
||||
assert _is_inverse_stable_from_coeffs(a_q)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Functional correctness (round-trip)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||
def test_tns_roundtrip_long_is_close(frame_type: FrameType) -> None:
|
||||
"""
|
||||
Functional test:
|
||||
iTNS(TNS(X)) ≈ X for long frames.
|
||||
"""
|
||||
rng = np.random.default_rng(4)
|
||||
frame_F_in: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64)
|
||||
|
||||
frame_F_tns, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||
frame_F_hat = aac_i_tns(frame_F_tns, frame_type, tns_coeffs)
|
||||
|
||||
np.testing.assert_allclose(frame_F_hat, frame_F_in, rtol=1e-9, atol=1e-9)
|
||||
|
||||
|
||||
def test_tns_roundtrip_esh_is_close() -> None:
|
||||
"""
|
||||
Functional test:
|
||||
iTNS(TNS(X)) ≈ X for ESH frames (8 independent subframes).
|
||||
"""
|
||||
rng = np.random.default_rng(5)
|
||||
frame_F_in: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64)
|
||||
|
||||
frame_F_tns, tns_coeffs = aac_tns(frame_F_in, "ESH")
|
||||
frame_F_hat = aac_i_tns(frame_F_tns, "ESH", tns_coeffs)
|
||||
|
||||
np.testing.assert_allclose(frame_F_hat, frame_F_in, rtol=1e-9, atol=1e-9)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Sanity
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def test_tns_outputs_are_finite() -> None:
|
||||
"""
|
||||
Sanity test: no NaN or inf in outputs.
|
||||
"""
|
||||
rng = np.random.default_rng(6)
|
||||
|
||||
frame_F_long: FrameChannelF = rng.normal(size=(1024, 1)).astype(np.float64)
|
||||
out_long, coeffs_long = aac_tns(frame_F_long, "OLS")
|
||||
assert np.isfinite(out_long).all()
|
||||
assert np.isfinite(coeffs_long).all()
|
||||
|
||||
frame_F_esh: FrameChannelF = rng.normal(size=(128, 8)).astype(np.float64)
|
||||
out_esh, coeffs_esh = aac_tns(frame_F_esh, "ESH")
|
||||
assert np.isfinite(out_esh).all()
|
||||
assert np.isfinite(coeffs_esh).all()
|
||||
@ -30,6 +30,7 @@ import soundfile as sf
|
||||
from core.aac_configuration import WIN_TYPE
|
||||
from core.aac_filterbank import aac_filter_bank
|
||||
from core.aac_ssc import aac_SSC
|
||||
from core.aac_tns import aac_tns
|
||||
from core.aac_types import *
|
||||
|
||||
|
||||
@ -144,8 +145,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
||||
AACSeq1
|
||||
List of encoded frames (Level 1 schema).
|
||||
"""
|
||||
x, fs = aac_read_wav_stereo_48k(filename_in)
|
||||
_ = fs # kept for clarity; The assignment assumes 48 kHz
|
||||
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||
# The assignment assumes 48 kHz
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
@ -196,3 +197,88 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
||||
prev_frame_type = frame_type
|
||||
|
||||
return aac_seq
|
||||
|
||||
|
||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
||||
"""
|
||||
Level-2 AAC encoder (Level 1 + TNS).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename_in : Union[str, Path]
|
||||
Input WAV filename (stereo, 48 kHz).
|
||||
|
||||
Returns
|
||||
-------
|
||||
AACSeq2
|
||||
Encoded AAC sequence (Level 2 payload schema).
|
||||
For each frame i:
|
||||
- "frame_type": FrameType
|
||||
- "win_type": WinType
|
||||
- "chl"/"chr":
|
||||
- "frame_F": FrameChannelF (after TNS)
|
||||
- "tns_coeffs": TnsCoeffs
|
||||
"""
|
||||
filename_in = Path(filename_in)
|
||||
|
||||
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||
# The assignment assumes 48 kHz
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
|
||||
pad_pre = np.zeros((hop, 2), dtype=np.float64)
|
||||
pad_post = np.zeros((hop, 2), dtype=np.float64)
|
||||
x_pad = np.vstack([pad_pre, x, pad_post])
|
||||
|
||||
K = int((x_pad.shape[0] - win) // hop + 1)
|
||||
if K <= 0:
|
||||
raise ValueError("Input too short for framing.")
|
||||
|
||||
aac_seq: AACSeq2 = []
|
||||
prev_frame_type: FrameType = "OLS"
|
||||
|
||||
for i in range(K):
|
||||
start = i * hop
|
||||
|
||||
frame_t: FrameT = x_pad[start : start + win, :]
|
||||
if frame_t.shape != (win, 2):
|
||||
raise ValueError("Internal framing error: frame_t has wrong shape.")
|
||||
|
||||
next_t = x_pad[start + hop : start + hop + win, :]
|
||||
if next_t.shape[0] < win:
|
||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||
next_t = np.vstack([next_t, tail])
|
||||
|
||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
||||
|
||||
# Level 1 analysis (packed stereo container)
|
||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||
|
||||
# Unpack to per-channel (as you already do in Level 1)
|
||||
if frame_type == "ESH":
|
||||
chl_f = np.empty((128, 8), dtype=np.float64)
|
||||
chr_f = np.empty((128, 8), dtype=np.float64)
|
||||
for j in range(8):
|
||||
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
|
||||
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
|
||||
else:
|
||||
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
|
||||
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
|
||||
|
||||
# Level 2: apply TNS per channel
|
||||
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||
chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)
|
||||
|
||||
aac_seq.append(
|
||||
{
|
||||
"frame_type": frame_type,
|
||||
"win_type": WIN_TYPE,
|
||||
"chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
|
||||
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
||||
}
|
||||
)
|
||||
|
||||
prev_frame_type = frame_type
|
||||
|
||||
return aac_seq
|
||||
@ -17,6 +17,15 @@ from __future__ import annotations
|
||||
# Imports
|
||||
from core.aac_types import WinType
|
||||
|
||||
# Filterbank
|
||||
# ------------------------------------------------------------
|
||||
# Window type
|
||||
# Options: "SIN", "KBD"
|
||||
WIN_TYPE: WinType = "SIN"
|
||||
WIN_TYPE: WinType = "SIN"
|
||||
|
||||
|
||||
# TNS
|
||||
# ------------------------------------------------------------
|
||||
PRED_ORDER = 4
|
||||
QUANT_STEP = 0.1
|
||||
QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7]
|
||||
@ -28,6 +28,7 @@ from typing import Union
|
||||
import soundfile as sf
|
||||
|
||||
from core.aac_filterbank import aac_i_filter_bank
|
||||
from core.aac_tns import aac_i_tns
|
||||
from core.aac_types import *
|
||||
|
||||
|
||||
@ -164,3 +165,93 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
||||
sf.write(str(filename_out), y, 48000)
|
||||
|
||||
return y
|
||||
|
||||
|
||||
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
||||
"""
|
||||
Level-2 AAC decoder (inverse of aac_coder_2).
|
||||
|
||||
Behavior matches Level 1 decoder pipeline, with additional iTNS stage:
|
||||
- Per frame/channel: inverse TNS using stored coefficients
|
||||
- Re-pack to stereo frame_F
|
||||
- IMDCT + windowing
|
||||
- Overlap-add over frames
|
||||
- Remove Level-1 padding (hop samples start/end)
|
||||
- Write output WAV (48 kHz)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
aac_seq_2 : AACSeq2
|
||||
Encoded sequence as produced by aac_coder_2().
|
||||
filename_out : Union[str, Path]
|
||||
Output WAV filename.
|
||||
|
||||
Returns
|
||||
-------
|
||||
StereoSignal
|
||||
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||
"""
|
||||
filename_out = Path(filename_out)
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
K = len(aac_seq_2)
|
||||
|
||||
if K <= 0:
|
||||
raise ValueError("aac_seq_2 must contain at least one frame.")
|
||||
|
||||
n_pad = (K - 1) * hop + win
|
||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||
|
||||
for i, fr in enumerate(aac_seq_2):
|
||||
frame_type: FrameType = fr["frame_type"]
|
||||
win_type: WinType = fr["win_type"]
|
||||
|
||||
chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
|
||||
chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
|
||||
|
||||
chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
|
||||
chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)
|
||||
|
||||
# Inverse TNS per channel
|
||||
chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
|
||||
chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)
|
||||
|
||||
# Re-pack to the stereo container expected by aac_i_filter_bank
|
||||
if frame_type == "ESH":
|
||||
if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
|
||||
raise ValueError("ESH channel frame_F must have shape (128, 8).")
|
||||
|
||||
frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
|
||||
for j in range(8):
|
||||
frame_f[:, 2 * j + 0] = chl_f[:, j]
|
||||
frame_f[:, 2 * j + 1] = chr_f[:, j]
|
||||
else:
|
||||
# Accept either (1024,1) or (1024,) from your internal convention.
|
||||
if chl_f.shape == (1024,):
|
||||
chl_col = chl_f.reshape(1024, 1)
|
||||
elif chl_f.shape == (1024, 1):
|
||||
chl_col = chl_f
|
||||
else:
|
||||
raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")
|
||||
|
||||
if chr_f.shape == (1024,):
|
||||
chr_col = chr_f.reshape(1024, 1)
|
||||
elif chr_f.shape == (1024, 1):
|
||||
chr_col = chr_f
|
||||
else:
|
||||
raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")
|
||||
|
||||
frame_f = np.empty((1024, 2), dtype=np.float64)
|
||||
frame_f[:, 0] = chl_col[:, 0]
|
||||
frame_f[:, 1] = chr_col[:, 0]
|
||||
|
||||
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
|
||||
|
||||
start = i * hop
|
||||
y_pad[start : start + win, :] += frame_t_hat
|
||||
|
||||
y = aac_remove_padding(y_pad, hop=hop)
|
||||
|
||||
sf.write(str(filename_out), y, 48000)
|
||||
return y
|
||||
60
source/level_1/core/aac_snr_db.py
Normal file
60
source/level_1/core/aac_snr_db.py
Normal file
@ -0,0 +1,60 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - SNR dB calculator
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# This module implements SNR calculation in dB
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from core.aac_types import StereoSignal
|
||||
import numpy as np
|
||||
|
||||
def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||
"""
|
||||
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x_ref : StereoSignal
|
||||
Reference stereo stream.
|
||||
x_hat : StereoSignal
|
||||
Reconstructed stereo stream.
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
SNR in dB.
|
||||
- Returns +inf if noise power is zero.
|
||||
- Returns -inf if signal power is zero.
|
||||
"""
|
||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||
|
||||
if x_ref.ndim == 1:
|
||||
x_ref = x_ref.reshape(-1, 1)
|
||||
if x_hat.ndim == 1:
|
||||
x_hat = x_hat.reshape(-1, 1)
|
||||
|
||||
n = min(x_ref.shape[0], x_hat.shape[0])
|
||||
c = min(x_ref.shape[1], x_hat.shape[1])
|
||||
|
||||
x_ref = x_ref[:n, :c]
|
||||
x_hat = x_hat[:n, :c]
|
||||
|
||||
err = x_ref - x_hat
|
||||
ps = float(np.sum(x_ref * x_ref))
|
||||
pn = float(np.sum(err * err))
|
||||
|
||||
if pn <= 0.0:
|
||||
return float("inf")
|
||||
if ps <= 0.0:
|
||||
return float("-inf")
|
||||
|
||||
return float(10.0 * np.log10(ps / pn))
|
||||
@ -10,7 +10,6 @@
|
||||
#
|
||||
# Description:
|
||||
# This module implements Public Type aliases
|
||||
#
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
@ -39,7 +38,7 @@ Window type codes (AAC):
|
||||
"""
|
||||
|
||||
ChannelKey: TypeAlias = Literal["chl", "chr"]
|
||||
"""Channel dictionary keys used in Level 1 payloads."""
|
||||
"""Channel dictionary keys used in Level payloads."""
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -105,6 +104,40 @@ Examples:
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
MdctFrameChannel: TypeAlias = FloatArray
|
||||
"""
|
||||
Per-channel MDCT container used in Level-1/2 sequences.
|
||||
|
||||
Typical shapes:
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
|
||||
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
|
||||
|
||||
dtype: float64
|
||||
|
||||
Notes
|
||||
-----
|
||||
Some parts of the assignment store long-frame coefficients as a column vector
|
||||
(1024, 1) to match MATLAB conventions. Internally you may also use (1024,)
|
||||
when convenient, but the semantic meaning is identical.
|
||||
"""
|
||||
|
||||
TnsCoeffs: TypeAlias = FloatArray
|
||||
"""
|
||||
Quantized TNS predictor coefficients (one channel).
|
||||
|
||||
Typical shapes (Level 2):
|
||||
- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes)
|
||||
- Else: (4, 1) (order p=4 for the long frame)
|
||||
|
||||
dtype: float64
|
||||
|
||||
Notes
|
||||
-----
|
||||
The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1.
|
||||
We store the quantized coefficient values as float64 (typically multiples of 0.1)
|
||||
to keep the pipeline simple and readable.
|
||||
"""
|
||||
|
||||
|
||||
FrameT: TypeAlias = FloatArray
|
||||
"""
|
||||
@ -142,17 +175,23 @@ Rationale for ESH (128, 16):
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
FrameChannelF: TypeAlias = FloatArray
|
||||
FrameChannelF: TypeAlias = MdctFrameChannel
|
||||
"""
|
||||
Frequency-domain single-channel frame (MDCT coefficients).
|
||||
Frequency-domain single-channel MDCT coefficients.
|
||||
|
||||
Typical shapes (Level 1):
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024,)
|
||||
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
|
||||
Typical shapes (Level 1/2):
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
|
||||
- If frame_type == "ESH": (128, 8)
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
BandRanges: TypeAlias = list[tuple[int, int]]
|
||||
"""
|
||||
Bark-band index ranges [start, end] (inclusive) for MDCT lines.
|
||||
|
||||
Used by TNS to map MDCT indices k to Bark bands.
|
||||
"""
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 1 AAC sequence payload types
|
||||
@ -168,7 +207,7 @@ class AACChannelFrameF(TypedDict):
|
||||
The MDCT coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (128, 8) (8 short subframes)
|
||||
- else: (1024, )
|
||||
- else: (1024, 1) or (1024,)
|
||||
"""
|
||||
frame_F: FrameChannelF
|
||||
|
||||
@ -191,3 +230,53 @@ List of length K (K = number of frames).
|
||||
Each element is a dict with keys:
|
||||
- "frame_type", "win_type", "chl", "chr"
|
||||
"""
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 2 AAC sequence payload types (TNS)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class AACChannelFrameF2(TypedDict):
|
||||
"""
|
||||
Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2).
|
||||
|
||||
Keys
|
||||
----
|
||||
frame_F:
|
||||
The TNS-processed MDCT coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (128, 8)
|
||||
- else: (1024, 1) or (1024,)
|
||||
tns_coeffs:
|
||||
Quantized TNS predictor coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (PRED_ORDER, 8)
|
||||
- else: (PRED_ORDER, 1)
|
||||
"""
|
||||
frame_F: FrameChannelF
|
||||
tns_coeffs: TnsCoeffs
|
||||
|
||||
|
||||
class AACSeq2Frame(TypedDict):
|
||||
"""
|
||||
One frame dictionary element of aac_seq_2 (Level 2).
|
||||
"""
|
||||
frame_type: FrameType
|
||||
win_type: WinType
|
||||
chl: AACChannelFrameF2
|
||||
chr: AACChannelFrameF2
|
||||
|
||||
|
||||
AACSeq2: TypeAlias = List[AACSeq2Frame]
|
||||
"""
|
||||
AAC sequence for Level 2:
|
||||
List of length K (K = number of frames).
|
||||
|
||||
Each element is a dict with keys:
|
||||
- "frame_type", "win_type", "chl", "chr"
|
||||
|
||||
Level 2 adds:
|
||||
- per-channel "tns_coeffs"
|
||||
and stores:
|
||||
- per-channel "frame_F" after applying TNS.
|
||||
"""
|
||||
|
||||
@ -1,234 +0,0 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Sequence Segmentation Control Tests
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Tests for Sequence Segmentation Control module (SSC).
|
||||
# ------------------------------------------------------------
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from core.aac_ssc import aac_SSC
|
||||
from core.aac_types import FrameT
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Helper fixtures for SSC
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def _next_frame_no_attack() -> FrameT:
|
||||
"""
|
||||
Build a next_frame_T that must NOT trigger ESH detection.
|
||||
|
||||
Uses exact zeros so all segment energies are zero and the condition
|
||||
s[l] > 1e-3 cannot hold for any l.
|
||||
"""
|
||||
return np.zeros((2048, 2), dtype=np.float64)
|
||||
|
||||
|
||||
def _next_frame_strong_attack(
|
||||
*,
|
||||
attack_left: bool,
|
||||
attack_right: bool,
|
||||
segment_l: int = 4,
|
||||
baseline: float = 1e-6,
|
||||
burst_amp: float = 1.0,
|
||||
) -> FrameT:
|
||||
"""
|
||||
Build a next_frame_T (2048x2) that should trigger ESH detection on selected channels.
|
||||
|
||||
Attack criterion (spec):
|
||||
Attack exists if there exists l in {1..7} such that:
|
||||
s[l] > 1e-3 and ds[l] > 10,
|
||||
where s[l] is the energy of segment l (length 128) after high-pass filtering,
|
||||
and ds[l] = s[l] / s[l-1].
|
||||
|
||||
Construction:
|
||||
- A small baseline is added everywhere to avoid relying on the epsilon guard in ds,
|
||||
keeping ds behavior stable/reproducible.
|
||||
- A strong burst is added inside a chosen segment l in 1..7.
|
||||
"""
|
||||
if not (1 <= segment_l <= 7):
|
||||
raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")
|
||||
|
||||
x = np.full((2048, 2), baseline, dtype=np.float64)
|
||||
|
||||
a = segment_l * 128
|
||||
b = (segment_l + 1) * 128
|
||||
|
||||
if attack_left:
|
||||
x[a:b, 0] += burst_amp
|
||||
if attack_right:
|
||||
x[a:b, 1] += burst_amp
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def _next_frame_below_s_threshold(
|
||||
*,
|
||||
left: bool,
|
||||
right: bool,
|
||||
segment_l: int = 4,
|
||||
impulse_amp: float = 0.01,
|
||||
) -> FrameT:
|
||||
"""
|
||||
Construct a next_frame_T where s[l] is below 1e-3, so ESH must NOT be triggered,
|
||||
even if the ratio ds[l] could be large.
|
||||
|
||||
We place a single impulse of amplitude 'impulse_amp' inside one segment.
|
||||
Approx. segment energy: s[l] ~= impulse_amp^2.
|
||||
|
||||
Example:
|
||||
impulse_amp = 0.01 => s[l] ~= 1e-4 < 1e-3
|
||||
"""
|
||||
if not (1 <= segment_l <= 7):
|
||||
raise ValueError(f"segment_l must be in [1, 7], got {segment_l}.")
|
||||
|
||||
x = np.zeros((2048, 2), dtype=np.float64)
|
||||
|
||||
idx = segment_l * 128 + 10 # inside segment l
|
||||
if left:
|
||||
x[idx, 0] = impulse_amp
|
||||
if right:
|
||||
x[idx, 1] = impulse_amp
|
||||
|
||||
return x
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# 1) Fixed/mandatory cases (prev frame type forces current type)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
|
||||
"""
|
||||
Spec:
|
||||
- If prev was LSS => current MUST be ESH
|
||||
- If prev was LPS => current MUST be OLS
|
||||
independent of attack detection on (i+1).
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
|
||||
next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
||||
|
||||
out1 = aac_SSC(frame_t, next_attack, "LSS")
|
||||
assert out1 == "ESH"
|
||||
|
||||
out2 = aac_SSC(frame_t, next_attack, "LPS")
|
||||
assert out2 == "OLS"
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# 2) Cases requiring next-frame ESH prediction (attack computation)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def test_prev_ols_next_not_esh_returns_ols() -> None:
|
||||
"""
|
||||
If prev=OLS, current is:
|
||||
- LSS iff (i+1) is predicted ESH
|
||||
- else OLS
|
||||
Here: no attack => expect OLS.
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
next_t = _next_frame_no_attack()
|
||||
|
||||
out = aac_SSC(frame_t, next_t, "OLS")
|
||||
assert out == "OLS"
|
||||
|
||||
|
||||
def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
|
||||
"""
|
||||
prev=OLS and next predicted ESH for both channels:
|
||||
per-channel: LSS, LSS
|
||||
merged: LSS
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
next_t = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
||||
|
||||
out = aac_SSC(frame_t, next_t, "OLS")
|
||||
assert out == "LSS"
|
||||
|
||||
|
||||
def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
|
||||
"""
|
||||
prev=OLS:
|
||||
- one channel predicts ESH => LSS
|
||||
- other channel predicts not ESH => OLS
|
||||
Merge table: OLS + LSS => LSS (either side).
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
|
||||
next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
|
||||
out1 = aac_SSC(frame_t, next1_t, "OLS")
|
||||
assert out1 == "LSS"
|
||||
|
||||
next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
|
||||
out2 = aac_SSC(frame_t, next2_t, "OLS")
|
||||
assert out2 == "LSS"
|
||||
|
||||
|
||||
def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
|
||||
"""
|
||||
prev=ESH and next predicted ESH for both channels:
|
||||
per-channel: ESH, ESH
|
||||
merged: ESH
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
next_t = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
||||
|
||||
out = aac_SSC(frame_t, next_t, "ESH")
|
||||
assert out == "ESH"
|
||||
|
||||
|
||||
def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
|
||||
"""
|
||||
prev=ESH and next not predicted ESH for both channels:
|
||||
per-channel: LPS, LPS
|
||||
merged: LPS
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
next_t = _next_frame_no_attack()
|
||||
|
||||
out = aac_SSC(frame_t, next_t, "ESH")
|
||||
assert out == "LPS"
|
||||
|
||||
|
||||
def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
|
||||
"""
|
||||
prev=ESH:
|
||||
- one channel predicts ESH => ESH
|
||||
- other channel predicts not ESH => LPS
|
||||
Merge table: ESH + LPS => ESH (either side).
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
|
||||
next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
|
||||
out1 = aac_SSC(frame_t, next1_t, "ESH")
|
||||
assert out1 == "ESH"
|
||||
|
||||
next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
|
||||
out2 = aac_SSC(frame_t, next2_t, "ESH")
|
||||
assert out2 == "ESH"
|
||||
|
||||
|
||||
def test_threshold_s_must_exceed_1e_3() -> None:
|
||||
"""
|
||||
Spec: next frame is predicted ESH only if:
|
||||
s[l] > 1e-3 AND ds[l] > 10
|
||||
for some l in 1..7.
|
||||
|
||||
This test checks the necessity of the s[l] threshold:
|
||||
- Create a frame with s[l] ~= 1e-4 < 1e-3 (single impulse with amp 0.01).
|
||||
- Expect: not classified as ESH -> for prev=OLS return OLS.
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
next_t = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01)
|
||||
|
||||
out = aac_SSC(frame_t, next_t, "OLS")
|
||||
assert out == "OLS"
|
||||
@ -1,156 +0,0 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - AAC Coder/DecoderTests
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Tests for AAC Coder/Decoder module.
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import soundfile as sf
|
||||
|
||||
from core.aac_coder import aac_coder_1
|
||||
from core.aac_decoder import aac_decoder_1
|
||||
from core.aac_types import *
|
||||
|
||||
|
||||
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||
"""
|
||||
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x_ref : StereoSignal
|
||||
Reference signal, shape (N, 2) typical.
|
||||
x_hat : StereoSignal
|
||||
Reconstructed signal, shape (M, 2) typical.
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
SNR in dB.
|
||||
- Returns +inf if noise power is zero.
|
||||
- Returns -inf if signal power is zero.
|
||||
"""
|
||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||
|
||||
# Be conservative: align lengths and common channels.
|
||||
if x_ref.ndim == 1:
|
||||
x_ref = x_ref.reshape(-1, 1)
|
||||
if x_hat.ndim == 1:
|
||||
x_hat = x_hat.reshape(-1, 1)
|
||||
|
||||
n = min(x_ref.shape[0], x_hat.shape[0])
|
||||
c = min(x_ref.shape[1], x_hat.shape[1])
|
||||
|
||||
x_ref = x_ref[:n, :c]
|
||||
x_hat = x_hat[:n, :c]
|
||||
|
||||
err = x_ref - x_hat
|
||||
ps = float(np.sum(x_ref * x_ref))
|
||||
pn = float(np.sum(err * err))
|
||||
|
||||
if pn <= 0.0:
|
||||
return float("inf")
|
||||
if ps <= 0.0:
|
||||
return float("-inf")
|
||||
|
||||
return float(10.0 * np.log10(ps / pn))
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tmp_stereo_wav(tmp_path: Path) -> Path:
|
||||
"""
|
||||
Create a temporary 48 kHz stereo WAV with random samples.
|
||||
"""
|
||||
rng = np.random.default_rng(123)
|
||||
fs = 48000
|
||||
|
||||
# ~1 second of audio (kept small for test speed).
|
||||
n = fs
|
||||
x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64)
|
||||
|
||||
wav_path = tmp_path / "in.wav"
|
||||
sf.write(str(wav_path), x, fs)
|
||||
return wav_path
|
||||
|
||||
|
||||
def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
||||
"""
|
||||
Module-level contract test:
|
||||
Ensure aac_seq_1 follows the expected schema and per-frame shapes.
|
||||
"""
|
||||
aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav)
|
||||
|
||||
assert isinstance(aac_seq, list)
|
||||
assert len(aac_seq) > 0
|
||||
|
||||
for fr in aac_seq:
|
||||
assert isinstance(fr, dict)
|
||||
|
||||
# Required keys
|
||||
assert "frame_type" in fr
|
||||
assert "win_type" in fr
|
||||
assert "chl" in fr
|
||||
assert "chr" in fr
|
||||
|
||||
frame_type = fr["frame_type"]
|
||||
win_type = fr["win_type"]
|
||||
|
||||
assert frame_type in ("OLS", "LSS", "ESH", "LPS")
|
||||
assert win_type in ("SIN", "KBD")
|
||||
|
||||
assert isinstance(fr["chl"], dict)
|
||||
assert isinstance(fr["chr"], dict)
|
||||
assert "frame_F" in fr["chl"]
|
||||
assert "frame_F" in fr["chr"]
|
||||
|
||||
chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
|
||||
chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
|
||||
|
||||
if frame_type == "ESH":
|
||||
assert chl_f.shape == (128, 8)
|
||||
assert chr_f.shape == (128, 8)
|
||||
else:
|
||||
assert chl_f.shape == (1024, 1)
|
||||
assert chr_f.shape == (1024, 1)
|
||||
|
||||
|
||||
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
|
||||
"""
|
||||
End-to-end test:
|
||||
Encode + decode and check SNR is very high (numerical-noise only).
|
||||
|
||||
The threshold is intentionally loose to avoid fragility across platforms/BLAS.
|
||||
"""
|
||||
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
|
||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||
assert int(fs) == 48000
|
||||
|
||||
out_wav = tmp_path / "out.wav"
|
||||
|
||||
aac_seq = aac_coder_1(tmp_stereo_wav)
|
||||
x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav)
|
||||
|
||||
# Basic sanity: output file exists and is readable
|
||||
assert out_wav.exists()
|
||||
x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True)
|
||||
assert int(fs_hat) == 48000
|
||||
|
||||
# SNR against returned array (file should match closely, but we do not require it here).
|
||||
snr = _snr_db(x_ref, x_hat)
|
||||
assert snr > 80.0
|
||||
@ -1,269 +0,0 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Filterbank Tests
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Tests for Filterbank module.
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Sequence
|
||||
import pytest
|
||||
|
||||
from core.aac_filterbank import aac_filter_bank, aac_i_filter_bank
|
||||
from core.aac_types import *
|
||||
|
||||
# Helper fixtures for filterbank
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def _ola_reconstruct(x: StereoSignal, frame_types: Sequence[FrameType], win_type: WinType) -> StereoSignal:
|
||||
"""
|
||||
Analyze-synthesize each frame and overlap-add with hop=1024.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : StereoSignal
|
||||
Input stereo stream, expected shape (N, 2).
|
||||
frame_types : Sequence[FrameType]
|
||||
Length K sequence of frame types for frames starting at i*1024.
|
||||
win_type : WinType
|
||||
Window type ("SIN" or "KBD").
|
||||
|
||||
Returns
|
||||
-------
|
||||
StereoSignal
|
||||
Reconstructed stereo stream, same shape as x (N, 2).
|
||||
"""
|
||||
hop = 1024
|
||||
win = 2048
|
||||
K = len(frame_types)
|
||||
|
||||
y: StereoSignal = np.zeros_like(x, dtype=np.float64)
|
||||
|
||||
for i in range(K):
|
||||
start = i * hop
|
||||
frame_t: FrameT = x[start:start + win, :]
|
||||
frame_f: FrameF = aac_filter_bank(frame_t, frame_types[i], win_type)
|
||||
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_types[i], win_type)
|
||||
y[start:start + win, :] += frame_t_hat
|
||||
|
||||
return y
|
||||
|
||||
|
||||
def _snr_db(x: StereoSignal, y: StereoSignal) -> float:
|
||||
"""
|
||||
Compute SNR in dB over all samples/channels.
|
||||
"""
|
||||
err = x - y
|
||||
ps = float(np.sum(x * x))
|
||||
pn = float(np.sum(err * err))
|
||||
if pn <= 0.0:
|
||||
return float("inf")
|
||||
if ps <= 0.0:
|
||||
return float("-inf")
|
||||
return 10.0 * float(np.log10(ps / pn))
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Forward filterbank tests
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||
def test_filterbank_shapes_long_sequences(frame_type: FrameType, win_type: WinType) -> None:
|
||||
"""
|
||||
Contract test: for OLS/LSS/LPS, aac_filter_bank returns shape (1024, 2).
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
||||
assert frame_f.shape == (1024, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_filterbank_shapes_esh(win_type: WinType) -> None:
|
||||
"""
|
||||
Contract test: for ESH, aac_filter_bank returns shape (128, 16).
|
||||
"""
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
frame_f = aac_filter_bank(frame_t, "ESH", win_type)
|
||||
assert frame_f.shape == (128, 16)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_filterbank_channel_isolation_long_sequences(win_type: WinType) -> None:
|
||||
"""
|
||||
Behavior test: for OLS (representative long-sequence), channels are independent.
|
||||
If right channel is zero and left is random, right spectrum should be near zero.
|
||||
"""
|
||||
rng = np.random.default_rng(0)
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
frame_t[:, 0] = rng.normal(size=2048)
|
||||
|
||||
frame_f = aac_filter_bank(frame_t, "OLS", win_type)
|
||||
|
||||
assert np.max(np.abs(frame_f[:, 1])) < 1e-9
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_filterbank_channel_isolation_esh(win_type: WinType) -> None:
|
||||
"""
|
||||
Behavior test: for ESH, channels are independent.
|
||||
If right channel is zero and left is random, all odd columns (right) should be near zero.
|
||||
"""
|
||||
rng = np.random.default_rng(1)
|
||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
frame_t[:, 0] = rng.normal(size=2048)
|
||||
|
||||
frame_f = aac_filter_bank(frame_t, "ESH", win_type)
|
||||
|
||||
right_cols = frame_f[:, 1::2] # columns 1,3,5,...,15
|
||||
assert np.max(np.abs(right_cols)) < 1e-9
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_filterbank_esh_ignores_outer_regions(win_type: WinType) -> None:
|
||||
"""
|
||||
Spec-driven behavior test:
|
||||
ESH uses only the central region [448, 1600), split into 8 overlapping
|
||||
windows of length 256 with 50% overlap.
|
||||
|
||||
Therefore, changing samples outside [448, 1600) must not affect the output.
|
||||
"""
|
||||
rng = np.random.default_rng(2)
|
||||
|
||||
frame_a: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
frame_b: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||
|
||||
center = rng.normal(size=(1152, 2))
|
||||
frame_a[448:1600, :] = center
|
||||
frame_b[448:1600, :] = center
|
||||
|
||||
frame_b[0:448, :] = rng.normal(size=(448, 2))
|
||||
frame_b[1600:2048, :] = rng.normal(size=(448, 2))
|
||||
|
||||
fa = aac_filter_bank(frame_a, "ESH", win_type)
|
||||
fb = aac_filter_bank(frame_b, "ESH", win_type)
|
||||
|
||||
# Use a tiny tolerance to avoid flaky failures due to floating-point minutiae.
|
||||
np.testing.assert_allclose(fa, fb, rtol=0.0, atol=1e-12)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_filterbank_output_is_finite(win_type: WinType) -> None:
|
||||
"""
|
||||
Sanity test: output must not contain NaN or inf for representative cases.
|
||||
"""
|
||||
rng = np.random.default_rng(3)
|
||||
frame_t: FrameT = rng.normal(size=(2048, 2)).astype(np.float64)
|
||||
|
||||
for frame_type in ("OLS", "LSS", "ESH", "LPS"):
|
||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
||||
assert np.isfinite(frame_f).all()
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Reverse i_filterbank tests
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_ifilterbank_shapes_long_sequences(win_type: WinType) -> None:
|
||||
"""
|
||||
Contract test: for OLS/LSS/LPS, aac_i_filter_bank returns shape (2048, 2).
|
||||
"""
|
||||
frame_f: FrameF = np.zeros((1024, 2), dtype=np.float64)
|
||||
for frame_type in ("OLS", "LSS", "LPS"):
|
||||
frame_t = aac_i_filter_bank(frame_f, frame_type, win_type)
|
||||
assert frame_t.shape == (2048, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_ifilterbank_shapes_esh(win_type: WinType) -> None:
|
||||
"""
|
||||
Contract test: for ESH, aac_i_filter_bank returns shape (2048, 2).
|
||||
"""
|
||||
frame_f: FrameF = np.zeros((128, 16), dtype=np.float64)
|
||||
frame_t = aac_i_filter_bank(frame_f, "ESH", win_type)
|
||||
assert frame_t.shape == (2048, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_roundtrip_per_frame_is_finite(win_type: WinType) -> None:
|
||||
"""
|
||||
Sanity test: per-frame analysis+synthesis must produce finite outputs.
|
||||
"""
|
||||
rng = np.random.default_rng(0)
|
||||
frame_t: FrameT = rng.normal(size=(2048, 2)).astype(np.float64)
|
||||
|
||||
for frame_type in ("OLS", "LSS", "ESH", "LPS"):
|
||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
||||
frame_t_hat = aac_i_filter_bank(frame_f, frame_type, win_type)
|
||||
assert np.isfinite(frame_t_hat).all()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_ola_reconstruction_ols_high_snr(win_type: WinType) -> None:
|
||||
"""
|
||||
Module-level test:
|
||||
OLS analysis+synthesis with hop=1024 must reconstruct with high SNR
|
||||
in the steady-state region.
|
||||
"""
|
||||
rng = np.random.default_rng(1)
|
||||
|
||||
K = 6
|
||||
N = 1024 * (K + 1)
|
||||
x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64)
|
||||
|
||||
y = _ola_reconstruct(x, ["OLS"] * K, win_type)
|
||||
|
||||
a = 1024
|
||||
b = N - 1024
|
||||
snr = _snr_db(x[a:b, :], y[a:b, :])
|
||||
assert snr > 50.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_ola_reconstruction_esh_high_snr(win_type: WinType) -> None:
|
||||
"""
|
||||
Module-level test:
|
||||
ESH analysis+synthesis with hop=1024 must reconstruct with high SNR
|
||||
in the steady-state region.
|
||||
"""
|
||||
rng = np.random.default_rng(2)
|
||||
|
||||
K = 6
|
||||
N = 1024 * (K + 1)
|
||||
x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64)
|
||||
|
||||
y = _ola_reconstruct(x, ["ESH"] * K, win_type)
|
||||
|
||||
a = 1024
|
||||
b = N - 1024
|
||||
snr = _snr_db(x[a:b, :], y[a:b, :])
|
||||
assert snr > 45.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("win_type", ["SIN", "KBD"])
|
||||
def test_ola_reconstruction_transition_sequence(win_type: WinType) -> None:
|
||||
"""
|
||||
Transition sequence test matching the windowing logic:
|
||||
OLS -> LSS -> ESH -> LPS -> OLS -> OLS
|
||||
"""
|
||||
rng = np.random.default_rng(3)
|
||||
|
||||
frame_types: list[FrameType] = ["OLS", "LSS", "ESH", "LPS", "OLS", "OLS"]
|
||||
K = len(frame_types)
|
||||
N = 1024 * (K + 1)
|
||||
x: StereoSignal = rng.normal(size=(N, 2)).astype(np.float64)
|
||||
|
||||
y = _ola_reconstruct(x, frame_types, win_type)
|
||||
|
||||
a = 1024
|
||||
b = N - 1024
|
||||
snr = _snr_db(x[a:b, :], y[a:b, :])
|
||||
assert snr > 40.0
|
||||
@ -1,117 +0,0 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Filterbank internal (mdct) Tests
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Tests for Filterbank internal MDCT/IMDCT functionality.
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from core.aac_filterbank import _imdct, _mdct
|
||||
from core.aac_types import FloatArray, TimeSignal, MdctCoeffs
|
||||
|
||||
|
||||
def _assert_allclose(a: FloatArray, b: FloatArray, *, rtol: float, atol: float) -> None:
|
||||
"""
|
||||
Helper for consistent tolerances across tests.
|
||||
"""
|
||||
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
def _estimate_gain(y: MdctCoeffs, x: MdctCoeffs) -> float:
|
||||
"""
|
||||
Estimate scalar gain g such that y ~= g*x in least-squares sense.
|
||||
"""
|
||||
denom = float(np.dot(x, x))
|
||||
if denom == 0.0:
|
||||
return 0.0
|
||||
return float(np.dot(y, x) / denom)
|
||||
|
||||
|
||||
tolerance = 1e-10
|
||||
|
||||
@pytest.mark.parametrize("N", [256, 2048])
|
||||
def test_mdct_imdct_mdct_identity_up_to_gain(N: int) -> None:
|
||||
"""
|
||||
Consistency test in coefficient domain:
|
||||
mdct(imdct(X)) ~= g * X
|
||||
|
||||
For the chosen (non-orthonormal) scaling, g is expected to be close to 2.
|
||||
"""
|
||||
rng = np.random.default_rng(0)
|
||||
K = N // 2
|
||||
|
||||
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
|
||||
x: TimeSignal = _imdct(X)
|
||||
X_hat: MdctCoeffs = _mdct(x)
|
||||
|
||||
g = _estimate_gain(X_hat, X)
|
||||
_assert_allclose(X_hat, g * X, rtol=tolerance, atol=tolerance)
|
||||
_assert_allclose(np.array([g], dtype=np.float64), np.array([2.0], dtype=np.float64), rtol=tolerance, atol=tolerance)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("N", [256, 2048])
|
||||
def test_mdct_linearity(N: int) -> None:
|
||||
"""
|
||||
Linearity test:
|
||||
mdct(a*x + b*y) == a*mdct(x) + b*mdct(y)
|
||||
"""
|
||||
rng = np.random.default_rng(1)
|
||||
x: TimeSignal = rng.normal(size=N).astype(np.float64)
|
||||
y: TimeSignal = rng.normal(size=N).astype(np.float64)
|
||||
|
||||
a = 0.37
|
||||
b = -1.12
|
||||
|
||||
left: MdctCoeffs = _mdct(a * x + b * y)
|
||||
right: MdctCoeffs = a * _mdct(x) + b * _mdct(y)
|
||||
|
||||
_assert_allclose(left, right, rtol=tolerance, atol=tolerance)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("N", [256, 2048])
|
||||
def test_imdct_linearity(N: int) -> None:
|
||||
"""
|
||||
Linearity test for IMDCT:
|
||||
imdct(a*X + b*Y) == a*imdct(X) + b*imdct(Y)
|
||||
"""
|
||||
rng = np.random.default_rng(2)
|
||||
K = N // 2
|
||||
|
||||
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
|
||||
Y: MdctCoeffs = rng.normal(size=K).astype(np.float64)
|
||||
|
||||
a = -0.5
|
||||
b = 2.0
|
||||
|
||||
left: TimeSignal = _imdct(a * X + b * Y)
|
||||
right: TimeSignal = a * _imdct(X) + b * _imdct(Y)
|
||||
|
||||
_assert_allclose(left, right, rtol=tolerance, atol=tolerance)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("N", [256, 2048])
|
||||
def test_mdct_imdct_outputs_are_finite(N: int) -> None:
|
||||
"""
|
||||
Sanity test: no NaN/inf on random inputs.
|
||||
"""
|
||||
rng = np.random.default_rng(3)
|
||||
K = N // 2
|
||||
|
||||
x: TimeSignal = rng.normal(size=N).astype(np.float64)
|
||||
X: MdctCoeffs = rng.normal(size=K).astype(np.float64)
|
||||
|
||||
X1 = _mdct(x)
|
||||
x1 = _imdct(X)
|
||||
|
||||
assert np.isfinite(X1).all()
|
||||
assert np.isfinite(x1).all()
|
||||
@ -22,13 +22,13 @@ from __future__ import annotations
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
||||
from core.aac_types import AACSeq1, StereoSignal
|
||||
from core.aac_coder import aac_coder_1 as core_aac_coder_1
|
||||
from core.aac_coder import aac_read_wav_stereo_48k
|
||||
from core.aac_decoder import aac_decoder_1 as core_aac_decoder_1
|
||||
from core.aac_snr_db import snr_db
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -80,50 +80,6 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
||||
# Demo (Level 1)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def _snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||
"""
|
||||
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x_ref : StereoSignal
|
||||
Reference stereo stream.
|
||||
x_hat : StereoSignal
|
||||
Reconstructed stereo stream.
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
SNR in dB.
|
||||
- Returns +inf if noise power is zero.
|
||||
- Returns -inf if signal power is zero.
|
||||
"""
|
||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||
|
||||
if x_ref.ndim == 1:
|
||||
x_ref = x_ref.reshape(-1, 1)
|
||||
if x_hat.ndim == 1:
|
||||
x_hat = x_hat.reshape(-1, 1)
|
||||
|
||||
n = min(x_ref.shape[0], x_hat.shape[0])
|
||||
c = min(x_ref.shape[1], x_hat.shape[1])
|
||||
|
||||
x_ref = x_ref[:n, :c]
|
||||
x_hat = x_hat[:n, :c]
|
||||
|
||||
err = x_ref - x_hat
|
||||
ps = float(np.sum(x_ref * x_ref))
|
||||
pn = float(np.sum(err * err))
|
||||
|
||||
if pn <= 0.0:
|
||||
return float("inf")
|
||||
if ps <= 0.0:
|
||||
return float("-inf")
|
||||
|
||||
return float(10.0 * np.log10(ps / pn))
|
||||
|
||||
|
||||
def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float:
|
||||
"""
|
||||
Demonstration for the Level-1 codec.
|
||||
@ -158,12 +114,11 @@ def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) ->
|
||||
x_hat = aac_decoder_1(aac_seq_1, filename_out)
|
||||
|
||||
# Optional sanity: ensure output file exists and is readable
|
||||
x_hat_file, fs_hat = sf.read(str(filename_out), always_2d=True)
|
||||
_ = x_hat_file
|
||||
_, fs_hat = sf.read(str(filename_out), always_2d=True)
|
||||
if int(fs_hat) != 48000:
|
||||
raise ValueError("Decoded output sampling rate must be 48 kHz.")
|
||||
|
||||
return _snr_db(x_ref, x_hat)
|
||||
return snr_db(x_ref, x_hat)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -172,11 +127,14 @@ def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) ->
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Example:
|
||||
# python -m level_1.level_1 input.wav output.wav
|
||||
# cd level_1
|
||||
# python -m level_1 input.wav output.wav
|
||||
# or
|
||||
# python -m level_1 material/LicorDeCalandraca.wav LicorDeCalandraca_out.wav
|
||||
import sys
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
raise SystemExit("Usage: python -m level_1.level_1 <input.wav> <output.wav>")
|
||||
raise SystemExit("Usage: python -m level_1 <input.wav> <output.wav>")
|
||||
|
||||
in_wav = Path(sys.argv[1])
|
||||
out_wav = Path(sys.argv[2])
|
||||
|
||||
BIN
source/level_1/material/LicorDeCalandraca.wav
Normal file
BIN
source/level_1/material/LicorDeCalandraca.wav
Normal file
Binary file not shown.
BIN
source/level_1/material/TableB219.mat
Normal file
BIN
source/level_1/material/TableB219.mat
Normal file
Binary file not shown.
BIN
source/level_1/material/huffCodebooks.mat
Normal file
BIN
source/level_1/material/huffCodebooks.mat
Normal file
Binary file not shown.
400
source/level_1/material/huff_utils.py
Normal file
400
source/level_1/material/huff_utils.py
Normal file
@ -0,0 +1,400 @@
|
||||
import numpy as np
|
||||
import scipy.io as sio
|
||||
import os
|
||||
|
||||
# ------------------ LOAD LUT ------------------
|
||||
|
||||
def load_LUT(mat_filename=None):
|
||||
"""
|
||||
Loads the list of Huffman Codebooks (LUTs)
|
||||
|
||||
Returns:
|
||||
huffLUT : list (index 1..11 used, index 0 unused)
|
||||
"""
|
||||
if mat_filename is None:
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
mat_filename = os.path.join(current_dir, "huffCodebooks.mat")
|
||||
|
||||
mat = sio.loadmat(mat_filename)
|
||||
|
||||
|
||||
huffCodebooks_raw = mat['huffCodebooks'].squeeze()
|
||||
|
||||
huffCodebooks = []
|
||||
for i in range(11):
|
||||
huffCodebooks.append(np.array(huffCodebooks_raw[i]))
|
||||
|
||||
# Build inverse VLC tables
|
||||
invTable = [None] * 11
|
||||
|
||||
for i in range(11):
|
||||
h = huffCodebooks[i][:, 2].astype(int) # column 3
|
||||
hlength = huffCodebooks[i][:, 1].astype(int) # column 2
|
||||
|
||||
hbin = []
|
||||
for j in range(len(h)):
|
||||
hbin.append(format(h[j], f'0{hlength[j]}b'))
|
||||
|
||||
invTable[i] = vlc_table(hbin)
|
||||
|
||||
# Build Huffman LUT dicts
|
||||
huffLUT = [None] * 12 # index 0 unused
|
||||
params = [
|
||||
(4, 1, True),
|
||||
(4, 1, True),
|
||||
(4, 2, False),
|
||||
(4, 2, False),
|
||||
(2, 4, True),
|
||||
(2, 4, True),
|
||||
(2, 7, False),
|
||||
(2, 7, False),
|
||||
(2, 12, False),
|
||||
(2, 12, False),
|
||||
(2, 16, False),
|
||||
]
|
||||
|
||||
for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1):
|
||||
huffLUT[i] = {
|
||||
'LUT': huffCodebooks[i-1],
|
||||
'invTable': invTable[i-1],
|
||||
'codebook': i,
|
||||
'nTupleSize': nTupleSize,
|
||||
'maxAbsCodeVal': maxAbs,
|
||||
'signedValues': signed
|
||||
}
|
||||
|
||||
return huffLUT
|
||||
|
||||
def vlc_table(code_array):
|
||||
"""
|
||||
codeArray: list of strings, each string is a Huffman codeword (e.g. '0101')
|
||||
returns:
|
||||
h : NumPy array of shape (num_nodes, 3)
|
||||
columns:
|
||||
[ next_if_0 , next_if_1 , symbol_index ]
|
||||
"""
|
||||
h = np.zeros((1, 3), dtype=int)
|
||||
|
||||
for code_index, code in enumerate(code_array, start=1):
|
||||
word = [int(bit) for bit in code]
|
||||
h_index = 0
|
||||
|
||||
for bit in word:
|
||||
k = bit
|
||||
next_node = h[h_index, k]
|
||||
if next_node == 0:
|
||||
h = np.vstack([h, [0, 0, 0]])
|
||||
new_index = h.shape[0] - 1
|
||||
h[h_index, k] = new_index
|
||||
h_index = new_index
|
||||
else:
|
||||
h_index = next_node
|
||||
|
||||
h[h_index, 2] = code_index
|
||||
|
||||
return h
|
||||
|
||||
# ------------------ ENCODE ------------------
|
||||
|
||||
def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None):
|
||||
"""
|
||||
Huffman-encode a sequence of quantized coefficients.
|
||||
|
||||
This function selects the appropriate Huffman codebook based on the
|
||||
maximum absolute value of the input coefficients, encodes the coefficients
|
||||
into a binary Huffman bitstream, and returns both the bitstream and the
|
||||
selected codebook index.
|
||||
|
||||
This is the Python equivalent of the MATLAB `encodeHuff.m` function used
|
||||
in audio/image coding (e.g., scale factor band encoding). The input
|
||||
coefficient sequence is grouped into fixed-size tuples as defined by
|
||||
the chosen Huffman LUT. Zero-padding may be applied internally.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
coeff_sec : array_like of int
|
||||
1-D array of quantized integer coefficients to encode.
|
||||
Typically corresponds to a "section" or scale-factor band.
|
||||
|
||||
huff_LUT_list : list
|
||||
List of Huffman lookup-table dictionaries as returned by `loadLUT()`.
|
||||
Index 1..11 correspond to valid Huffman codebooks.
|
||||
Index 0 is unused.
|
||||
|
||||
Returns
|
||||
-------
|
||||
huffSec : str
|
||||
Huffman-encoded bitstream represented as a string of '0' and '1'
|
||||
characters.
|
||||
|
||||
huffCodebook : int
|
||||
Index (1..11) of the Huffman codebook used for encoding.
|
||||
A value of 0 indicates a special all-zero section.
|
||||
"""
|
||||
if force_codebook is not None:
|
||||
return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec)
|
||||
|
||||
maxAbsVal = np.max(np.abs(coeff_sec))
|
||||
|
||||
if maxAbsVal == 0:
|
||||
huffCodebook = 0
|
||||
huffSec = huff_LUT_code_0()
|
||||
|
||||
elif maxAbsVal == 1:
|
||||
candidates = [1, 2]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal == 2:
|
||||
candidates = [3, 4]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal in (3, 4):
|
||||
candidates = [5, 6]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal in (5, 6, 7):
|
||||
candidates = [7, 8]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal in (8, 9, 10, 11, 12):
|
||||
candidates = [9, 10]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal in (13, 14, 15):
|
||||
huffCodebook = 11
|
||||
huffSec = huff_LUT_code_1(huff_LUT_list[huffCodebook], coeff_sec)
|
||||
|
||||
else:
|
||||
huffCodebook = 11
|
||||
huffSec = huff_LUT_code_ESC(huff_LUT_list[huffCodebook], coeff_sec)
|
||||
|
||||
return huffSec, huffCodebook
|
||||
|
||||
def huff_LUT_code_1(huff_LUT, coeff_sec):
|
||||
LUT = huff_LUT['LUT']
|
||||
nTupleSize = huff_LUT['nTupleSize']
|
||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
||||
signedValues = huff_LUT['signedValues']
|
||||
|
||||
numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
|
||||
|
||||
if signedValues:
|
||||
coeff = coeff_sec + maxAbsCodeVal
|
||||
base = 2 * maxAbsCodeVal + 1
|
||||
else:
|
||||
coeff = coeff_sec
|
||||
base = maxAbsCodeVal + 1
|
||||
|
||||
coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
|
||||
coeffPad[:len(coeff)] = coeff
|
||||
|
||||
huffSec = []
|
||||
|
||||
powers = base ** np.arange(nTupleSize - 1, -1, -1)
|
||||
|
||||
for i in range(numTuples):
|
||||
nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
|
||||
huffIndex = int(np.abs(nTuple) @ powers)
|
||||
|
||||
hexVal = LUT[huffIndex, 2]
|
||||
huffLen = LUT[huffIndex, 1]
|
||||
|
||||
bits = format(int(hexVal), f'0{int(huffLen)}b')
|
||||
|
||||
if signedValues:
|
||||
huffSec.append(bits)
|
||||
else:
|
||||
signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
|
||||
huffSec.append(bits + signBits)
|
||||
|
||||
return ''.join(huffSec)
|
||||
|
||||
def huff_LUT_code_0():
|
||||
return ''
|
||||
|
||||
def huff_LUT_code_ESC(huff_LUT, coeff_sec):
|
||||
LUT = huff_LUT['LUT']
|
||||
nTupleSize = huff_LUT['nTupleSize']
|
||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
||||
|
||||
numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
|
||||
base = maxAbsCodeVal + 1
|
||||
|
||||
coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
|
||||
coeffPad[:len(coeff_sec)] = coeff_sec
|
||||
|
||||
huffSec = []
|
||||
powers = base ** np.arange(nTupleSize - 1, -1, -1)
|
||||
|
||||
for i in range(numTuples):
|
||||
nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
|
||||
|
||||
lnTuple = nTuple.astype(float)
|
||||
lnTuple[lnTuple == 0] = np.finfo(float).eps
|
||||
|
||||
N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int))
|
||||
N = np.maximum(0, N4 - 4)
|
||||
esc = np.abs(nTuple) > 15
|
||||
|
||||
nTupleESC = nTuple.copy()
|
||||
nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16
|
||||
|
||||
huffIndex = int(np.abs(nTupleESC) @ powers)
|
||||
|
||||
hexVal = LUT[huffIndex, 2]
|
||||
huffLen = LUT[huffIndex, 1]
|
||||
|
||||
bits = format(int(hexVal), f'0{int(huffLen)}b')
|
||||
|
||||
escSeq = ''
|
||||
for k in range(nTupleSize):
|
||||
if esc[k]:
|
||||
escSeq += '1' * N[k]
|
||||
escSeq += '0'
|
||||
escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b')
|
||||
|
||||
signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
|
||||
huffSec.append(bits + signBits + escSeq)
|
||||
|
||||
return ''.join(huffSec)
|
||||
|
||||
# ------------------ DECODE ------------------
|
||||
|
||||
def decode_huff(huff_sec, huff_LUT):
|
||||
"""
|
||||
Decode a Huffman-encoded stream.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
huff_sec : array-like of int or str
|
||||
Huffman encoded stream as a sequence of 0 and 1 (string or list/array).
|
||||
huff_LUT : dict
|
||||
Huffman lookup table with keys:
|
||||
- 'invTable': inverse table (numpy array)
|
||||
- 'codebook': codebook number
|
||||
- 'nTupleSize': tuple size
|
||||
- 'maxAbsCodeVal': maximum absolute code value
|
||||
- 'signedValues': True/False
|
||||
|
||||
Returns
|
||||
-------
|
||||
decCoeffs : list of int
|
||||
Decoded quantized coefficients.
|
||||
"""
|
||||
|
||||
h = huff_LUT['invTable']
|
||||
huffCodebook = huff_LUT['codebook']
|
||||
nTupleSize = huff_LUT['nTupleSize']
|
||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
||||
signedValues = huff_LUT['signedValues']
|
||||
|
||||
# Convert string to array of ints
|
||||
if isinstance(huff_sec, str):
|
||||
huff_sec = np.array([int(b) for b in huff_sec])
|
||||
|
||||
eos = False
|
||||
decCoeffs = []
|
||||
streamIndex = 0
|
||||
|
||||
while not eos:
|
||||
wordbit = 0
|
||||
r = 0 # start at root
|
||||
|
||||
# Decode Huffman word using inverse table
|
||||
while True:
|
||||
b = huff_sec[streamIndex + wordbit]
|
||||
wordbit += 1
|
||||
rOld = r
|
||||
r = h[rOld, b]
|
||||
if h[r, 0] == 0 and h[r, 1] == 0:
|
||||
symbolIndex = h[r, 2] - 1 # zero-based
|
||||
streamIndex += wordbit
|
||||
break
|
||||
|
||||
# Decode n-tuple magnitudes
|
||||
if signedValues:
|
||||
base = 2 * maxAbsCodeVal + 1
|
||||
nTupleDec = []
|
||||
tmp = symbolIndex
|
||||
for p in reversed(range(nTupleSize)):
|
||||
val = tmp // (base ** p)
|
||||
nTupleDec.append(val - maxAbsCodeVal)
|
||||
tmp = tmp % (base ** p)
|
||||
nTupleDec = np.array(nTupleDec)
|
||||
else:
|
||||
base = maxAbsCodeVal + 1
|
||||
nTupleDec = []
|
||||
tmp = symbolIndex
|
||||
for p in reversed(range(nTupleSize)):
|
||||
val = tmp // (base ** p)
|
||||
nTupleDec.append(val)
|
||||
tmp = tmp % (base ** p)
|
||||
nTupleDec = np.array(nTupleDec)
|
||||
|
||||
# Apply sign bits
|
||||
nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize]
|
||||
nTupleSign = -(np.sign(nTupleSignBits - 0.5))
|
||||
streamIndex += nTupleSize
|
||||
nTupleDec = nTupleDec * nTupleSign
|
||||
|
||||
# Handle escape sequences
|
||||
escIndex = np.where(np.abs(nTupleDec) == 16)[0]
|
||||
if huffCodebook == 11 and escIndex.size > 0:
|
||||
for idx in escIndex:
|
||||
N = 0
|
||||
b = huff_sec[streamIndex]
|
||||
while b:
|
||||
N += 1
|
||||
b = huff_sec[streamIndex + N]
|
||||
streamIndex += N
|
||||
N4 = N + 4
|
||||
escape_word = huff_sec[streamIndex:streamIndex + N4]
|
||||
escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2)
|
||||
nTupleDec[idx] = escape_value
|
||||
streamIndex += N4 + 1
|
||||
# Apply signs again
|
||||
nTupleDec[escIndex] *= nTupleSign[escIndex]
|
||||
|
||||
decCoeffs.extend(nTupleDec.tolist())
|
||||
|
||||
if streamIndex >= len(huff_sec):
|
||||
eos = True
|
||||
|
||||
return decCoeffs
|
||||
|
||||
|
||||
284
source/level_2/core/aac_coder.py
Normal file
284
source/level_2/core/aac_coder.py
Normal file
@ -0,0 +1,284 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - AAC Coder (Core)
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Level 1 AAC encoder orchestration.
|
||||
# Keeps the same functional behavior as the original level_1 implementation:
|
||||
# - Reads WAV via soundfile
|
||||
# - Validates stereo and 48 kHz
|
||||
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
|
||||
# - SSC decision uses next-frame attack detection
|
||||
# - Filterbank analysis (MDCT)
|
||||
# - Stores per-channel spectra in AACSeq1 schema:
|
||||
# * ESH: (128, 8)
|
||||
# * else: (1024, 1)
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import soundfile as sf
|
||||
|
||||
from core.aac_configuration import WIN_TYPE
|
||||
from core.aac_filterbank import aac_filter_bank
|
||||
from core.aac_ssc import aac_SSC
|
||||
from core.aac_tns import aac_tns
|
||||
from core.aac_types import *
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Public helpers (useful for level_x demo wrappers)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
|
||||
"""
|
||||
Read a WAV file using soundfile and validate the Level-1 assumptions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename_in : Union[str, Path]
|
||||
Input WAV filename.
|
||||
|
||||
Returns
|
||||
-------
|
||||
x : StereoSignal (np.ndarray)
|
||||
Stereo samples as float64, shape (N, 2).
|
||||
fs : int
|
||||
Sampling rate (Hz). Must be 48000.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the input is not stereo or the sampling rate is not 48 kHz.
|
||||
"""
|
||||
filename_in = Path(filename_in)
|
||||
|
||||
x, fs = sf.read(str(filename_in), always_2d=True)
|
||||
x = np.asarray(x, dtype=np.float64)
|
||||
|
||||
if x.shape[1] != 2:
|
||||
raise ValueError("Input must be stereo (2 channels).")
|
||||
if int(fs) != 48000:
|
||||
raise ValueError("Input sampling rate must be 48 kHz.")
|
||||
|
||||
return x, int(fs)
|
||||
|
||||
|
||||
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
|
||||
"""
|
||||
Convert the stereo FrameF returned by aac_filter_bank() into per-channel arrays
|
||||
as required by the Level-1 AACSeq1 schema.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_type : FrameType
|
||||
"OLS" | "LSS" | "ESH" | "LPS".
|
||||
frame_f : FrameF
|
||||
Output of aac_filter_bank():
|
||||
- If frame_type != "ESH": shape (1024, 2)
|
||||
- If frame_type == "ESH": shape (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
|
||||
|
||||
Returns
|
||||
-------
|
||||
chl_f : FrameChannelF
|
||||
Left channel coefficients:
|
||||
- ESH: shape (128, 8)
|
||||
- else: shape (1024, 1)
|
||||
chr_f : FrameChannelF
|
||||
Right channel coefficients:
|
||||
- ESH: shape (128, 8)
|
||||
- else: shape (1024, 1)
|
||||
"""
|
||||
if frame_type == "ESH":
|
||||
if frame_f.shape != (128, 16):
|
||||
raise ValueError("For ESH, frame_f must have shape (128, 16).")
|
||||
|
||||
chl_f = np.empty((128, 8), dtype=np.float64)
|
||||
chr_f = np.empty((128, 8), dtype=np.float64)
|
||||
for j in range(8):
|
||||
chl_f[:, j] = frame_f[:, 2 * j + 0]
|
||||
chr_f[:, j] = frame_f[:, 2 * j + 1]
|
||||
return chl_f, chr_f
|
||||
|
||||
# Non-ESH: store as (1024, 1) as required by the original Level-1 schema.
|
||||
if frame_f.shape != (1024, 2):
|
||||
raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")
|
||||
|
||||
chl_f = frame_f[:, 0:1].astype(np.float64, copy=False)
|
||||
chr_f = frame_f[:, 1:2].astype(np.float64, copy=False)
|
||||
return chl_f, chr_f
|
||||
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 1 encoder
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
||||
"""
|
||||
Level-1 AAC encoder.
|
||||
|
||||
This function preserves the behavior of the original level_1 implementation:
|
||||
- Read stereo 48 kHz WAV
|
||||
- Pad hop samples at start and hop samples at end
|
||||
- Frame with win=2048, hop=1024
|
||||
- Use SSC with next-frame lookahead
|
||||
- Apply filterbank analysis
|
||||
- Store per-channel coefficients using AACSeq1 schema
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename_in : Union[str, Path]
|
||||
Input WAV filename.
|
||||
Assumption: stereo audio, sampling rate 48 kHz.
|
||||
|
||||
Returns
|
||||
-------
|
||||
AACSeq1
|
||||
List of encoded frames (Level 1 schema).
|
||||
"""
|
||||
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||
# The assignment assumes 48 kHz
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
|
||||
# Pad at the beginning to support the first overlap region.
|
||||
# Tail padding is kept minimal; next-frame is padded on-the-fly when needed.
|
||||
pad_pre = np.zeros((hop, 2), dtype=np.float64)
|
||||
pad_post = np.zeros((hop, 2), dtype=np.float64)
|
||||
x_pad = np.vstack([pad_pre, x, pad_post])
|
||||
|
||||
# Number of frames such that current frame fits; next frame will be padded if needed.
|
||||
K = int((x_pad.shape[0] - win) // hop + 1)
|
||||
if K <= 0:
|
||||
raise ValueError("Input too short for framing.")
|
||||
|
||||
aac_seq: AACSeq1 = []
|
||||
prev_frame_type: FrameType = "OLS"
|
||||
|
||||
win_type: WinType = WIN_TYPE
|
||||
|
||||
for i in range(K):
|
||||
start = i * hop
|
||||
|
||||
frame_t: FrameT = x_pad[start:start + win, :]
|
||||
if frame_t.shape != (win, 2):
|
||||
# This should not happen due to K definition, but keep it explicit.
|
||||
raise ValueError("Internal framing error: frame_t has wrong shape.")
|
||||
|
||||
next_t = x_pad[start + hop:start + hop + win, :]
|
||||
|
||||
# Ensure next_t is always (2048, 2) by zero-padding at the tail.
|
||||
if next_t.shape[0] < win:
|
||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||
next_t = np.vstack([next_t, tail])
|
||||
|
||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
||||
|
||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
||||
|
||||
aac_seq.append({
|
||||
"frame_type": frame_type,
|
||||
"win_type": win_type,
|
||||
"chl": {"frame_F": chl_f},
|
||||
"chr": {"frame_F": chr_f},
|
||||
})
|
||||
|
||||
prev_frame_type = frame_type
|
||||
|
||||
return aac_seq
|
||||
|
||||
|
||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
||||
"""
|
||||
Level-2 AAC encoder (Level 1 + TNS).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename_in : Union[str, Path]
|
||||
Input WAV filename (stereo, 48 kHz).
|
||||
|
||||
Returns
|
||||
-------
|
||||
AACSeq2
|
||||
Encoded AAC sequence (Level 2 payload schema).
|
||||
For each frame i:
|
||||
- "frame_type": FrameType
|
||||
- "win_type": WinType
|
||||
- "chl"/"chr":
|
||||
- "frame_F": FrameChannelF (after TNS)
|
||||
- "tns_coeffs": TnsCoeffs
|
||||
"""
|
||||
filename_in = Path(filename_in)
|
||||
|
||||
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||
# The assignment assumes 48 kHz
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
|
||||
pad_pre = np.zeros((hop, 2), dtype=np.float64)
|
||||
pad_post = np.zeros((hop, 2), dtype=np.float64)
|
||||
x_pad = np.vstack([pad_pre, x, pad_post])
|
||||
|
||||
K = int((x_pad.shape[0] - win) // hop + 1)
|
||||
if K <= 0:
|
||||
raise ValueError("Input too short for framing.")
|
||||
|
||||
aac_seq: AACSeq2 = []
|
||||
prev_frame_type: FrameType = "OLS"
|
||||
|
||||
for i in range(K):
|
||||
start = i * hop
|
||||
|
||||
frame_t: FrameT = x_pad[start : start + win, :]
|
||||
if frame_t.shape != (win, 2):
|
||||
raise ValueError("Internal framing error: frame_t has wrong shape.")
|
||||
|
||||
next_t = x_pad[start + hop : start + hop + win, :]
|
||||
if next_t.shape[0] < win:
|
||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||
next_t = np.vstack([next_t, tail])
|
||||
|
||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
||||
|
||||
# Level 1 analysis (packed stereo container)
|
||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||
|
||||
# Unpack to per-channel (as you already do in Level 1)
|
||||
if frame_type == "ESH":
|
||||
chl_f = np.empty((128, 8), dtype=np.float64)
|
||||
chr_f = np.empty((128, 8), dtype=np.float64)
|
||||
for j in range(8):
|
||||
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
|
||||
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
|
||||
else:
|
||||
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
|
||||
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
|
||||
|
||||
# Level 2: apply TNS per channel
|
||||
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||
chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)
|
||||
|
||||
aac_seq.append(
|
||||
{
|
||||
"frame_type": frame_type,
|
||||
"win_type": WIN_TYPE,
|
||||
"chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
|
||||
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
||||
}
|
||||
)
|
||||
|
||||
prev_frame_type = frame_type
|
||||
|
||||
return aac_seq
|
||||
31
source/level_2/core/aac_configuration.py
Normal file
31
source/level_2/core/aac_configuration.py
Normal file
@ -0,0 +1,31 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Configuration
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# This module contains the global configurations
|
||||
#
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
# Imports
|
||||
from core.aac_types import WinType
|
||||
|
||||
# Filterbank
|
||||
# ------------------------------------------------------------
|
||||
# Window type
|
||||
# Options: "SIN", "KBD"
|
||||
WIN_TYPE: WinType = "SIN"
|
||||
|
||||
|
||||
# TNS
|
||||
# ------------------------------------------------------------
|
||||
PRED_ORDER = 4
|
||||
QUANT_STEP = 0.1
|
||||
QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7]
|
||||
257
source/level_2/core/aac_decoder.py
Normal file
257
source/level_2/core/aac_decoder.py
Normal file
@ -0,0 +1,257 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Inverse AAC Coder (Core)
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
|
||||
# Keeps the same functional behavior as the original level_1 implementation:
|
||||
# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank()
|
||||
# - IMDCT synthesis per frame
|
||||
# - Overlap-add with hop=1024
|
||||
# - Remove encoder boundary padding: hop at start and hop at end
|
||||
#
|
||||
# Note:
|
||||
# This core module returns the reconstructed samples. Writing to disk is kept
|
||||
# in level_x demos.
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import soundfile as sf
|
||||
|
||||
from core.aac_filterbank import aac_i_filter_bank
|
||||
from core.aac_tns import aac_i_tns
|
||||
from core.aac_types import *
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Public helpers (useful for level_x demo wrappers)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
|
||||
"""
|
||||
Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo
|
||||
FrameF container expected by aac_i_filter_bank().
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_type : FrameType
|
||||
"OLS" | "LSS" | "ESH" | "LPS".
|
||||
chl_f : FrameChannelF
|
||||
Left channel coefficients:
|
||||
- ESH: (128, 8)
|
||||
- else: (1024, 1)
|
||||
chr_f : FrameChannelF
|
||||
Right channel coefficients:
|
||||
- ESH: (128, 8)
|
||||
- else: (1024, 1)
|
||||
|
||||
Returns
|
||||
-------
|
||||
FrameF
|
||||
Stereo coefficients:
|
||||
- ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
|
||||
- else: (1024, 2)
|
||||
"""
|
||||
if frame_type == "ESH":
|
||||
if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
|
||||
raise ValueError("ESH channel frame_F must have shape (128, 8).")
|
||||
|
||||
frame_f = np.empty((128, 16), dtype=np.float64)
|
||||
for j in range(8):
|
||||
frame_f[:, 2 * j + 0] = chl_f[:, j]
|
||||
frame_f[:, 2 * j + 1] = chr_f[:, j]
|
||||
return frame_f
|
||||
|
||||
# Non-ESH: expected (1024, 1) per channel in Level-1 schema.
|
||||
if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
|
||||
raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")
|
||||
|
||||
frame_f = np.empty((1024, 2), dtype=np.float64)
|
||||
frame_f[:, 0] = chl_f[:, 0]
|
||||
frame_f[:, 1] = chr_f[:, 0]
|
||||
return frame_f
|
||||
|
||||
|
||||
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
|
||||
"""
|
||||
Remove the boundary padding that the Level-1 encoder adds:
|
||||
hop samples at start and hop samples at end.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y_pad : StereoSignal (np.ndarray)
|
||||
Reconstructed padded stream, shape (N_pad, 2).
|
||||
hop : int
|
||||
Hop size in samples (default 1024).
|
||||
|
||||
Returns
|
||||
-------
|
||||
StereoSignal (np.ndarray)
|
||||
Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If y_pad is too short to unpad.
|
||||
"""
|
||||
if y_pad.shape[0] < 2 * hop:
|
||||
raise ValueError("Decoded stream too short to unpad.")
|
||||
return y_pad[hop:-hop, :]
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 1 decoder (core)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
|
||||
"""
|
||||
Level-1 AAC decoder (inverse of aac_coder_1()).
|
||||
|
||||
This function preserves the behavior of the original level_1 implementation:
|
||||
- Reconstruct the full padded stream by overlap-adding K synthesized frames
|
||||
- Remove hop padding at the beginning and hop padding at the end
|
||||
- Write the reconstructed stereo WAV file (48 kHz)
|
||||
- Return reconstructed stereo samples as float64
|
||||
|
||||
Parameters
|
||||
----------
|
||||
aac_seq_1 : AACSeq1
|
||||
Encoded sequence as produced by aac_coder_1().
|
||||
filename_out : Union[str, Path]
|
||||
Output WAV filename. Assumption: 48 kHz, stereo.
|
||||
|
||||
Returns
|
||||
-------
|
||||
StereoSignal
|
||||
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||
"""
|
||||
filename_out = Path(filename_out)
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
K = len(aac_seq_1)
|
||||
|
||||
# Output includes the encoder padding region, so we reconstruct the full padded stream.
|
||||
# For K frames: last frame starts at (K-1)*hop and spans win,
|
||||
# so total length = (K-1)*hop + win.
|
||||
n_pad = (K - 1) * hop + win
|
||||
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
|
||||
|
||||
for i, fr in enumerate(aac_seq_1):
|
||||
frame_type: FrameType = fr["frame_type"]
|
||||
win_type: WinType = fr["win_type"]
|
||||
|
||||
chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
|
||||
chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
|
||||
|
||||
frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f)
|
||||
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2)
|
||||
|
||||
start = i * hop
|
||||
y_pad[start:start + win, :] += frame_t_hat
|
||||
|
||||
y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
|
||||
|
||||
# Level 1 assumption: 48 kHz output.
|
||||
sf.write(str(filename_out), y, 48000)
|
||||
|
||||
return y
|
||||
|
||||
|
||||
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
||||
"""
|
||||
Level-2 AAC decoder (inverse of aac_coder_2).
|
||||
|
||||
Behavior matches Level 1 decoder pipeline, with additional iTNS stage:
|
||||
- Per frame/channel: inverse TNS using stored coefficients
|
||||
- Re-pack to stereo frame_F
|
||||
- IMDCT + windowing
|
||||
- Overlap-add over frames
|
||||
- Remove Level-1 padding (hop samples start/end)
|
||||
- Write output WAV (48 kHz)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
aac_seq_2 : AACSeq2
|
||||
Encoded sequence as produced by aac_coder_2().
|
||||
filename_out : Union[str, Path]
|
||||
Output WAV filename.
|
||||
|
||||
Returns
|
||||
-------
|
||||
StereoSignal
|
||||
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||
"""
|
||||
filename_out = Path(filename_out)
|
||||
|
||||
hop = 1024
|
||||
win = 2048
|
||||
K = len(aac_seq_2)
|
||||
|
||||
if K <= 0:
|
||||
raise ValueError("aac_seq_2 must contain at least one frame.")
|
||||
|
||||
n_pad = (K - 1) * hop + win
|
||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||
|
||||
for i, fr in enumerate(aac_seq_2):
|
||||
frame_type: FrameType = fr["frame_type"]
|
||||
win_type: WinType = fr["win_type"]
|
||||
|
||||
chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
|
||||
chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
|
||||
|
||||
chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
|
||||
chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)
|
||||
|
||||
# Inverse TNS per channel
|
||||
chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
|
||||
chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)
|
||||
|
||||
# Re-pack to the stereo container expected by aac_i_filter_bank
|
||||
if frame_type == "ESH":
|
||||
if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
|
||||
raise ValueError("ESH channel frame_F must have shape (128, 8).")
|
||||
|
||||
frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
|
||||
for j in range(8):
|
||||
frame_f[:, 2 * j + 0] = chl_f[:, j]
|
||||
frame_f[:, 2 * j + 1] = chr_f[:, j]
|
||||
else:
|
||||
# Accept either (1024,1) or (1024,) from your internal convention.
|
||||
if chl_f.shape == (1024,):
|
||||
chl_col = chl_f.reshape(1024, 1)
|
||||
elif chl_f.shape == (1024, 1):
|
||||
chl_col = chl_f
|
||||
else:
|
||||
raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")
|
||||
|
||||
if chr_f.shape == (1024,):
|
||||
chr_col = chr_f.reshape(1024, 1)
|
||||
elif chr_f.shape == (1024, 1):
|
||||
chr_col = chr_f
|
||||
else:
|
||||
raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")
|
||||
|
||||
frame_f = np.empty((1024, 2), dtype=np.float64)
|
||||
frame_f[:, 0] = chl_col[:, 0]
|
||||
frame_f[:, 1] = chr_col[:, 0]
|
||||
|
||||
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
|
||||
|
||||
start = i * hop
|
||||
y_pad[start : start + win, :] += frame_t_hat
|
||||
|
||||
y = aac_remove_padding(y_pad, hop=hop)
|
||||
|
||||
sf.write(str(filename_out), y, 48000)
|
||||
return y
|
||||
454
source/level_2/core/aac_filterbank.py
Normal file
454
source/level_2/core/aac_filterbank.py
Normal file
@ -0,0 +1,454 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Filterbank module
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking
|
||||
#
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from core.aac_types import *
|
||||
|
||||
from scipy.signal.windows import kaiser
|
||||
|
||||
# Private helpers for Filterbank
|
||||
# ------------------------------------------------------------
|
||||
|
||||
def _sin_window(N: int) -> Window:
|
||||
"""
|
||||
Build a sinusoidal (SIN) window of length N.
|
||||
|
||||
The AAC sinusoid window is:
|
||||
w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N
|
||||
|
||||
Parameters
|
||||
----------
|
||||
N : int
|
||||
Window length in samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Window
|
||||
1-D array of shape (N, ) with dtype float64.
|
||||
"""
|
||||
n = np.arange(N, dtype=np.float64)
|
||||
return np.sin((np.pi / N) * (n + 0.5))
|
||||
|
||||
|
||||
def _kbd_window(N: int, alpha: float) -> Window:
|
||||
"""
|
||||
Build a Kaiser-Bessel-Derived (KBD) window of length N.
|
||||
|
||||
This follows the standard KBD construction used in AAC:
|
||||
1) Build a Kaiser kernel of length (N/2 + 1).
|
||||
2) Form the left half by cumulative summation, normalization, and sqrt.
|
||||
3) Mirror the left half to form the right half (symmetric full-length window).
|
||||
|
||||
Notes
|
||||
-----
|
||||
- N must be even (AAC uses N=2048 for long and N=256 for short).
|
||||
- The assignment specifies alpha=6 for long windows and alpha=4 for short windows.
|
||||
- The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
N : int
|
||||
Window length in samples (must be even).
|
||||
alpha : float
|
||||
KBD alpha parameter.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Window
|
||||
1-D array of shape (N,) with dtype float64.
|
||||
"""
|
||||
half = N // 2
|
||||
|
||||
# Kaiser kernel length: half + 1 samples (0 .. half)
|
||||
# beta = pi * alpha per the usual correspondence with the ISO definition
|
||||
kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
|
||||
|
||||
csum = np.cumsum(kernel)
|
||||
denom = csum[-1]
|
||||
|
||||
w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1
|
||||
w_right = w_left[::-1] # mirror for second half
|
||||
|
||||
return np.concatenate([w_left, w_right])
|
||||
|
||||
|
||||
def _long_window(win_type: WinType) -> Window:
|
||||
"""
|
||||
Return the long AAC window (length 2048) for the selected window family.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
win_type : WinType
|
||||
Either "SIN" or "KBD".
|
||||
|
||||
Returns
|
||||
-------
|
||||
Window
|
||||
1-D array of shape (2048,) with dtype float64.
|
||||
"""
|
||||
if win_type == "SIN":
|
||||
return _sin_window(2048)
|
||||
if win_type == "KBD":
|
||||
# Assignment-specific alpha values
|
||||
return _kbd_window(2048, alpha=6.0)
|
||||
raise ValueError(f"Invalid win_type: {win_type!r}")
|
||||
|
||||
|
||||
def _short_window(win_type: WinType) -> Window:
|
||||
"""
|
||||
Return the short AAC window (length 256) for the selected window family.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
win_type : WinType
|
||||
Either "SIN" or "KBD".
|
||||
|
||||
Returns
|
||||
-------
|
||||
Window
|
||||
1-D array of shape (256,) with dtype float64.
|
||||
"""
|
||||
if win_type == "SIN":
|
||||
return _sin_window(256)
|
||||
if win_type == "KBD":
|
||||
# Assignment-specific alpha values
|
||||
return _kbd_window(256, alpha=4.0)
|
||||
raise ValueError(f"Invalid win_type: {win_type!r}")
|
||||
|
||||
|
||||
def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window:
|
||||
"""
|
||||
Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS.
|
||||
|
||||
In this assignment we assume a single window family is used globally
|
||||
(no mixed KBD/SIN halves). Therefore, both the long and short windows
|
||||
are drawn from the same family.
|
||||
|
||||
For frame_type:
|
||||
- "OLS": return the long window Wl (2048).
|
||||
- "LSS": construct [Wl_left(1024), ones(448), Ws_right(128), zeros(448)].
|
||||
- "LPS": construct [zeros(448), Ws_left(128), ones(448), Wl_right(1024)].
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_type : FrameType
|
||||
One of "OLS", "LSS", "LPS".
|
||||
win_type : WinType
|
||||
Either "SIN" or "KBD".
|
||||
|
||||
Returns
|
||||
-------
|
||||
Window
|
||||
1-D array of shape (2048,) with dtype float64.
|
||||
"""
|
||||
wL = _long_window(win_type) # length 2048
|
||||
wS = _short_window(win_type) # length 256
|
||||
|
||||
if frame_type == "OLS":
|
||||
return wL
|
||||
|
||||
if frame_type == "LSS":
|
||||
# 0..1023: left half of long window
|
||||
# 1024..1471: ones (448 samples)
|
||||
# 1472..1599: right half of short window (128 samples)
|
||||
# 1600..2047: zeros (448 samples)
|
||||
out = np.zeros(2048, dtype=np.float64)
|
||||
out[0:1024] = wL[0:1024]
|
||||
out[1024:1472] = 1.0
|
||||
out[1472:1600] = wS[128:256]
|
||||
out[1600:2048] = 0.0
|
||||
return out
|
||||
|
||||
if frame_type == "LPS":
|
||||
# 0..447: zeros (448)
|
||||
# 448..575: left half of short window (128)
|
||||
# 576..1023: ones (448)
|
||||
# 1024..2047: right half of long window (1024)
|
||||
out = np.zeros(2048, dtype=np.float64)
|
||||
out[0:448] = 0.0
|
||||
out[448:576] = wS[0:128]
|
||||
out[576:1024] = 1.0
|
||||
out[1024:2048] = wL[1024:2048]
|
||||
return out
|
||||
|
||||
raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
|
||||
|
||||
|
||||
def _mdct(s: TimeSignal) -> MdctCoeffs:
|
||||
"""
|
||||
MDCT (direct form) as specified in the assignment.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s : TimeSignal
|
||||
Windowed time samples, 1-D array of length N (N = 2048 or 256).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
MDCT coefficients, 1-D array of length N/2.
|
||||
|
||||
Definition
|
||||
----------
|
||||
X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
|
||||
where n0 = (N/2 + 1)/2.
|
||||
"""
|
||||
s = np.asarray(s, dtype=np.float64).reshape(-1)
|
||||
N = int(s.shape[0])
|
||||
if N not in (2048, 256):
|
||||
raise ValueError("MDCT input length must be 2048 or 256.")
|
||||
|
||||
n0 = (N / 2.0 + 1.0) / 2.0
|
||||
n = np.arange(N, dtype=np.float64) + n0
|
||||
k = np.arange(N // 2, dtype=np.float64) + 0.5
|
||||
|
||||
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2)
|
||||
X = 2.0 * (s @ C) # (N/2,)
|
||||
return X
|
||||
|
||||
|
||||
def _imdct(X: MdctCoeffs) -> TimeSignal:
|
||||
"""
|
||||
IMDCT (direct form) as specified in the assignment.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : MdctCoeffs
|
||||
MDCT coefficients, 1-D array of length K (K = 1024 or 128).
|
||||
|
||||
Returns
|
||||
-------
|
||||
TimeSignal
|
||||
Reconstructed time samples, 1-D array of length N = 2K.
|
||||
|
||||
Definition
|
||||
----------
|
||||
s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
|
||||
where n0 = (N/2 + 1)/2.
|
||||
"""
|
||||
X = np.asarray(X, dtype=np.float64).reshape(-1)
|
||||
K = int(X.shape[0])
|
||||
if K not in (1024, 128):
|
||||
raise ValueError("IMDCT input length must be 1024 or 128.")
|
||||
|
||||
N = 2 * K
|
||||
n0 = (N / 2.0 + 1.0) / 2.0
|
||||
|
||||
n = np.arange(N, dtype=np.float64) + n0
|
||||
k = np.arange(K, dtype=np.float64) + 0.5
|
||||
|
||||
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K)
|
||||
s = (2.0 / N) * (C @ X) # (N,)
|
||||
return s
|
||||
|
||||
|
||||
def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF:
|
||||
"""
|
||||
ESH analysis for one channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x_ch : FrameChannelT
|
||||
Time-domain channel frame (expected shape: (2048,)).
|
||||
win_type : WinType
|
||||
Window family ("KBD" or "SIN").
|
||||
|
||||
Returns
|
||||
-------
|
||||
FrameChannelF
|
||||
Array of shape (128, 8). Column j contains the 128 MDCT coefficients
|
||||
of the j-th short window.
|
||||
"""
|
||||
wS = _short_window(win_type) # (256,)
|
||||
X_esh = np.empty((128, 8), dtype=np.float64)
|
||||
|
||||
# ESH subwindows are taken from the central region:
|
||||
# start positions: 448 + 128*j, j = 0..7
|
||||
for j in range(8):
|
||||
start = 448 + 128 * j
|
||||
seg = x_ch[start:start + 256] * wS # (256,)
|
||||
X_esh[:, j] = _mdct(seg) # (128,)
|
||||
|
||||
return X_esh
|
||||
|
||||
|
||||
def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
|
||||
"""
|
||||
Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_F : FrameF
|
||||
Packed ESH spectrum (expected shape: (128, 16)).
|
||||
|
||||
Returns
|
||||
-------
|
||||
left : FrameChannelF
|
||||
Left channel spectrum, shape (128, 8).
|
||||
right : FrameChannelF
|
||||
Right channel spectrum, shape (128, 8).
|
||||
|
||||
Notes
|
||||
-----
|
||||
Inverse mapping of the packing used in aac_filter_bank():
|
||||
packed[:, 2*j] = left[:, j]
|
||||
packed[:, 2*j+1] = right[:, j]
|
||||
"""
|
||||
if frame_F.shape != (128, 16):
|
||||
raise ValueError("ESH frame_F must have shape (128, 16).")
|
||||
|
||||
left = np.empty((128, 8), dtype=np.float64)
|
||||
right = np.empty((128, 8), dtype=np.float64)
|
||||
for j in range(8):
|
||||
left[:, j] = frame_F[:, 2 * j + 0]
|
||||
right[:, j] = frame_F[:, 2 * j + 1]
|
||||
return left, right
|
||||
|
||||
|
||||
def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT:
|
||||
"""
|
||||
ESH synthesis for one channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X_esh : FrameChannelF
|
||||
MDCT coefficients for 8 short windows (expected shape: (128, 8)).
|
||||
win_type : WinType
|
||||
Window family ("KBD" or "SIN").
|
||||
|
||||
Returns
|
||||
-------
|
||||
FrameChannelT
|
||||
Time-domain channel contribution, shape (2048,).
|
||||
This is already overlap-added internally for the 8 short blocks and
|
||||
ready for OLA at the caller level.
|
||||
"""
|
||||
if X_esh.shape != (128, 8):
|
||||
raise ValueError("X_esh must have shape (128, 8).")
|
||||
|
||||
wS = _short_window(win_type) # (256,)
|
||||
out = np.zeros(2048, dtype=np.float64)
|
||||
|
||||
# Each short IMDCT returns 256 samples. Place them at:
|
||||
# start = 448 + 128*j, j=0..7 (50% overlap)
|
||||
for j in range(8):
|
||||
seg = _imdct(X_esh[:, j]) * wS # (256,)
|
||||
start = 448 + 128 * j
|
||||
out[start:start + 256] += seg
|
||||
|
||||
return out
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Public Function prototypes (Level 1)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
|
||||
"""
|
||||
Filterbank stage (MDCT analysis).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_T : FrameT
|
||||
Time-domain frame, stereo, shape (2048, 2).
|
||||
frame_type : FrameType
|
||||
Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS").
|
||||
win_type : WinType
|
||||
Window type ("KBD" or "SIN") used for the current frame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
frame_F : FrameF
|
||||
Frequency-domain MDCT coefficients:
|
||||
- If frame_type in {"OLS","LSS","LPS"}: array shape (1024, 2)
|
||||
containing MDCT coefficients for both channels.
|
||||
- If frame_type == "ESH": contains 8 subframes, each subframe has shape (128,2),
|
||||
placed in columns according to subframe order, i.e. overall shape (128, 16).
|
||||
"""
|
||||
if frame_T.shape != (2048, 2):
|
||||
raise ValueError("frame_T must have shape (2048, 2).")
|
||||
|
||||
xL :FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
|
||||
xR :FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)
|
||||
|
||||
if frame_type in ("OLS", "LSS", "LPS"):
|
||||
w = _window_sequence(frame_type, win_type) # length 2048
|
||||
XL = _mdct(xL * w) # length 1024
|
||||
XR = _mdct(xR * w) # length 1024
|
||||
out = np.empty((1024, 2), dtype=np.float64)
|
||||
out[:, 0] = XL
|
||||
out[:, 1] = XR
|
||||
return out
|
||||
|
||||
if frame_type == "ESH":
|
||||
Xl = _filter_bank_esh_channel(xL, win_type) # (128, 8)
|
||||
Xr = _filter_bank_esh_channel(xR, win_type) # (128, 8)
|
||||
|
||||
# Pack into (128, 16): each subframe as (128,2) placed in columns
|
||||
out = np.empty((128, 16), dtype=np.float64)
|
||||
for j in range(8):
|
||||
out[:, 2 * j + 0] = Xl[:, j]
|
||||
out[:, 2 * j + 1] = Xr[:, j]
|
||||
return out
|
||||
|
||||
raise ValueError(f"Invalid frame_type: {frame_type!r}")
|
||||
|
||||
|
||||
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
|
||||
"""
|
||||
Inverse filterbank (IMDCT synthesis).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_F : FrameF
|
||||
Frequency-domain MDCT coefficients as produced by filter_bank().
|
||||
frame_type : FrameType
|
||||
Frame type ("OLS"|"LSS"|"ESH"|"LPS").
|
||||
win_type : WinType
|
||||
Window type ("KBD" or "SIN").
|
||||
|
||||
Returns
|
||||
-------
|
||||
frame_T : FrameT
|
||||
Reconstructed time-domain frame, stereo, shape (2048, 2).
|
||||
"""
|
||||
if frame_type in ("OLS", "LSS", "LPS"):
|
||||
if frame_F.shape != (1024, 2):
|
||||
raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")
|
||||
|
||||
w = _window_sequence(frame_type, win_type)
|
||||
|
||||
xL = _imdct(frame_F[:, 0]) * w
|
||||
xR = _imdct(frame_F[:, 1]) * w
|
||||
|
||||
out = np.empty((2048, 2), dtype=np.float64)
|
||||
out[:, 0] = xL
|
||||
out[:, 1] = xR
|
||||
return out
|
||||
|
||||
if frame_type == "ESH":
|
||||
if frame_F.shape != (128, 16):
|
||||
raise ValueError("For ESH, frame_F must have shape (128, 16).")
|
||||
|
||||
Xl, Xr = _unpack_esh(frame_F)
|
||||
xL = _i_filter_bank_esh_channel(Xl, win_type)
|
||||
xR = _i_filter_bank_esh_channel(Xr, win_type)
|
||||
|
||||
out = np.empty((2048, 2), dtype=np.float64)
|
||||
out[:, 0] = xL
|
||||
out[:, 1] = xR
|
||||
return out
|
||||
|
||||
raise ValueError(f"Invalid frame_type: {frame_type!r}")
|
||||
60
source/level_2/core/aac_snr_db.py
Normal file
60
source/level_2/core/aac_snr_db.py
Normal file
@ -0,0 +1,60 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - SNR dB calculator
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# This module implements SNR calculation in dB
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from core.aac_types import StereoSignal
|
||||
import numpy as np
|
||||
|
||||
def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
|
||||
"""
|
||||
Compute overall SNR (dB) over all samples and channels after aligning lengths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x_ref : StereoSignal
|
||||
Reference stereo stream.
|
||||
x_hat : StereoSignal
|
||||
Reconstructed stereo stream.
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
SNR in dB.
|
||||
- Returns +inf if noise power is zero.
|
||||
- Returns -inf if signal power is zero.
|
||||
"""
|
||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||
x_hat = np.asarray(x_hat, dtype=np.float64)
|
||||
|
||||
if x_ref.ndim == 1:
|
||||
x_ref = x_ref.reshape(-1, 1)
|
||||
if x_hat.ndim == 1:
|
||||
x_hat = x_hat.reshape(-1, 1)
|
||||
|
||||
n = min(x_ref.shape[0], x_hat.shape[0])
|
||||
c = min(x_ref.shape[1], x_hat.shape[1])
|
||||
|
||||
x_ref = x_ref[:n, :c]
|
||||
x_hat = x_hat[:n, :c]
|
||||
|
||||
err = x_ref - x_hat
|
||||
ps = float(np.sum(x_ref * x_ref))
|
||||
pn = float(np.sum(err * err))
|
||||
|
||||
if pn <= 0.0:
|
||||
return float("inf")
|
||||
if ps <= 0.0:
|
||||
return float("-inf")
|
||||
|
||||
return float(10.0 * np.log10(ps / pn))
|
||||
217
source/level_2/core/aac_ssc.py
Normal file
217
source/level_2/core/aac_ssc.py
Normal file
@ -0,0 +1,217 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Sequence Segmentation Control module
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Sequence Segmentation Control module (SSC).
|
||||
# Selects and returns the frame type based on input parameters.
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Dict, Tuple
|
||||
from core.aac_types import FrameType, FrameT, FrameChannelT
|
||||
|
||||
import numpy as np
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Private helpers for SSC
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# See Table 1 in mm-2025-hw-v0.1.pdf
|
||||
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
|
||||
("OLS", "OLS"): "OLS",
|
||||
("OLS", "LSS"): "LSS",
|
||||
("OLS", "ESH"): "ESH",
|
||||
("OLS", "LPS"): "LPS",
|
||||
("LSS", "OLS"): "LSS",
|
||||
("LSS", "LSS"): "LSS",
|
||||
("LSS", "ESH"): "ESH",
|
||||
("LSS", "LPS"): "ESH",
|
||||
("ESH", "OLS"): "ESH",
|
||||
("ESH", "LSS"): "ESH",
|
||||
("ESH", "ESH"): "ESH",
|
||||
("ESH", "LPS"): "ESH",
|
||||
("LPS", "OLS"): "LPS",
|
||||
("LPS", "LSS"): "ESH",
|
||||
("LPS", "ESH"): "ESH",
|
||||
("LPS", "LPS"): "LPS",
|
||||
}
|
||||
|
||||
|
||||
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
|
||||
"""
|
||||
Detect whether the *next* frame (single channel) implies an attack, i.e. ESH
|
||||
according to the assignment's criterion.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
next_frame_channel : FrameChannelT
|
||||
One channel of next_frame_T (expected shape: (2048,)).
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
True if an attack is detected (=> next frame predicted ESH), else False.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The criterion is implemented as described in the spec:
|
||||
|
||||
1) Apply the high-pass filter:
|
||||
H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
|
||||
implemented in the time domain as:
|
||||
y[n] = x[n] - x[n-1] + 0.5*y[n-1]
|
||||
|
||||
2) Split y into 16 segments of length 128 and compute segment energies s[l].
|
||||
|
||||
3) Compute the ratio:
|
||||
ds[l] = s[l] / s[l-1]
|
||||
|
||||
4) An attack exists if there exists l in {1..7} such that:
|
||||
s[l] > 1e-3 and ds[l] > 10
|
||||
"""
|
||||
# Local alias; expected to be a 1-D array of length 2048.
|
||||
x = next_frame_channel
|
||||
|
||||
# High-pass filter reference implementation (scalar recurrence).
|
||||
y = np.zeros_like(x)
|
||||
prev_x = 0.0
|
||||
prev_y = 0.0
|
||||
for n in range(x.shape[0]):
|
||||
xn = float(x[n])
|
||||
yn = (xn - prev_x) + 0.5 * prev_y
|
||||
y[n] = yn
|
||||
prev_x = xn
|
||||
prev_y = yn
|
||||
|
||||
# Segment energies over 16 blocks of 128 samples.
|
||||
s = np.empty(16, dtype=np.float64)
|
||||
for l in range(16):
|
||||
a = l * 128
|
||||
b = (l + 1) * 128
|
||||
seg = y[a:b]
|
||||
s[l] = float(np.sum(seg * seg))
|
||||
|
||||
# ds[l] for l>=1. For l=0 not defined, keep 0.
|
||||
ds = np.zeros(16, dtype=np.float64)
|
||||
eps = 1e-12 # Avoid division by zero without materially changing the logic.
|
||||
for l in range(1, 16):
|
||||
ds[l] = s[l] / max(s[l - 1], eps)
|
||||
|
||||
# Spec: check l in {1..7}.
|
||||
for l in range(1, 8):
|
||||
if (s[l] > 1e-3) and (ds[l] > 10.0):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
|
||||
"""
|
||||
Decide the current frame type for a single channel based on the previous
|
||||
frame type and whether the next frame is predicted to be ESH.
|
||||
|
||||
Rules (spec):
|
||||
|
||||
- If prev is "LSS" => current is "ESH"
|
||||
- If prev is "LPS" => current is "OLS"
|
||||
- If prev is "OLS" => current is "LSS" if attack else "OLS"
|
||||
- If prev is "ESH" => current is "ESH" if attack else "LPS"
|
||||
|
||||
Parameters
|
||||
----------
|
||||
prev_frame_type : FrameType
|
||||
Previous frame type (one of "OLS", "LSS", "ESH", "LPS").
|
||||
attack : bool
|
||||
True if the next frame is predicted ESH for this channel.
|
||||
|
||||
Returns
|
||||
-------
|
||||
FrameType
|
||||
The per-channel decision for the current frame.
|
||||
|
||||
"""
|
||||
if prev_frame_type == "LSS":
|
||||
return "ESH"
|
||||
if prev_frame_type == "LPS":
|
||||
return "OLS"
|
||||
if prev_frame_type == "OLS":
|
||||
return "LSS" if attack else "OLS"
|
||||
if prev_frame_type == "ESH":
|
||||
return "ESH" if attack else "LPS"
|
||||
|
||||
raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
|
||||
|
||||
|
||||
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
|
||||
"""
|
||||
Merge per-channel frame type decisions into one common frame type using
|
||||
the stereo merge table from the spec.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ft_l : FrameType
|
||||
Frame type decision for the left channel.
|
||||
ft_r : FrameType
|
||||
Frame type decision for the right channel.
|
||||
|
||||
Returns
|
||||
-------
|
||||
FrameType
|
||||
The merged common frame type.
|
||||
"""
|
||||
try:
|
||||
return STEREO_MERGE_TABLE[(ft_l, ft_r)]
|
||||
except KeyError as e:
|
||||
raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}") from e
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Public Function prototypes (Level 1)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
|
||||
"""
|
||||
Sequence Segmentation Control (SSC).
|
||||
|
||||
Select and return the frame type for the current frame (i) based on:
|
||||
- the current time-domain frame (stereo),
|
||||
- the next time-domain frame (stereo), used for attack detection,
|
||||
- the previous frame type.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_T : FrameT
|
||||
Current time-domain frame i (expected shape: (2048, 2)).
|
||||
next_frame_T : FrameT
|
||||
Next time-domain frame (i+1), used to decide transitions to/from ESH
|
||||
(expected shape: (2048, 2)).
|
||||
prev_frame_type : FrameType
|
||||
Frame type chosen for the previous frame (i-1).
|
||||
|
||||
Returns
|
||||
-------
|
||||
FrameType
|
||||
One of: "OLS", "LSS", "ESH", "LPS".
|
||||
"""
|
||||
if frame_T.shape != (2048, 2):
|
||||
raise ValueError("frame_T must have shape (2048, 2).")
|
||||
if next_frame_T.shape != (2048, 2):
|
||||
raise ValueError("next_frame_T must have shape (2048, 2).")
|
||||
|
||||
# Detect attack independently per channel on the next frame.
|
||||
attack_l = _detect_attack(next_frame_T[:, 0])
|
||||
attack_r = _detect_attack(next_frame_T[:, 1])
|
||||
|
||||
# Decide per-channel type based on shared prev_frame_type.
|
||||
ft_l = _decide_frame_type(prev_frame_type, attack_l)
|
||||
ft_r = _decide_frame_type(prev_frame_type, attack_r)
|
||||
|
||||
# Stereo merge as per the spec table.
|
||||
return _stereo_merge(ft_l, ft_r)
|
||||
549
source/level_2/core/aac_tns.py
Normal file
549
source/level_2/core/aac_tns.py
Normal file
@ -0,0 +1,549 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Temporal Noise Shaping (TNS)
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# Temporal Noise Shaping (TNS) module (Level 2).
|
||||
#
|
||||
# Public API:
|
||||
# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||
# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs)
|
||||
#
|
||||
# Notes (per assignment):
|
||||
# - TNS is applied per channel (not stereo).
|
||||
# - For ESH, TNS is applied independently to each of the 8 short subframes.
|
||||
# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short)
|
||||
# provided in TableB219.mat.
|
||||
# - Predictor order is fixed to p = 4.
|
||||
# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1.
|
||||
# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||
# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients.
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
from scipy.io import loadmat
|
||||
|
||||
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
|
||||
from core.aac_types import *
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
_B219_CACHE: dict[str, FloatArray] | None = None
|
||||
|
||||
|
||||
def _load_b219_tables() -> dict[str, FloatArray]:
|
||||
"""
|
||||
Load TableB219.mat and cache the contents.
|
||||
|
||||
The project layout guarantees that a 'material' directory is discoverable
|
||||
from the current working directory (tests and level_123 entrypoints).
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, FloatArray]
|
||||
Keys:
|
||||
- "B219a": long bands table (for K=1024 MDCT lines)
|
||||
- "B219b": short bands table (for K=128 MDCT lines)
|
||||
"""
|
||||
global _B219_CACHE
|
||||
if _B219_CACHE is not None:
|
||||
return _B219_CACHE
|
||||
|
||||
mat_path = Path("material") / "TableB219.mat"
|
||||
if not mat_path.exists():
|
||||
raise FileNotFoundError("Could not locate material/TableB219.mat in the current working directory.")
|
||||
|
||||
d = loadmat(str(mat_path))
|
||||
if "B219a" not in d or "B219b" not in d:
|
||||
raise ValueError("TableB219.mat missing required variables B219a and/or B219b.")
|
||||
|
||||
_B219_CACHE = {
|
||||
"B219a": np.asarray(d["B219a"], dtype=np.float64),
|
||||
"B219b": np.asarray(d["B219b"], dtype=np.float64),
|
||||
}
|
||||
return _B219_CACHE
|
||||
|
||||
|
||||
def _band_ranges_for_kcount(k_count: int) -> BandRanges:
|
||||
"""
|
||||
Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
k_count : int
|
||||
Number of MDCT lines:
|
||||
- 1024 for long frames
|
||||
- 128 for short subframes (ESH)
|
||||
|
||||
Returns
|
||||
-------
|
||||
BandRanges (list[tuple[int, int]])
|
||||
Each tuple is (start_k, end_k) inclusive.
|
||||
"""
|
||||
tables = _load_b219_tables()
|
||||
if k_count == 1024:
|
||||
tbl = tables["B219a"]
|
||||
elif k_count == 128:
|
||||
tbl = tables["B219b"]
|
||||
else:
|
||||
raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).")
|
||||
|
||||
start = tbl[:, 1].astype(int)
|
||||
end = tbl[:, 2].astype(int)
|
||||
|
||||
ranges: list[tuple[int, int]] = [(int(s), int(e)) for s, e in zip(start, end)]
|
||||
|
||||
for s, e in ranges:
|
||||
if s < 0 or e < s or e >= k_count:
|
||||
raise ValueError("Invalid band table ranges for given k_count.")
|
||||
return ranges
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Core DSP helpers
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def _smooth_sw_inplace(sw: MdctCoeffs) -> None:
|
||||
"""
|
||||
Smooth Sw(k) to reduce discontinuities between adjacent Bark bands.
|
||||
|
||||
The assignment applies two passes:
|
||||
- Backward: Sw(k) = (Sw(k) + Sw(k+1))/2
|
||||
- Forward: Sw(k) = (Sw(k) + Sw(k-1))/2
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sw : MdctCoeffs
|
||||
1-D array of length K (float64). Modified in-place.
|
||||
"""
|
||||
k_count = int(sw.shape[0])
|
||||
|
||||
for k in range(k_count - 2, -1, -1):
|
||||
sw[k] = 0.5 * (sw[k] + sw[k + 1])
|
||||
|
||||
for k in range(1, k_count):
|
||||
sw[k] = 0.5 * (sw[k] + sw[k - 1])
|
||||
|
||||
|
||||
def _compute_sw(x: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Compute Sw(k) from band energies P(j) and apply boundary smoothing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : MdctCoeffs
|
||||
1-D MDCT line array, length K.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Sw(k), 1-D array of length K, float64.
|
||||
"""
|
||||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||
k_count = int(x.shape[0])
|
||||
|
||||
bands = _band_ranges_for_kcount(k_count)
|
||||
sw = np.zeros(k_count, dtype=np.float64)
|
||||
|
||||
for s, e in bands:
|
||||
seg = x[s : e + 1]
|
||||
p_j = float(np.sum(seg * seg))
|
||||
sw_val = float(np.sqrt(p_j))
|
||||
sw[s : e + 1] = sw_val
|
||||
|
||||
_smooth_sw_inplace(sw)
|
||||
return sw
|
||||
|
||||
|
||||
def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs:
|
||||
"""
|
||||
Autocorrelation r(m) for m=0..p.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : MdctCoeffs
|
||||
1-D signal.
|
||||
p : int
|
||||
Maximum lag.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
r, shape (p+1,), float64.
|
||||
"""
|
||||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||
n = int(x.shape[0])
|
||||
|
||||
r = np.zeros(p + 1, dtype=np.float64)
|
||||
for m in range(p + 1):
|
||||
r[m] = float(np.dot(x[m:], x[: n - m]))
|
||||
return r
|
||||
|
||||
|
||||
def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs:
|
||||
"""
|
||||
Solve Yule-Walker normal equations for LPC coefficients of order p.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
xw : MdctCoeffs
|
||||
1-D normalized sequence Xw(k).
|
||||
p : int
|
||||
Predictor order.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
LPC coefficients a[0..p-1], shape (p,), float64.
|
||||
"""
|
||||
r = _autocorr(xw, p)
|
||||
|
||||
R = np.empty((p, p), dtype=np.float64)
|
||||
for i in range(p):
|
||||
for j in range(p):
|
||||
R[i, j] = r[abs(i - j)]
|
||||
|
||||
rhs = r[1 : p + 1].reshape(p)
|
||||
|
||||
reg = 1e-12
|
||||
R_reg = R + reg * np.eye(p, dtype=np.float64)
|
||||
|
||||
a = np.linalg.solve(R_reg, rhs)
|
||||
return a
|
||||
|
||||
|
||||
def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Quantize LPC coefficients with uniform symmetric quantizer and clamp.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a : MdctCoeffs
|
||||
LPC coefficient array, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Quantized coefficients, shape (p,), float64.
|
||||
"""
|
||||
a = np.asarray(a, dtype=np.float64).reshape(-1)
|
||||
q = np.round(a / QUANT_STEP) * QUANT_STEP
|
||||
q = np.clip(q, -QUANT_MAX, QUANT_MAX)
|
||||
return q.astype(np.float64, copy=False)
|
||||
|
||||
|
||||
def _is_inverse_stable(a_q: MdctCoeffs) -> bool:
|
||||
"""
|
||||
Check stability of the inverse TNS filter H_TNS^{-1}.
|
||||
|
||||
Forward filter:
|
||||
H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||
|
||||
Inverse filter poles are roots of:
|
||||
A(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||
Multiply by z^p:
|
||||
z^p - a1 z^{p-1} - ... - ap = 0
|
||||
|
||||
Stability condition:
|
||||
all roots satisfy |z| < 1.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a_q : MdctCoeffs
|
||||
Quantized predictor coefficients, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
True if stable, else False.
|
||||
"""
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
p = int(a_q.shape[0])
|
||||
|
||||
# Polynomial in z: z^p - a1 z^{p-1} - ... - ap
|
||||
poly = np.empty(p + 1, dtype=np.float64)
|
||||
poly[0] = 1.0
|
||||
poly[1:] = -a_q
|
||||
|
||||
roots = np.roots(poly)
|
||||
|
||||
# Strictly inside unit circle for stability. Add tiny margin for numeric safety.
|
||||
margin = 1e-12
|
||||
return bool(np.all(np.abs(roots) < (1.0 - margin)))
|
||||
|
||||
|
||||
def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Make quantized predictor coefficients stable for inverse filtering.
|
||||
|
||||
Policy:
|
||||
- If already stable: return as-is.
|
||||
- Else: iteratively shrink coefficients by gamma and re-quantize to the 0.1 grid.
|
||||
- If still unstable after attempts: fall back to all-zero coefficients (disable TNS).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a_q : MdctCoeffs
|
||||
Quantized predictor coefficients, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Stable quantized coefficients, shape (p,).
|
||||
"""
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
|
||||
if _is_inverse_stable(a_q):
|
||||
return a_q
|
||||
|
||||
# Try a few shrinking factors. Re-quantize after shrinking to keep coefficients on-grid.
|
||||
gammas = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)
|
||||
|
||||
for g in gammas:
|
||||
cand = _quantize_coeffs(g * a_q)
|
||||
if _is_inverse_stable(cand):
|
||||
return cand
|
||||
|
||||
# Last resort: disable TNS for this vector
|
||||
return np.zeros_like(a_q, dtype=np.float64)
|
||||
|
||||
|
||||
def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Apply forward TNS FIR filter:
|
||||
y[k] = x[k] - sum_{l=1..p} a_l * x[k-l]
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : MdctCoeffs
|
||||
1-D MDCT lines, length K.
|
||||
a_q : MdctCoeffs
|
||||
Quantized LPC coefficients, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Filtered MDCT lines y, length K.
|
||||
"""
|
||||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
p = int(a_q.shape[0])
|
||||
k_count = int(x.shape[0])
|
||||
|
||||
y = np.zeros(k_count, dtype=np.float64)
|
||||
for k in range(k_count):
|
||||
acc = x[k]
|
||||
for l in range(1, p + 1):
|
||||
if k - l >= 0:
|
||||
acc -= a_q[l - 1] * x[k - l]
|
||||
y[k] = acc
|
||||
return y
|
||||
|
||||
|
||||
def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
|
||||
"""
|
||||
Apply inverse TNS IIR filter:
|
||||
x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l]
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y : MdctCoeffs
|
||||
1-D MDCT lines after TNS, length K.
|
||||
a_q : MdctCoeffs
|
||||
Quantized LPC coefficients, shape (p,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
MdctCoeffs
|
||||
Reconstructed MDCT lines x_hat, length K.
|
||||
"""
|
||||
y = np.asarray(y, dtype=np.float64).reshape(-1)
|
||||
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||
p = int(a_q.shape[0])
|
||||
k_count = int(y.shape[0])
|
||||
|
||||
x_hat = np.zeros(k_count, dtype=np.float64)
|
||||
for k in range(k_count):
|
||||
acc = y[k]
|
||||
for l in range(1, p + 1):
|
||||
if k - l >= 0:
|
||||
acc += a_q[l - 1] * x_hat[k - l]
|
||||
x_hat[k] = acc
|
||||
return x_hat
|
||||
|
||||
|
||||
def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]:
|
||||
"""
|
||||
TNS for a single MDCT vector (one long frame or one short subframe).
|
||||
|
||||
Steps:
|
||||
1) Compute Sw(k) from Bark band energies and smooth it.
|
||||
2) Normalize: Xw(k) = X(k) / Sw(k) (safe when Sw=0).
|
||||
3) Compute LPC coefficients (order p=PRED_ORDER) on Xw.
|
||||
4) Quantize coefficients (4-bit symmetric, step QUANT_STEP).
|
||||
5) Apply FIR filter on original X(k) using quantized coefficients.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : MdctCoeffs
|
||||
1-D MDCT vector.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : MdctCoeffs
|
||||
TNS-processed MDCT vector (same length).
|
||||
a_q : MdctCoeffs
|
||||
Quantized LPC coefficients, shape (PRED_ORDER,).
|
||||
"""
|
||||
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||
sw = _compute_sw(x)
|
||||
|
||||
eps = 1e-12
|
||||
xw = np.where(sw > eps, x / sw, 0.0)
|
||||
|
||||
a = _lpc_coeffs(xw, PRED_ORDER)
|
||||
a_q = _quantize_coeffs(a)
|
||||
|
||||
# Ensure inverse stability (assignment requirement)
|
||||
a_q = _stabilize_quantized_coeffs(a_q)
|
||||
|
||||
y = _apply_tns_fir(x, a_q)
|
||||
return y, a_q
|
||||
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Public Functions (Level 2)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]:
|
||||
"""
|
||||
Temporal Noise Shaping (TNS) for ONE channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_F_in : FrameChannelF
|
||||
Per-channel MDCT coefficients.
|
||||
Expected (typical) shapes:
|
||||
- If frame_type == "ESH": (128, 8)
|
||||
- Else: (1024, 1) or (1024,)
|
||||
|
||||
frame_type : FrameType
|
||||
Frame type code ("OLS", "LSS", "ESH", "LPS").
|
||||
|
||||
Returns
|
||||
-------
|
||||
frame_F_out : FrameChannelF
|
||||
Per-channel MDCT coefficients after applying TNS.
|
||||
Same shape convention as input.
|
||||
|
||||
tns_coeffs : TnsCoeffs
|
||||
Quantized TNS predictor coefficients.
|
||||
Expected shapes:
|
||||
- If frame_type == "ESH": (PRED_ORDER, 8)
|
||||
- Else: (PRED_ORDER, 1)
|
||||
"""
|
||||
x = np.asarray(frame_F_in, dtype=np.float64)
|
||||
|
||||
if frame_type == "ESH":
|
||||
if x.shape != (128, 8):
|
||||
raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
|
||||
|
||||
y = np.empty_like(x, dtype=np.float64)
|
||||
a_out = np.empty((PRED_ORDER, 8), dtype=np.float64)
|
||||
|
||||
for j in range(8):
|
||||
y[:, j], a_out[:, j] = _tns_one_vector(x[:, j])
|
||||
|
||||
return y, a_out
|
||||
|
||||
if x.shape == (1024,):
|
||||
x_vec = x
|
||||
out_shape = (1024,)
|
||||
elif x.shape == (1024, 1):
|
||||
x_vec = x[:, 0]
|
||||
out_shape = (1024, 1)
|
||||
else:
|
||||
raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
|
||||
|
||||
y_vec, a_q = _tns_one_vector(x_vec)
|
||||
|
||||
if out_shape == (1024,):
|
||||
y_out = y_vec
|
||||
else:
|
||||
y_out = y_vec.reshape(1024, 1)
|
||||
|
||||
a_out = a_q.reshape(PRED_ORDER, 1)
|
||||
return y_out, a_out
|
||||
|
||||
|
||||
def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF:
|
||||
"""
|
||||
Inverse Temporal Noise Shaping (iTNS) for ONE channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame_F_in : FrameChannelF
|
||||
Per-channel MDCT coefficients after TNS.
|
||||
Expected (typical) shapes:
|
||||
- If frame_type == "ESH": (128, 8)
|
||||
- Else: (1024, 1) or (1024,)
|
||||
|
||||
frame_type : FrameType
|
||||
Frame type code ("OLS", "LSS", "ESH", "LPS").
|
||||
|
||||
tns_coeffs : TnsCoeffs
|
||||
Quantized TNS predictor coefficients.
|
||||
Expected shapes:
|
||||
- If frame_type == "ESH": (PRED_ORDER, 8)
|
||||
- Else: (PRED_ORDER, 1)
|
||||
|
||||
Returns
|
||||
-------
|
||||
FrameChannelF
|
||||
Per-channel MDCT coefficients after inverse TNS.
|
||||
Same shape convention as input frame_F_in.
|
||||
"""
|
||||
x = np.asarray(frame_F_in, dtype=np.float64)
|
||||
a = np.asarray(tns_coeffs, dtype=np.float64)
|
||||
|
||||
if frame_type == "ESH":
|
||||
if x.shape != (128, 8):
|
||||
raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
|
||||
if a.shape != (PRED_ORDER, 8):
|
||||
raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).")
|
||||
|
||||
y = np.empty_like(x, dtype=np.float64)
|
||||
for j in range(8):
|
||||
y[:, j] = _apply_itns_iir(x[:, j], a[:, j])
|
||||
return y
|
||||
|
||||
if a.shape != (PRED_ORDER, 1):
|
||||
raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).")
|
||||
|
||||
if x.shape == (1024,):
|
||||
x_vec = x
|
||||
out_shape = (1024,)
|
||||
elif x.shape == (1024, 1):
|
||||
x_vec = x[:, 0]
|
||||
out_shape = (1024, 1)
|
||||
else:
|
||||
raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
|
||||
|
||||
y_vec = _apply_itns_iir(x_vec, a[:, 0])
|
||||
|
||||
if out_shape == (1024,):
|
||||
return y_vec
|
||||
return y_vec.reshape(1024, 1)
|
||||
282
source/level_2/core/aac_types.py
Normal file
282
source/level_2/core/aac_types.py
Normal file
@ -0,0 +1,282 @@
|
||||
# ------------------------------------------------------------
|
||||
# AAC Coder/Decoder - Public Type Aliases
|
||||
#
|
||||
# Multimedia course at Aristotle University of
|
||||
# Thessaloniki (AUTh)
|
||||
#
|
||||
# Author:
|
||||
# Christos Choutouridis (ΑΕΜ 8997)
|
||||
# cchoutou@ece.auth.gr
|
||||
#
|
||||
# Description:
|
||||
# This module implements Public Type aliases
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List, Literal, TypeAlias, TypedDict
|
||||
import numpy as np
|
||||
from numpy.typing import NDArray
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Code enums (for readability; not intended to enforce shapes/lengths)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"]
|
||||
"""
|
||||
Frame type codes (AAC):
|
||||
- "OLS": ONLY_LONG_SEQUENCE
|
||||
- "LSS": LONG_START_SEQUENCE
|
||||
- "ESH": EIGHT_SHORT_SEQUENCE
|
||||
- "LPS": LONG_STOP_SEQUENCE
|
||||
"""
|
||||
|
||||
WinType: TypeAlias = Literal["KBD", "SIN"]
|
||||
"""
|
||||
Window type codes (AAC):
|
||||
- "KBD": Kaiser-Bessel-Derived
|
||||
- "SIN": sinusoid
|
||||
"""
|
||||
|
||||
ChannelKey: TypeAlias = Literal["chl", "chr"]
|
||||
"""Channel dictionary keys used in Level payloads."""
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Array “semantic” aliases
|
||||
#
|
||||
# Goal: communicate meaning (time/frequency/window, stereo/channel) without
|
||||
# forcing strict shapes in the type system.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
FloatArray: TypeAlias = NDArray[np.float64]
|
||||
"""
|
||||
Generic float64 NumPy array.
|
||||
|
||||
Note:
|
||||
- We standardize internal numeric computations to float64 for stability and
|
||||
reproducibility. External I/O can still be float32, but we convert at the
|
||||
boundaries.
|
||||
"""
|
||||
|
||||
Window: TypeAlias = FloatArray
|
||||
"""
|
||||
Time-domain window (weighting sequence), 1-D.
|
||||
|
||||
Typical lengths in this assignment:
|
||||
- Long: 2048
|
||||
- Short: 256
|
||||
- Window sequences for LSS/LPS are also 2048
|
||||
|
||||
Expected shape: (N,)
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
TimeSignal: TypeAlias = FloatArray
|
||||
"""
|
||||
Time-domain signal samples, typically 1-D.
|
||||
|
||||
Examples:
|
||||
- Windowed MDCT input: shape (N,)
|
||||
- IMDCT output: shape (N,)
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
StereoSignal: TypeAlias = FloatArray
|
||||
"""
|
||||
Time-domain stereo signal stream.
|
||||
|
||||
Expected (typical) shape: (N, 2)
|
||||
- axis 0: time samples
|
||||
- axis 1: channels [L, R]
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
MdctCoeffs: TypeAlias = FloatArray
|
||||
"""
|
||||
MDCT coefficient vector, typically 1-D.
|
||||
|
||||
Examples:
|
||||
- Long: shape (1024,)
|
||||
- Short: shape (128,)
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
MdctFrameChannel: TypeAlias = FloatArray
|
||||
"""
|
||||
Per-channel MDCT container used in Level-1/2 sequences.
|
||||
|
||||
Typical shapes:
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
|
||||
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
|
||||
|
||||
dtype: float64
|
||||
|
||||
Notes
|
||||
-----
|
||||
Some parts of the assignment store long-frame coefficients as a column vector
|
||||
(1024, 1) to match MATLAB conventions. Internally you may also use (1024,)
|
||||
when convenient, but the semantic meaning is identical.
|
||||
"""
|
||||
|
||||
TnsCoeffs: TypeAlias = FloatArray
|
||||
"""
|
||||
Quantized TNS predictor coefficients (one channel).
|
||||
|
||||
Typical shapes (Level 2):
|
||||
- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes)
|
||||
- Else: (4, 1) (order p=4 for the long frame)
|
||||
|
||||
dtype: float64
|
||||
|
||||
Notes
|
||||
-----
|
||||
The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1.
|
||||
We store the quantized coefficient values as float64 (typically multiples of 0.1)
|
||||
to keep the pipeline simple and readable.
|
||||
"""
|
||||
|
||||
|
||||
FrameT: TypeAlias = FloatArray
|
||||
"""
|
||||
Time-domain frame (stereo), as used by the filterbank input/output.
|
||||
|
||||
Expected (typical) shape for stereo: (2048, 2)
|
||||
- axis 0: time samples
|
||||
- axis 1: channels [L, R]
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
FrameChannelT: TypeAlias = FloatArray
|
||||
"""
|
||||
Time-domain single-channel frame.
|
||||
|
||||
Expected (typical) shape: (2048,)
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
FrameF: TypeAlias = FloatArray
|
||||
"""
|
||||
Frequency-domain frame (MDCT coefficients), stereo container.
|
||||
|
||||
Typical shapes (Level 1):
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024, 2)
|
||||
- If frame_type == "ESH": (128, 16)
|
||||
|
||||
Rationale for ESH (128, 16):
|
||||
- 8 short subframes per channel => 8 * 2 = 16 columns total
|
||||
- Each short subframe per stereo is (128, 2), flattened into columns
|
||||
in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R]
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
FrameChannelF: TypeAlias = MdctFrameChannel
|
||||
"""
|
||||
Frequency-domain single-channel MDCT coefficients.
|
||||
|
||||
Typical shapes (Level 1/2):
|
||||
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
|
||||
- If frame_type == "ESH": (128, 8)
|
||||
|
||||
dtype: float64
|
||||
"""
|
||||
|
||||
BandRanges: TypeAlias = list[tuple[int, int]]
|
||||
"""
|
||||
Bark-band index ranges [start, end] (inclusive) for MDCT lines.
|
||||
|
||||
Used by TNS to map MDCT indices k to Bark bands.
|
||||
"""
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 1 AAC sequence payload types
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class AACChannelFrameF(TypedDict):
|
||||
"""
|
||||
Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).
|
||||
|
||||
Keys
|
||||
----
|
||||
frame_F:
|
||||
The MDCT coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (128, 8) (8 short subframes)
|
||||
- else: (1024, 1) or (1024,)
|
||||
"""
|
||||
frame_F: FrameChannelF
|
||||
|
||||
|
||||
class AACSeq1Frame(TypedDict):
|
||||
"""
|
||||
One frame dictionary element of aac_seq_1 (Level 1).
|
||||
"""
|
||||
frame_type: FrameType
|
||||
win_type: WinType
|
||||
chl: AACChannelFrameF
|
||||
chr: AACChannelFrameF
|
||||
|
||||
|
||||
AACSeq1: TypeAlias = List[AACSeq1Frame]
|
||||
"""
|
||||
AAC sequence for Level 1:
|
||||
List of length K (K = number of frames).
|
||||
|
||||
Each element is a dict with keys:
|
||||
- "frame_type", "win_type", "chl", "chr"
|
||||
"""
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Level 2 AAC sequence payload types (TNS)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class AACChannelFrameF2(TypedDict):
|
||||
"""
|
||||
Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2).
|
||||
|
||||
Keys
|
||||
----
|
||||
frame_F:
|
||||
The TNS-processed MDCT coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (128, 8)
|
||||
- else: (1024, 1) or (1024,)
|
||||
tns_coeffs:
|
||||
Quantized TNS predictor coefficients for ONE channel.
|
||||
Typical shapes:
|
||||
- ESH: (PRED_ORDER, 8)
|
||||
- else: (PRED_ORDER, 1)
|
||||
"""
|
||||
frame_F: FrameChannelF
|
||||
tns_coeffs: TnsCoeffs
|
||||
|
||||
|
||||
class AACSeq2Frame(TypedDict):
|
||||
"""
|
||||
One frame dictionary element of aac_seq_2 (Level 2).
|
||||
"""
|
||||
frame_type: FrameType
|
||||
win_type: WinType
|
||||
chl: AACChannelFrameF2
|
||||
chr: AACChannelFrameF2
|
||||
|
||||
|
||||
AACSeq2: TypeAlias = List[AACSeq2Frame]
|
||||
"""
|
||||
AAC sequence for Level 2:
|
||||
List of length K (K = number of frames).
|
||||
|
||||
Each element is a dict with keys:
|
||||
- "frame_type", "win_type", "chl", "chr"
|
||||
|
||||
Level 2 adds:
|
||||
- per-channel "tns_coeffs"
|
||||
and stores:
|
||||
- per-channel "frame_F" after applying TNS.
|
||||
"""
|
||||
@ -19,3 +19,125 @@
|
||||
# ------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import soundfile as sf
|
||||
|
||||
from core.aac_types import AACSeq2, StereoSignal
|
||||
from core.aac_coder import aac_coder_2 as core_aac_coder_2
|
||||
from core.aac_coder import aac_read_wav_stereo_48k
|
||||
from core.aac_decoder import aac_decoder_2 as core_aac_decoder_2
|
||||
from core.aac_snr_db import snr_db
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Public Level 2 API (wrappers)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
||||
"""
|
||||
Level-2 AAC encoder (wrapper).
|
||||
|
||||
Delegates to core implementation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename_in : Union[str, Path]
|
||||
Input WAV filename.
|
||||
Assumption: stereo audio, sampling rate 48 kHz.
|
||||
|
||||
Returns
|
||||
-------
|
||||
AACSeq2
|
||||
List of encoded frames (Level 2 schema).
|
||||
"""
|
||||
return core_aac_coder_2(filename_in)
|
||||
|
||||
|
||||
def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
||||
"""
|
||||
Level-2 AAC decoder (wrapper).
|
||||
|
||||
Delegates to core implementation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
aac_seq_2 : AACSeq2
|
||||
Encoded sequence as produced by aac_coder_2().
|
||||
filename_out : Union[str, Path]
|
||||
Output WAV filename. Assumption: 48 kHz, stereo.
|
||||
|
||||
Returns
|
||||
-------
|
||||
StereoSignal
|
||||
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||
"""
|
||||
return core_aac_decoder_2(aac_seq_2, filename_out)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Demo (Level 2)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def demo_aac_2(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float:
|
||||
"""
|
||||
Demonstration for the Level-2 codec.
|
||||
|
||||
Runs:
|
||||
- aac_coder_2(filename_in)
|
||||
- aac_decoder_2(aac_seq_2, filename_out)
|
||||
and computes total SNR between original and decoded audio.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename_in : Union[str, Path]
|
||||
Input WAV filename (stereo, 48 kHz).
|
||||
filename_out : Union[str, Path]
|
||||
Output WAV filename (stereo, 48 kHz).
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
Overall SNR in dB.
|
||||
"""
|
||||
filename_in = Path(filename_in)
|
||||
filename_out = Path(filename_out)
|
||||
|
||||
# Read original audio (reference) with the same validation as the codec.
|
||||
x_ref, fs_ref = aac_read_wav_stereo_48k(filename_in)
|
||||
if int(fs_ref) != 48000:
|
||||
raise ValueError("Input sampling rate must be 48 kHz.")
|
||||
|
||||
# Encode / decode
|
||||
aac_seq_2 = aac_coder_2(filename_in)
|
||||
x_hat = i_aac_coder_2(aac_seq_2, filename_out)
|
||||
|
||||
# Optional sanity: ensure output file exists and is readable
|
||||
_, fs_hat = sf.read(str(filename_out), always_2d=True)
|
||||
if int(fs_hat) != 48000:
|
||||
raise ValueError("Decoded output sampling rate must be 48 kHz.")
|
||||
|
||||
return snr_db(x_ref, x_hat)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# CLI
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Example:
|
||||
# cd level_2
|
||||
# python -m level_2 input.wav output.wav
|
||||
# or
|
||||
# python -m level_2 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l2.wav
|
||||
import sys
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
raise SystemExit("Usage: python -m level_2 <input.wav> <output.wav>")
|
||||
|
||||
in_wav = Path(sys.argv[1])
|
||||
out_wav = Path(sys.argv[2])
|
||||
|
||||
print(f"Encoding/Decoding {in_wav} to {out_wav}")
|
||||
snr = demo_aac_2(in_wav, out_wav)
|
||||
print(f"SNR = {snr:.3f} dB")
|
||||
|
||||
BIN
source/level_2/material/LicorDeCalandraca.wav
Normal file
BIN
source/level_2/material/LicorDeCalandraca.wav
Normal file
Binary file not shown.
BIN
source/level_2/material/LicorDeCalandraca_out2.wav
Normal file
BIN
source/level_2/material/LicorDeCalandraca_out2.wav
Normal file
Binary file not shown.
BIN
source/level_2/material/TableB219.mat
Normal file
BIN
source/level_2/material/TableB219.mat
Normal file
Binary file not shown.
BIN
source/level_2/material/huffCodebooks.mat
Normal file
BIN
source/level_2/material/huffCodebooks.mat
Normal file
Binary file not shown.
400
source/level_2/material/huff_utils.py
Normal file
400
source/level_2/material/huff_utils.py
Normal file
@ -0,0 +1,400 @@
|
||||
import numpy as np
|
||||
import scipy.io as sio
|
||||
import os
|
||||
|
||||
# ------------------ LOAD LUT ------------------
|
||||
|
||||
def load_LUT(mat_filename=None):
|
||||
"""
|
||||
Loads the list of Huffman Codebooks (LUTs)
|
||||
|
||||
Returns:
|
||||
huffLUT : list (index 1..11 used, index 0 unused)
|
||||
"""
|
||||
if mat_filename is None:
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
mat_filename = os.path.join(current_dir, "huffCodebooks.mat")
|
||||
|
||||
mat = sio.loadmat(mat_filename)
|
||||
|
||||
|
||||
huffCodebooks_raw = mat['huffCodebooks'].squeeze()
|
||||
|
||||
huffCodebooks = []
|
||||
for i in range(11):
|
||||
huffCodebooks.append(np.array(huffCodebooks_raw[i]))
|
||||
|
||||
# Build inverse VLC tables
|
||||
invTable = [None] * 11
|
||||
|
||||
for i in range(11):
|
||||
h = huffCodebooks[i][:, 2].astype(int) # column 3
|
||||
hlength = huffCodebooks[i][:, 1].astype(int) # column 2
|
||||
|
||||
hbin = []
|
||||
for j in range(len(h)):
|
||||
hbin.append(format(h[j], f'0{hlength[j]}b'))
|
||||
|
||||
invTable[i] = vlc_table(hbin)
|
||||
|
||||
# Build Huffman LUT dicts
|
||||
huffLUT = [None] * 12 # index 0 unused
|
||||
params = [
|
||||
(4, 1, True),
|
||||
(4, 1, True),
|
||||
(4, 2, False),
|
||||
(4, 2, False),
|
||||
(2, 4, True),
|
||||
(2, 4, True),
|
||||
(2, 7, False),
|
||||
(2, 7, False),
|
||||
(2, 12, False),
|
||||
(2, 12, False),
|
||||
(2, 16, False),
|
||||
]
|
||||
|
||||
for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1):
|
||||
huffLUT[i] = {
|
||||
'LUT': huffCodebooks[i-1],
|
||||
'invTable': invTable[i-1],
|
||||
'codebook': i,
|
||||
'nTupleSize': nTupleSize,
|
||||
'maxAbsCodeVal': maxAbs,
|
||||
'signedValues': signed
|
||||
}
|
||||
|
||||
return huffLUT
|
||||
|
||||
def vlc_table(code_array):
|
||||
"""
|
||||
codeArray: list of strings, each string is a Huffman codeword (e.g. '0101')
|
||||
returns:
|
||||
h : NumPy array of shape (num_nodes, 3)
|
||||
columns:
|
||||
[ next_if_0 , next_if_1 , symbol_index ]
|
||||
"""
|
||||
h = np.zeros((1, 3), dtype=int)
|
||||
|
||||
for code_index, code in enumerate(code_array, start=1):
|
||||
word = [int(bit) for bit in code]
|
||||
h_index = 0
|
||||
|
||||
for bit in word:
|
||||
k = bit
|
||||
next_node = h[h_index, k]
|
||||
if next_node == 0:
|
||||
h = np.vstack([h, [0, 0, 0]])
|
||||
new_index = h.shape[0] - 1
|
||||
h[h_index, k] = new_index
|
||||
h_index = new_index
|
||||
else:
|
||||
h_index = next_node
|
||||
|
||||
h[h_index, 2] = code_index
|
||||
|
||||
return h
|
||||
|
||||
# ------------------ ENCODE ------------------
|
||||
|
||||
def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None):
|
||||
"""
|
||||
Huffman-encode a sequence of quantized coefficients.
|
||||
|
||||
This function selects the appropriate Huffman codebook based on the
|
||||
maximum absolute value of the input coefficients, encodes the coefficients
|
||||
into a binary Huffman bitstream, and returns both the bitstream and the
|
||||
selected codebook index.
|
||||
|
||||
This is the Python equivalent of the MATLAB `encodeHuff.m` function used
|
||||
in audio/image coding (e.g., scale factor band encoding). The input
|
||||
coefficient sequence is grouped into fixed-size tuples as defined by
|
||||
the chosen Huffman LUT. Zero-padding may be applied internally.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
coeff_sec : array_like of int
|
||||
1-D array of quantized integer coefficients to encode.
|
||||
Typically corresponds to a "section" or scale-factor band.
|
||||
|
||||
huff_LUT_list : list
|
||||
List of Huffman lookup-table dictionaries as returned by `loadLUT()`.
|
||||
Index 1..11 correspond to valid Huffman codebooks.
|
||||
Index 0 is unused.
|
||||
|
||||
Returns
|
||||
-------
|
||||
huffSec : str
|
||||
Huffman-encoded bitstream represented as a string of '0' and '1'
|
||||
characters.
|
||||
|
||||
huffCodebook : int
|
||||
Index (1..11) of the Huffman codebook used for encoding.
|
||||
A value of 0 indicates a special all-zero section.
|
||||
"""
|
||||
if force_codebook is not None:
|
||||
return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec)
|
||||
|
||||
maxAbsVal = np.max(np.abs(coeff_sec))
|
||||
|
||||
if maxAbsVal == 0:
|
||||
huffCodebook = 0
|
||||
huffSec = huff_LUT_code_0()
|
||||
|
||||
elif maxAbsVal == 1:
|
||||
candidates = [1, 2]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal == 2:
|
||||
candidates = [3, 4]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal in (3, 4):
|
||||
candidates = [5, 6]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal in (5, 6, 7):
|
||||
candidates = [7, 8]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal in (8, 9, 10, 11, 12):
|
||||
candidates = [9, 10]
|
||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
||||
if len(huffSec1) <= len(huffSec2):
|
||||
huffSec = huffSec1
|
||||
huffCodebook = candidates[0]
|
||||
else:
|
||||
huffSec = huffSec2
|
||||
huffCodebook = candidates[1]
|
||||
|
||||
elif maxAbsVal in (13, 14, 15):
|
||||
huffCodebook = 11
|
||||
huffSec = huff_LUT_code_1(huff_LUT_list[huffCodebook], coeff_sec)
|
||||
|
||||
else:
|
||||
huffCodebook = 11
|
||||
huffSec = huff_LUT_code_ESC(huff_LUT_list[huffCodebook], coeff_sec)
|
||||
|
||||
return huffSec, huffCodebook
|
||||
|
||||
def huff_LUT_code_1(huff_LUT, coeff_sec):
|
||||
LUT = huff_LUT['LUT']
|
||||
nTupleSize = huff_LUT['nTupleSize']
|
||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
||||
signedValues = huff_LUT['signedValues']
|
||||
|
||||
numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
|
||||
|
||||
if signedValues:
|
||||
coeff = coeff_sec + maxAbsCodeVal
|
||||
base = 2 * maxAbsCodeVal + 1
|
||||
else:
|
||||
coeff = coeff_sec
|
||||
base = maxAbsCodeVal + 1
|
||||
|
||||
coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
|
||||
coeffPad[:len(coeff)] = coeff
|
||||
|
||||
huffSec = []
|
||||
|
||||
powers = base ** np.arange(nTupleSize - 1, -1, -1)
|
||||
|
||||
for i in range(numTuples):
|
||||
nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
|
||||
huffIndex = int(np.abs(nTuple) @ powers)
|
||||
|
||||
hexVal = LUT[huffIndex, 2]
|
||||
huffLen = LUT[huffIndex, 1]
|
||||
|
||||
bits = format(int(hexVal), f'0{int(huffLen)}b')
|
||||
|
||||
if signedValues:
|
||||
huffSec.append(bits)
|
||||
else:
|
||||
signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
|
||||
huffSec.append(bits + signBits)
|
||||
|
||||
return ''.join(huffSec)
|
||||
|
||||
def huff_LUT_code_0():
|
||||
return ''
|
||||
|
||||
def huff_LUT_code_ESC(huff_LUT, coeff_sec):
|
||||
LUT = huff_LUT['LUT']
|
||||
nTupleSize = huff_LUT['nTupleSize']
|
||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
||||
|
||||
numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
|
||||
base = maxAbsCodeVal + 1
|
||||
|
||||
coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
|
||||
coeffPad[:len(coeff_sec)] = coeff_sec
|
||||
|
||||
huffSec = []
|
||||
powers = base ** np.arange(nTupleSize - 1, -1, -1)
|
||||
|
||||
for i in range(numTuples):
|
||||
nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
|
||||
|
||||
lnTuple = nTuple.astype(float)
|
||||
lnTuple[lnTuple == 0] = np.finfo(float).eps
|
||||
|
||||
N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int))
|
||||
N = np.maximum(0, N4 - 4)
|
||||
esc = np.abs(nTuple) > 15
|
||||
|
||||
nTupleESC = nTuple.copy()
|
||||
nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16
|
||||
|
||||
huffIndex = int(np.abs(nTupleESC) @ powers)
|
||||
|
||||
hexVal = LUT[huffIndex, 2]
|
||||
huffLen = LUT[huffIndex, 1]
|
||||
|
||||
bits = format(int(hexVal), f'0{int(huffLen)}b')
|
||||
|
||||
escSeq = ''
|
||||
for k in range(nTupleSize):
|
||||
if esc[k]:
|
||||
escSeq += '1' * N[k]
|
||||
escSeq += '0'
|
||||
escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b')
|
||||
|
||||
signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
|
||||
huffSec.append(bits + signBits + escSeq)
|
||||
|
||||
return ''.join(huffSec)
|
||||
|
||||
# ------------------ DECODE ------------------
|
||||
|
||||
def decode_huff(huff_sec, huff_LUT):
|
||||
"""
|
||||
Decode a Huffman-encoded stream.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
huff_sec : array-like of int or str
|
||||
Huffman encoded stream as a sequence of 0 and 1 (string or list/array).
|
||||
huff_LUT : dict
|
||||
Huffman lookup table with keys:
|
||||
- 'invTable': inverse table (numpy array)
|
||||
- 'codebook': codebook number
|
||||
- 'nTupleSize': tuple size
|
||||
- 'maxAbsCodeVal': maximum absolute code value
|
||||
- 'signedValues': True/False
|
||||
|
||||
Returns
|
||||
-------
|
||||
decCoeffs : list of int
|
||||
Decoded quantized coefficients.
|
||||
"""
|
||||
|
||||
h = huff_LUT['invTable']
|
||||
huffCodebook = huff_LUT['codebook']
|
||||
nTupleSize = huff_LUT['nTupleSize']
|
||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
||||
signedValues = huff_LUT['signedValues']
|
||||
|
||||
# Convert string to array of ints
|
||||
if isinstance(huff_sec, str):
|
||||
huff_sec = np.array([int(b) for b in huff_sec])
|
||||
|
||||
eos = False
|
||||
decCoeffs = []
|
||||
streamIndex = 0
|
||||
|
||||
while not eos:
|
||||
wordbit = 0
|
||||
r = 0 # start at root
|
||||
|
||||
# Decode Huffman word using inverse table
|
||||
while True:
|
||||
b = huff_sec[streamIndex + wordbit]
|
||||
wordbit += 1
|
||||
rOld = r
|
||||
r = h[rOld, b]
|
||||
if h[r, 0] == 0 and h[r, 1] == 0:
|
||||
symbolIndex = h[r, 2] - 1 # zero-based
|
||||
streamIndex += wordbit
|
||||
break
|
||||
|
||||
# Decode n-tuple magnitudes
|
||||
if signedValues:
|
||||
base = 2 * maxAbsCodeVal + 1
|
||||
nTupleDec = []
|
||||
tmp = symbolIndex
|
||||
for p in reversed(range(nTupleSize)):
|
||||
val = tmp // (base ** p)
|
||||
nTupleDec.append(val - maxAbsCodeVal)
|
||||
tmp = tmp % (base ** p)
|
||||
nTupleDec = np.array(nTupleDec)
|
||||
else:
|
||||
base = maxAbsCodeVal + 1
|
||||
nTupleDec = []
|
||||
tmp = symbolIndex
|
||||
for p in reversed(range(nTupleSize)):
|
||||
val = tmp // (base ** p)
|
||||
nTupleDec.append(val)
|
||||
tmp = tmp % (base ** p)
|
||||
nTupleDec = np.array(nTupleDec)
|
||||
|
||||
# Apply sign bits
|
||||
nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize]
|
||||
nTupleSign = -(np.sign(nTupleSignBits - 0.5))
|
||||
streamIndex += nTupleSize
|
||||
nTupleDec = nTupleDec * nTupleSign
|
||||
|
||||
# Handle escape sequences
|
||||
escIndex = np.where(np.abs(nTupleDec) == 16)[0]
|
||||
if huffCodebook == 11 and escIndex.size > 0:
|
||||
for idx in escIndex:
|
||||
N = 0
|
||||
b = huff_sec[streamIndex]
|
||||
while b:
|
||||
N += 1
|
||||
b = huff_sec[streamIndex + N]
|
||||
streamIndex += N
|
||||
N4 = N + 4
|
||||
escape_word = huff_sec[streamIndex:streamIndex + N4]
|
||||
escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2)
|
||||
nTupleDec[idx] = escape_value
|
||||
streamIndex += N4 + 1
|
||||
# Apply signs again
|
||||
nTupleDec[escIndex] *= nTupleSign[escIndex]
|
||||
|
||||
decCoeffs.extend(nTupleDec.tolist())
|
||||
|
||||
if streamIndex >= len(huff_sec):
|
||||
eos = True
|
||||
|
||||
return decCoeffs
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user