Level 3: A first failed attempt of encoding-decoding -> SNR=0
This commit is contained in:
parent
ae4ad82136
commit
4ebee28e4e
@ -9,8 +9,16 @@
|
|||||||
# cchoutou@ece.auth.gr
|
# cchoutou@ece.auth.gr
|
||||||
#
|
#
|
||||||
# Description:
|
# Description:
|
||||||
# - Level 1 AAC encoder orchestration.
|
# Level 1 AAC encoder orchestration.
|
||||||
# - Level 2 AAC encoder orchestration.
|
# Keeps the same functional behavior as the original level_1 implementation:
|
||||||
|
# - Reads WAV via soundfile
|
||||||
|
# - Validates stereo and 48 kHz
|
||||||
|
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
|
||||||
|
# - SSC decision uses next-frame attack detection
|
||||||
|
# - Filterbank analysis (MDCT)
|
||||||
|
# - Stores per-channel spectra in AACSeq1 schema:
|
||||||
|
# * ESH: (128, 8)
|
||||||
|
# * else: (1024, 1)
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@ -18,14 +26,106 @@ from pathlib import Path
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
from scipy.io import savemat
|
||||||
|
|
||||||
from core.aac_configuration import WIN_TYPE
|
from core.aac_configuration import WIN_TYPE
|
||||||
from core.aac_filterbank import aac_filter_bank
|
from core.aac_filterbank import aac_filter_bank
|
||||||
from core.aac_ssc import aac_SSC
|
from core.aac_ssc import aac_ssc
|
||||||
from core.aac_tns import aac_tns
|
from core.aac_tns import aac_tns
|
||||||
|
from core.aac_psycho import aac_psycho
|
||||||
|
from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py
|
||||||
|
from core.aac_huffman import aac_encode_huff
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
|
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helpers for thresholds (T(b))
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]:
|
||||||
|
"""
|
||||||
|
Return inclusive (lo, hi) band slices derived from TableB219.
|
||||||
|
"""
|
||||||
|
table, _ = get_table(frame_type)
|
||||||
|
wlow, whigh, _bval, _qthr_db = band_limits(table)
|
||||||
|
return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)]
|
||||||
|
|
||||||
|
|
||||||
|
def _thresholds_from_smr(
|
||||||
|
frame_F_ch: FrameChannelF,
|
||||||
|
frame_type: FrameType,
|
||||||
|
SMR: FloatArray,
|
||||||
|
) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Compute thresholds T(b) = P(b) / SMR(b), where P(b) is band energy.
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Long: returns (NB, 1)
|
||||||
|
- ESH: returns (NB, 8)
|
||||||
|
"""
|
||||||
|
bands = _band_slices_from_table(frame_type)
|
||||||
|
NB = len(bands)
|
||||||
|
|
||||||
|
X = np.asarray(frame_F_ch, dtype=np.float64)
|
||||||
|
SMR = np.asarray(SMR, dtype=np.float64)
|
||||||
|
|
||||||
|
if frame_type == "ESH":
|
||||||
|
if X.shape != (128, 8):
|
||||||
|
raise ValueError("For ESH, frame_F_ch must have shape (128, 8).")
|
||||||
|
if SMR.shape != (NB, 8):
|
||||||
|
raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")
|
||||||
|
|
||||||
|
T = np.zeros((NB, 8), dtype=np.float64)
|
||||||
|
for j in range(8):
|
||||||
|
Xj = X[:, j]
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
P = float(np.sum(Xj[lo : hi + 1] ** 2))
|
||||||
|
smr = float(SMR[b, j])
|
||||||
|
T[b, j] = 0.0 if smr <= 1e-12 else (P / smr)
|
||||||
|
return T
|
||||||
|
|
||||||
|
# Long
|
||||||
|
if X.shape == (1024,):
|
||||||
|
Xv = X
|
||||||
|
elif X.shape == (1024, 1):
|
||||||
|
Xv = X[:, 0]
|
||||||
|
else:
|
||||||
|
raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).")
|
||||||
|
|
||||||
|
if SMR.shape == (NB,):
|
||||||
|
SMRv = SMR
|
||||||
|
elif SMR.shape == (NB, 1):
|
||||||
|
SMRv = SMR[:, 0]
|
||||||
|
else:
|
||||||
|
raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).")
|
||||||
|
|
||||||
|
T = np.zeros((NB, 1), dtype=np.float64)
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
P = float(np.sum(Xv[lo : hi + 1] ** 2))
|
||||||
|
smr = float(SMRv[b])
|
||||||
|
T[b, 0] = 0.0 if smr <= 1e-12 else (P / smr)
|
||||||
|
|
||||||
|
return T
|
||||||
|
|
||||||
|
def _normalize_global_gain(G: GlobalGain) -> float | FloatArray:
|
||||||
|
"""
|
||||||
|
Normalize GlobalGain to match AACChannelFrameF3["G"] type:
|
||||||
|
- long: return float
|
||||||
|
- ESH: return float64 ndarray of shape (1, 8)
|
||||||
|
"""
|
||||||
|
if np.isscalar(G):
|
||||||
|
return float(G)
|
||||||
|
|
||||||
|
G_arr = np.asarray(G)
|
||||||
|
if G_arr.size == 1:
|
||||||
|
return float(G_arr.reshape(-1)[0])
|
||||||
|
|
||||||
|
return np.asarray(G_arr, dtype=np.float64)
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public helpers (useful for level_x demo wrappers)
|
# Public helpers (useful for level_x demo wrappers)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -174,7 +274,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
||||||
|
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
||||||
@ -191,10 +291,6 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Level 2 encoder
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC encoder (Level 1 + TNS).
|
Level-2 AAC encoder (Level 1 + TNS).
|
||||||
@ -246,7 +342,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
# Level 1 analysis (packed stereo container)
|
# Level 1 analysis (packed stereo container)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
@ -278,3 +374,180 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
|
def aac_coder_3(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
filename_aac_coded: Union[str, Path] | None = None,
|
||||||
|
) -> AACSeq3:
|
||||||
|
"""
|
||||||
|
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filename_in : Union[str, Path]
|
||||||
|
Input WAV filename (stereo, 48 kHz).
|
||||||
|
filename_aac_coded : Union[str, Path] | None
|
||||||
|
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
AACSeq3
|
||||||
|
Encoded AAC sequence (Level 3 payload schema).
|
||||||
|
"""
|
||||||
|
filename_in = Path(filename_in)
|
||||||
|
|
||||||
|
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||||
|
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
|
||||||
|
pad_pre = np.zeros((hop, 2), dtype=np.float64)
|
||||||
|
pad_post = np.zeros((hop, 2), dtype=np.float64)
|
||||||
|
x_pad = np.vstack([pad_pre, x, pad_post])
|
||||||
|
|
||||||
|
K = int((x_pad.shape[0] - win) // hop + 1)
|
||||||
|
if K <= 0:
|
||||||
|
raise ValueError("Input too short for framing.")
|
||||||
|
|
||||||
|
# Load Huffman LUTs once.
|
||||||
|
huff_LUT_list = load_LUT()
|
||||||
|
|
||||||
|
aac_seq: AACSeq3 = []
|
||||||
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
|
# Pin win_type to the WinType literal for type checkers.
|
||||||
|
win_type: WinType = WIN_TYPE
|
||||||
|
|
||||||
|
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
||||||
|
prev1_L = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev2_L = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev1_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev2_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
|
||||||
|
for i in range(K):
|
||||||
|
start = i * hop
|
||||||
|
|
||||||
|
frame_t: FrameT = x_pad[start : start + win, :]
|
||||||
|
if frame_t.shape != (win, 2):
|
||||||
|
raise ValueError("Internal framing error: frame_t has wrong shape.")
|
||||||
|
|
||||||
|
next_t = x_pad[start + hop : start + hop + win, :]
|
||||||
|
if next_t.shape[0] < win:
|
||||||
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
|
# Analysis filterbank (stereo packed)
|
||||||
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type)
|
||||||
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
|
|
||||||
|
# TNS per channel
|
||||||
|
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||||
|
chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)
|
||||||
|
|
||||||
|
# Psychoacoustic model per channel (time-domain)
|
||||||
|
frame_L = np.asarray(frame_t[:, 0], dtype=np.float64)
|
||||||
|
frame_R = np.asarray(frame_t[:, 1], dtype=np.float64)
|
||||||
|
|
||||||
|
SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L)
|
||||||
|
SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R)
|
||||||
|
|
||||||
|
# Thresholds T(b) (stored, not entropy-coded)
|
||||||
|
T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L)
|
||||||
|
T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R)
|
||||||
|
|
||||||
|
# Quantizer per channel
|
||||||
|
S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L)
|
||||||
|
S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R)
|
||||||
|
|
||||||
|
# Normalize G types for AACSeq3 schema (float | float64 ndarray).
|
||||||
|
G_Ln = _normalize_global_gain(G_L)
|
||||||
|
G_Rn = _normalize_global_gain(G_R)
|
||||||
|
|
||||||
|
# Huffman-code ONLY the DPCM differences for b>0.
|
||||||
|
# sfc[0] corresponds to alpha(0)=G and is stored separately in the frame.
|
||||||
|
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
||||||
|
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
||||||
|
|
||||||
|
# Codebook 11:
|
||||||
|
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
||||||
|
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
||||||
|
sf_cb = 11
|
||||||
|
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
||||||
|
|
||||||
|
sfc_L_dpcm = np.clip(
|
||||||
|
sfc_L_dpcm,
|
||||||
|
-sf_max_abs,
|
||||||
|
sf_max_abs,
|
||||||
|
).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
sfc_R_dpcm = np.clip(
|
||||||
|
sfc_R_dpcm,
|
||||||
|
-sf_max_abs,
|
||||||
|
sf_max_abs,
|
||||||
|
).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
sfc_L_stream, _ = aac_encode_huff(
|
||||||
|
sfc_L_dpcm.reshape(-1, order="F"),
|
||||||
|
huff_LUT_list,
|
||||||
|
force_codebook=sf_cb,
|
||||||
|
)
|
||||||
|
sfc_R_stream, _ = aac_encode_huff(
|
||||||
|
sfc_R_dpcm.reshape(-1, order="F"),
|
||||||
|
huff_LUT_list,
|
||||||
|
force_codebook=sf_cb,
|
||||||
|
)
|
||||||
|
|
||||||
|
mdct_L_stream, cb_L = aac_encode_huff(
|
||||||
|
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
||||||
|
huff_LUT_list,
|
||||||
|
)
|
||||||
|
mdct_R_stream, cb_R = aac_encode_huff(
|
||||||
|
np.asarray(S_R, dtype=np.int64).reshape(-1),
|
||||||
|
huff_LUT_list,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
|
frame_out: AACSeq3Frame = {
|
||||||
|
"frame_type": frame_type,
|
||||||
|
"win_type": win_type,
|
||||||
|
"chl": {
|
||||||
|
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
||||||
|
"T": np.asarray(T_L, dtype=np.float64),
|
||||||
|
"G": G_Ln,
|
||||||
|
"sfc": sfc_L_stream,
|
||||||
|
"stream": mdct_L_stream,
|
||||||
|
"codebook": int(cb_L),
|
||||||
|
},
|
||||||
|
"chr": {
|
||||||
|
"tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64),
|
||||||
|
"T": np.asarray(T_R, dtype=np.float64),
|
||||||
|
"G": G_Rn,
|
||||||
|
"sfc": sfc_R_stream,
|
||||||
|
"stream": mdct_R_stream,
|
||||||
|
"codebook": int(cb_R),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
aac_seq.append(frame_out)
|
||||||
|
|
||||||
|
# Update psycho history (shift register)
|
||||||
|
prev2_L = prev1_L
|
||||||
|
prev1_L = frame_L
|
||||||
|
prev2_R = prev1_R
|
||||||
|
prev1_R = frame_R
|
||||||
|
|
||||||
|
prev_frame_type = frame_type
|
||||||
|
|
||||||
|
# Optional: store to .mat for the assignment wrapper
|
||||||
|
if filename_aac_coded is not None:
|
||||||
|
filename_aac_coded = Path(filename_aac_coded)
|
||||||
|
savemat(
|
||||||
|
str(filename_aac_coded),
|
||||||
|
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
||||||
|
do_compression=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
return aac_seq
|
||||||
|
|
||||||
|
|||||||
@ -22,9 +22,22 @@ import soundfile as sf
|
|||||||
|
|
||||||
from core.aac_filterbank import aac_i_filter_bank
|
from core.aac_filterbank import aac_i_filter_bank
|
||||||
from core.aac_tns import aac_i_tns
|
from core.aac_tns import aac_i_tns
|
||||||
|
from core.aac_quantizer import aac_i_quantizer
|
||||||
|
from core.aac_huffman import aac_decode_huff
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helper for NB
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _nbands(frame_type: FrameType) -> int:
|
||||||
|
table, _ = get_table(frame_type)
|
||||||
|
wlow, _whigh, _bval, _qthr_db = band_limits(table)
|
||||||
|
return int(len(wlow))
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public helpers
|
# Public helpers
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -252,3 +265,144 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal:
|
||||||
|
"""
|
||||||
|
Level-3 AAC decoder (inverse of aac_coder_3).
|
||||||
|
|
||||||
|
Steps per frame:
|
||||||
|
- Huffman decode scalefactors (sfc) using codebook 11
|
||||||
|
- Huffman decode MDCT symbols (stream) using stored codebook
|
||||||
|
- iQuantizer -> MDCT coefficients after TNS
|
||||||
|
- iTNS using stored predictor coefficients
|
||||||
|
- IMDCT filterbank -> time domain
|
||||||
|
- Overlap-add, remove padding, write WAV
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
aac_seq_3 : AACSeq3
|
||||||
|
Encoded sequence as produced by aac_coder_3.
|
||||||
|
filename_out : Union[str, Path]
|
||||||
|
Output WAV filename.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
StereoSignal
|
||||||
|
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||||
|
"""
|
||||||
|
filename_out = Path(filename_out)
|
||||||
|
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
K = len(aac_seq_3)
|
||||||
|
|
||||||
|
if K <= 0:
|
||||||
|
raise ValueError("aac_seq_3 must contain at least one frame.")
|
||||||
|
|
||||||
|
# Load Huffman LUTs once.
|
||||||
|
huff_LUT_list = load_LUT()
|
||||||
|
|
||||||
|
n_pad = (K - 1) * hop + win
|
||||||
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
for i, fr in enumerate(aac_seq_3):
|
||||||
|
frame_type: FrameType = fr["frame_type"]
|
||||||
|
win_type: WinType = fr["win_type"]
|
||||||
|
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
# We store G separately, so Huffman stream contains only (NB-1) DPCM differences.
|
||||||
|
sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1)
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Left channel
|
||||||
|
# -------------------------
|
||||||
|
tns_L = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
|
||||||
|
G_L = fr["chl"]["G"]
|
||||||
|
sfc_bits_L = fr["chl"]["sfc"]
|
||||||
|
mdct_bits_L = fr["chl"]["stream"]
|
||||||
|
cb_L = int(fr["chl"]["codebook"])
|
||||||
|
|
||||||
|
sfc_dec_L = aac_decode_huff(sfc_bits_L, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False)
|
||||||
|
if frame_type == "ESH":
|
||||||
|
sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 8, order="F")
|
||||||
|
sfc_L = np.zeros((NB, 8), dtype=np.int64)
|
||||||
|
Gv = np.asarray(G_L, dtype=np.float64).reshape(1, 8)
|
||||||
|
sfc_L[0, :] = Gv[0, :].astype(np.int64)
|
||||||
|
sfc_L[1:, :] = sfc_dpcm_L
|
||||||
|
else:
|
||||||
|
sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 1, order="F")
|
||||||
|
sfc_L = np.zeros((NB, 1), dtype=np.int64)
|
||||||
|
sfc_L[0, 0] = int(float(G_L))
|
||||||
|
sfc_L[1:, :] = sfc_dpcm_L
|
||||||
|
|
||||||
|
# MDCT symbols: codebook 0 means "all-zero section"
|
||||||
|
if cb_L == 0:
|
||||||
|
S_dec_L = np.zeros((1024,), dtype=np.int64)
|
||||||
|
else:
|
||||||
|
S_tmp_L = aac_decode_huff(mdct_bits_L, cb_L, huff_LUT_list).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
# Tuple coding may produce extra trailing symbols; caller knows the true length (1024).
|
||||||
|
# Also guard against short outputs by zero-padding.
|
||||||
|
if S_tmp_L.size < 1024:
|
||||||
|
S_dec_L = np.zeros((1024,), dtype=np.int64)
|
||||||
|
S_dec_L[: S_tmp_L.size] = S_tmp_L
|
||||||
|
else:
|
||||||
|
S_dec_L = S_tmp_L[:1024]
|
||||||
|
|
||||||
|
S_L = S_dec_L.reshape(1024, 1)
|
||||||
|
|
||||||
|
Xq_L = aac_i_quantizer(S_L, sfc_L, G_L, frame_type)
|
||||||
|
X_L = aac_i_tns(Xq_L, frame_type, tns_L)
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Right channel
|
||||||
|
# -------------------------
|
||||||
|
tns_R = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)
|
||||||
|
G_R = fr["chr"]["G"]
|
||||||
|
sfc_bits_R = fr["chr"]["sfc"]
|
||||||
|
mdct_bits_R = fr["chr"]["stream"]
|
||||||
|
cb_R = int(fr["chr"]["codebook"])
|
||||||
|
|
||||||
|
sfc_dec_R = aac_decode_huff(sfc_bits_R, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False)
|
||||||
|
if frame_type == "ESH":
|
||||||
|
sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 8, order="F")
|
||||||
|
sfc_R = np.zeros((NB, 8), dtype=np.int64)
|
||||||
|
Gv = np.asarray(G_R, dtype=np.float64).reshape(1, 8)
|
||||||
|
sfc_R[0, :] = Gv[0, :].astype(np.int64)
|
||||||
|
sfc_R[1:, :] = sfc_dpcm_R
|
||||||
|
else:
|
||||||
|
sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 1, order="F")
|
||||||
|
sfc_R = np.zeros((NB, 1), dtype=np.int64)
|
||||||
|
sfc_R[0, 0] = int(float(G_R))
|
||||||
|
sfc_R[1:, :] = sfc_dpcm_R
|
||||||
|
|
||||||
|
if cb_R == 0:
|
||||||
|
S_dec_R = np.zeros((1024,), dtype=np.int64)
|
||||||
|
else:
|
||||||
|
S_tmp_R = aac_decode_huff(mdct_bits_R, cb_R, huff_LUT_list).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
if S_tmp_R.size < 1024:
|
||||||
|
S_dec_R = np.zeros((1024,), dtype=np.int64)
|
||||||
|
S_dec_R[: S_tmp_R.size] = S_tmp_R
|
||||||
|
else:
|
||||||
|
S_dec_R = S_tmp_R[:1024]
|
||||||
|
|
||||||
|
S_R = S_dec_R.reshape(1024, 1)
|
||||||
|
|
||||||
|
Xq_R = aac_i_quantizer(S_R, sfc_R, G_R, frame_type)
|
||||||
|
X_R = aac_i_tns(Xq_R, frame_type, tns_R)
|
||||||
|
|
||||||
|
# Re-pack to stereo container and inverse filterbank
|
||||||
|
frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R))
|
||||||
|
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
|
||||||
|
|
||||||
|
start = i * hop
|
||||||
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
|
||||||
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
|
||||||
|
sf.write(str(filename_out), y, 48000)
|
||||||
|
return y
|
||||||
|
|
||||||
|
|||||||
112
source/core/aac_huffman.py
Normal file
112
source/core/aac_huffman.py
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Huffman wrappers (Level 3)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Thin wrappers around the provided Huffman utilities (material/huff_utils.py)
|
||||||
|
# so that the API matches the assignment text.
|
||||||
|
#
|
||||||
|
# Exposed API (assignment):
|
||||||
|
# huff_sec, huff_codebook = aac_encode_huff(coeff_sec, huff_LUT_list, force_codebook)
|
||||||
|
# dec_coeffs = aac_decode_huff(huff_sec, huff_codebook, huff_LUT_list)
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# - Huffman coding operates on tuples. Therefore, decode(encode(x)) may return
|
||||||
|
# extra trailing symbols due to tuple padding. The AAC decoder knows the
|
||||||
|
# true section length from side information (band limits) and truncates.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from material.huff_utils import encode_huff, decode_huff
|
||||||
|
|
||||||
|
|
||||||
|
def aac_encode_huff(
|
||||||
|
coeff_sec: np.ndarray,
|
||||||
|
huff_LUT_list: list[dict[str, Any]],
|
||||||
|
force_codebook: int | None = None,
|
||||||
|
) -> tuple[str, int]:
|
||||||
|
"""
|
||||||
|
Huffman-encode a section of coefficients (MDCT symbols or scalefactors).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
coeff_sec : np.ndarray
|
||||||
|
Coefficient section to be encoded. Any shape is accepted; the input
|
||||||
|
is flattened and treated as a 1-D sequence of int64 symbols.
|
||||||
|
huff_LUT_list : list[dict[str, Any]]
|
||||||
|
List of Huffman Look-Up Tables (LUTs) as returned by material.load_LUT().
|
||||||
|
Index corresponds to codebook id (typically 1..11, with 0 reserved).
|
||||||
|
force_codebook : int | None
|
||||||
|
If provided, forces the use of this Huffman codebook. In the assignment,
|
||||||
|
scalefactors are encoded with codebook 11. For MDCT coefficients, this
|
||||||
|
argument is usually omitted (auto-selection).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tuple[str, int]
|
||||||
|
(huff_sec, huff_codebook)
|
||||||
|
- huff_sec: bitstream as a string of '0'/'1'
|
||||||
|
- huff_codebook: codebook id used by the encoder
|
||||||
|
"""
|
||||||
|
coeff_sec_arr = np.asarray(coeff_sec, dtype=np.int64).reshape(-1)
|
||||||
|
|
||||||
|
if force_codebook is None:
|
||||||
|
# Provided utility returns (bitstream, codebook) in the auto-selection case.
|
||||||
|
huff_sec, huff_codebook = encode_huff(coeff_sec_arr, huff_LUT_list)
|
||||||
|
return str(huff_sec), int(huff_codebook)
|
||||||
|
|
||||||
|
# Provided utility returns ONLY the bitstream when force_codebook is set.
|
||||||
|
cb = int(force_codebook)
|
||||||
|
huff_sec = encode_huff(coeff_sec_arr, huff_LUT_list, force_codebook=cb)
|
||||||
|
return str(huff_sec), cb
|
||||||
|
|
||||||
|
|
||||||
|
def aac_decode_huff(
|
||||||
|
huff_sec: str | np.ndarray,
|
||||||
|
huff_codebook: int,
|
||||||
|
huff_LUT: list[dict[str, Any]],
|
||||||
|
) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Huffman-decode a bitstream using the specified codebook.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
huff_sec : str | np.ndarray
|
||||||
|
Huffman bitstream. Typically a string of '0'/'1'. If an array is provided,
|
||||||
|
it is passed through to the provided decoder.
|
||||||
|
huff_codebook : int
|
||||||
|
Codebook id that was returned by aac_encode_huff.
|
||||||
|
Codebook 0 represents an all-zero section.
|
||||||
|
huff_LUT : list[dict[str, Any]]
|
||||||
|
Huffman LUT list as returned by material.load_LUT().
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
np.ndarray
|
||||||
|
Decoded coefficients as a 1-D np.int64 array.
|
||||||
|
|
||||||
|
Note: Due to tuple coding, the decoded array may contain extra trailing
|
||||||
|
padding symbols. The caller must truncate to the known section length.
|
||||||
|
"""
|
||||||
|
cb = int(huff_codebook)
|
||||||
|
|
||||||
|
if cb == 0:
|
||||||
|
# Codebook 0 represents an all-zero section. The decoded length is not
|
||||||
|
# recoverable from the bitstream alone; the caller must expand/truncate.
|
||||||
|
return np.zeros((0,), dtype=np.int64)
|
||||||
|
|
||||||
|
if cb < 0 or cb >= len(huff_LUT):
|
||||||
|
raise ValueError(f"Invalid Huffman codebook index: {cb}")
|
||||||
|
|
||||||
|
lut = huff_LUT[cb]
|
||||||
|
dec = decode_huff(huff_sec, lut)
|
||||||
|
return np.asarray(dec, dtype=np.int64).reshape(-1)
|
||||||
604
source/core/aac_quantizer.py
Normal file
604
source/core/aac_quantizer.py
Normal file
@ -0,0 +1,604 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Quantizer / iQuantizer (Level 3)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Implements AAC quantizer and inverse quantizer for one channel.
|
||||||
|
# Based on assignment section 2.6 (Eq. 12-15).
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# - Bit reservoir is not implemented (assignment simplification).
|
||||||
|
# - Scalefactor bands are assumed equal to psychoacoustic bands
|
||||||
|
# (Table B.2.1.9a / B.2.1.9b from TableB219.mat).
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Constants (assignment)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
MAGIC_NUMBER: float = 0.4054
|
||||||
|
MQ: int = 8191
|
||||||
|
|
||||||
|
EPS: float = 1e-12
|
||||||
|
MAX_SFC_DIFF: int = 60
|
||||||
|
|
||||||
|
# Safeguard: prevents infinite loops in pathological cases
|
||||||
|
MAX_ALPHA_ITERS: int = 2000
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helpers: ESH packing/unpacking (128x8 <-> 1024x1)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1).
|
||||||
|
|
||||||
|
Packing order:
|
||||||
|
Columns are concatenated in subframe order (0..7), column-major.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x_128x8 : FloatArray
|
||||||
|
ESH coefficients, shape (128, 8).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Packed coefficients, shape (1024, 1).
|
||||||
|
"""
|
||||||
|
x_128x8 = np.asarray(x_128x8, dtype=np.float64)
|
||||||
|
if x_128x8.shape != (128, 8):
|
||||||
|
raise ValueError("ESH pack expects shape (128, 8).")
|
||||||
|
return x_128x8.reshape(1024, 1, order="F")
|
||||||
|
|
||||||
|
|
||||||
|
def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Unpack a packed ESH vector (1024 elements) back to shape (128, 8).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x_1024x1 : FloatArray
|
||||||
|
Packed ESH vector, shape (1024,) or (1024, 1) after flattening.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Unpacked ESH coefficients, shape (128, 8).
|
||||||
|
"""
|
||||||
|
x_1024x1 = np.asarray(x_1024x1, dtype=np.float64).reshape(-1)
|
||||||
|
if x_1024x1.shape[0] != 1024:
|
||||||
|
raise ValueError("ESH unpack expects 1024 elements.")
|
||||||
|
return x_1024x1.reshape(128, 8, order="F")
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Core quantizer formulas (Eq. 12, Eq. 13)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols:
|
||||||
|
"""
|
||||||
|
Quantize MDCT coefficients to integer symbols S(k).
|
||||||
|
|
||||||
|
Implements Eq. (12):
|
||||||
|
S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER )
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : FloatArray
|
||||||
|
MDCT coefficients for a contiguous set of spectral lines.
|
||||||
|
Shape: (N,)
|
||||||
|
alpha : float
|
||||||
|
Scalefactor gain for the corresponding scalefactor band.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
QuantizedSymbols
|
||||||
|
Quantized symbols S(k) as int64, shape (N,).
|
||||||
|
"""
|
||||||
|
x = np.asarray(x, dtype=np.float64)
|
||||||
|
|
||||||
|
scale = 2.0 ** (-0.25 * float(alpha))
|
||||||
|
ax = np.abs(x) * scale
|
||||||
|
|
||||||
|
y = np.power(ax, 0.75, dtype=np.float64)
|
||||||
|
|
||||||
|
# "int" in the assignment corresponds to truncation.
|
||||||
|
q = np.floor(y + MAGIC_NUMBER).astype(np.int64)
|
||||||
|
return (np.sign(x).astype(np.int64) * q).astype(np.int64)
|
||||||
|
|
||||||
|
|
||||||
|
def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Inverse quantizer (dequantization of symbols).
|
||||||
|
|
||||||
|
Implements Eq. (13):
|
||||||
|
Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
S : QuantizedSymbols
|
||||||
|
Quantized symbols S(k), int64, shape (N,).
|
||||||
|
alpha : float
|
||||||
|
Scalefactor gain for the corresponding scalefactor band.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Reconstructed MDCT coefficients Xhat(k), float64, shape (N,).
|
||||||
|
"""
|
||||||
|
S = np.asarray(S, dtype=np.int64)
|
||||||
|
|
||||||
|
scale = 2.0 ** (0.25 * float(alpha))
|
||||||
|
aS = np.abs(S).astype(np.float64)
|
||||||
|
y = np.power(aS, 4.0 / 3.0, dtype=np.float64)
|
||||||
|
|
||||||
|
return (np.sign(S).astype(np.float64) * y * scale).astype(np.float64)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Alpha initialization (Eq. 14)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _alpha_initial_from_frame_max(x_all: FloatArray) -> int:
|
||||||
|
"""
|
||||||
|
Compute the initial scalefactor gain alpha_hat for a frame.
|
||||||
|
|
||||||
|
Implements Eq. (14):
|
||||||
|
alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ )
|
||||||
|
|
||||||
|
The same initial value is used for all bands before the per-band refinement.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x_all : FloatArray
|
||||||
|
All MDCT coefficients of a frame (one channel), flattened.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int
|
||||||
|
Initial alpha value (integer).
|
||||||
|
"""
|
||||||
|
x_all = np.asarray(x_all, dtype=np.float64).reshape(-1)
|
||||||
|
if x_all.size == 0:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
max_abs = float(np.max(np.abs(x_all)))
|
||||||
|
if max_abs <= 0.0:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
val = (max_abs ** 0.75) / float(MQ)
|
||||||
|
if val <= 0.0:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
alpha_hat = (16.0 / 3.0) * np.log2(val)
|
||||||
|
return int(np.floor(alpha_hat))
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Band utilities
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]:
|
||||||
|
"""
|
||||||
|
Return scalefactor band ranges [wlow, whigh] (inclusive) for the given frame type.
|
||||||
|
|
||||||
|
These are derived from the psychoacoustic tables (TableB219),
|
||||||
|
and map directly to MDCT indices:
|
||||||
|
- long: 0..1023
|
||||||
|
- short (ESH subframe): 0..127
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
frame_type : FrameType
|
||||||
|
Frame type ("OLS", "LSS", "ESH", "LPS").
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list[tuple[int, int]]
|
||||||
|
List of (lo, hi) inclusive index pairs for each band.
|
||||||
|
"""
|
||||||
|
table, _Nfft = get_table(frame_type)
|
||||||
|
wlow, whigh, _bval, _qthr_db = band_limits(table)
|
||||||
|
|
||||||
|
bands: list[tuple[int, int]] = []
|
||||||
|
for lo, hi in zip(wlow, whigh):
|
||||||
|
bands.append((int(lo), int(hi)))
|
||||||
|
return bands
|
||||||
|
|
||||||
|
|
||||||
|
def _band_energy(x: FloatArray, lo: int, hi: int) -> float:
|
||||||
|
"""
|
||||||
|
Compute energy of a spectral segment x[lo:hi+1].
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : FloatArray
|
||||||
|
MDCT coefficient vector.
|
||||||
|
lo, hi : int
|
||||||
|
Inclusive index range.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
float
|
||||||
|
Sum of squares (energy) within the band.
|
||||||
|
"""
|
||||||
|
sec = x[lo : hi + 1]
|
||||||
|
return float(np.sum(sec * sec))
|
||||||
|
|
||||||
|
|
||||||
|
def _threshold_T_from_SMR(
|
||||||
|
X: FloatArray,
|
||||||
|
SMR_col: FloatArray,
|
||||||
|
bands: list[tuple[int, int]],
|
||||||
|
) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Compute psychoacoustic thresholds T(b) per band.
|
||||||
|
|
||||||
|
Uses:
|
||||||
|
P(b) = sum_{k in band} X(k)^2
|
||||||
|
T(b) = P(b) / SMR(b)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
X : FloatArray
|
||||||
|
MDCT coefficients for a frame (long) or one ESH subframe (short).
|
||||||
|
SMR_col : FloatArray
|
||||||
|
SMR values for this frame/subframe, shape (NB,).
|
||||||
|
bands : list[tuple[int, int]]
|
||||||
|
Band index ranges.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Threshold vector T(b), shape (NB,).
|
||||||
|
"""
|
||||||
|
nb = len(bands)
|
||||||
|
T = np.zeros((nb,), dtype=np.float64)
|
||||||
|
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
P = _band_energy(X, lo, hi)
|
||||||
|
smr = float(SMR_col[b])
|
||||||
|
if smr <= EPS:
|
||||||
|
T[b] = 0.0
|
||||||
|
else:
|
||||||
|
T[b] = P / smr
|
||||||
|
|
||||||
|
return T
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Alpha selection per band + neighbor-difference constraint
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _best_alpha_for_band(
|
||||||
|
X: FloatArray,
|
||||||
|
lo: int,
|
||||||
|
hi: int,
|
||||||
|
T_b: float,
|
||||||
|
alpha0: int,
|
||||||
|
alpha_min: int,
|
||||||
|
alpha_max: int,
|
||||||
|
) -> int:
|
||||||
|
"""
|
||||||
|
Determine the band-wise scalefactor alpha(b) by iteratively increasing alpha.
|
||||||
|
|
||||||
|
The algorithm increases alpha until the quantization error power exceeds
|
||||||
|
the band threshold:
|
||||||
|
|
||||||
|
P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2
|
||||||
|
select the largest alpha such that P_e(b) <= T(b)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
X : FloatArray
|
||||||
|
Full MDCT vector (one frame or one subframe), shape (N,).
|
||||||
|
lo, hi : int
|
||||||
|
Inclusive MDCT index range defining the band.
|
||||||
|
T_b : float
|
||||||
|
Psychoacoustic threshold for this band.
|
||||||
|
alpha0 : int
|
||||||
|
Initial alpha value for the band.
|
||||||
|
alpha_min, alpha_max : int
|
||||||
|
Bounds for alpha, used as a safeguard.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int
|
||||||
|
Selected integer alpha(b).
|
||||||
|
"""
|
||||||
|
if T_b <= 0.0:
|
||||||
|
return int(alpha0)
|
||||||
|
|
||||||
|
Xsec = X[lo : hi + 1]
|
||||||
|
|
||||||
|
alpha = int(alpha0)
|
||||||
|
alpha = max(alpha_min, min(alpha, alpha_max))
|
||||||
|
|
||||||
|
# Evaluate at current alpha
|
||||||
|
Ssec = _quantize_symbol(Xsec, alpha)
|
||||||
|
Xhat = _dequantize_symbol(Ssec, alpha)
|
||||||
|
Pe = float(np.sum((Xsec - Xhat) ** 2))
|
||||||
|
|
||||||
|
if Pe > T_b:
|
||||||
|
return alpha
|
||||||
|
|
||||||
|
iters = 0
|
||||||
|
while iters < MAX_ALPHA_ITERS:
|
||||||
|
iters += 1
|
||||||
|
alpha_next = alpha + 1
|
||||||
|
if alpha_next > alpha_max:
|
||||||
|
break
|
||||||
|
|
||||||
|
Ssec = _quantize_symbol(Xsec, alpha_next)
|
||||||
|
Xhat = _dequantize_symbol(Ssec, alpha_next)
|
||||||
|
Pe_next = float(np.sum((Xsec - Xhat) ** 2))
|
||||||
|
|
||||||
|
if Pe_next > T_b:
|
||||||
|
break
|
||||||
|
|
||||||
|
alpha = alpha_next
|
||||||
|
Pe = Pe_next
|
||||||
|
|
||||||
|
return alpha
|
||||||
|
|
||||||
|
|
||||||
|
def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Enforce neighbor constraint |alpha[b] - alpha[b-1]| <= max_diff by clamping.
|
||||||
|
|
||||||
|
Uses a forward pass and a backward pass to reduce drift.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
alpha : np.ndarray
|
||||||
|
Alpha vector, shape (NB,).
|
||||||
|
max_diff : int
|
||||||
|
Maximum allowed absolute difference between adjacent bands.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
np.ndarray
|
||||||
|
Clamped alpha vector, int64, shape (NB,).
|
||||||
|
"""
|
||||||
|
a = np.asarray(alpha, dtype=np.int64).copy()
|
||||||
|
nb = a.shape[0]
|
||||||
|
|
||||||
|
for b in range(1, nb):
|
||||||
|
lo = int(a[b - 1] - max_diff)
|
||||||
|
hi = int(a[b - 1] + max_diff)
|
||||||
|
if a[b] < lo:
|
||||||
|
a[b] = lo
|
||||||
|
elif a[b] > hi:
|
||||||
|
a[b] = hi
|
||||||
|
|
||||||
|
for b in range(nb - 2, -1, -1):
|
||||||
|
lo = int(a[b + 1] - max_diff)
|
||||||
|
hi = int(a[b + 1] + max_diff)
|
||||||
|
if a[b] < lo:
|
||||||
|
a[b] = lo
|
||||||
|
elif a[b] > hi:
|
||||||
|
a[b] = hi
|
||||||
|
|
||||||
|
return a
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def aac_quantizer(
|
||||||
|
frame_F: FrameChannelF,
|
||||||
|
frame_type: FrameType,
|
||||||
|
SMR: FloatArray,
|
||||||
|
) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]:
|
||||||
|
"""
|
||||||
|
AAC quantizer for one channel (Level 3).
|
||||||
|
|
||||||
|
Quantizes MDCT coefficients using band-wise scalefactors derived from SMR.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
frame_F : FrameChannelF
|
||||||
|
MDCT coefficients after TNS, one channel.
|
||||||
|
Shapes:
|
||||||
|
- Long frames: (1024,) or (1024, 1)
|
||||||
|
- ESH: (128, 8)
|
||||||
|
frame_type : FrameType
|
||||||
|
AAC frame type ("OLS", "LSS", "ESH", "LPS").
|
||||||
|
SMR : FloatArray
|
||||||
|
Signal-to-Mask Ratio per band.
|
||||||
|
Shapes:
|
||||||
|
- Long: (NB,) or (NB, 1)
|
||||||
|
- ESH: (NB, 8)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
S : QuantizedSymbols
|
||||||
|
Quantized symbols S(k), shape (1024, 1) for all frame types.
|
||||||
|
sfc : ScaleFactors
|
||||||
|
DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1).
|
||||||
|
Shapes:
|
||||||
|
- Long: (NB, 1)
|
||||||
|
- ESH: (NB, 8)
|
||||||
|
G : GlobalGain
|
||||||
|
Global gain G = alpha(0).
|
||||||
|
- Long: scalar float
|
||||||
|
- ESH: array shape (1, 8)
|
||||||
|
"""
|
||||||
|
bands = _band_slices(frame_type)
|
||||||
|
NB = len(bands)
|
||||||
|
|
||||||
|
X = np.asarray(frame_F, dtype=np.float64)
|
||||||
|
SMR = np.asarray(SMR, dtype=np.float64)
|
||||||
|
|
||||||
|
if frame_type == "ESH":
|
||||||
|
if X.shape != (128, 8):
|
||||||
|
raise ValueError("For ESH, frame_F must have shape (128, 8).")
|
||||||
|
if SMR.shape != (NB, 8):
|
||||||
|
raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")
|
||||||
|
|
||||||
|
S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
|
||||||
|
sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)
|
||||||
|
G_arr = np.zeros((1, 8), dtype=np.int64)
|
||||||
|
|
||||||
|
for j in range(8):
|
||||||
|
Xj = X[:, j].reshape(128)
|
||||||
|
SMRj = SMR[:, j].reshape(NB)
|
||||||
|
|
||||||
|
T = _threshold_T_from_SMR(Xj, SMRj, bands)
|
||||||
|
|
||||||
|
alpha0 = _alpha_initial_from_frame_max(Xj)
|
||||||
|
alpha = np.full((NB,), alpha0, dtype=np.int64)
|
||||||
|
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
alpha[b] = _best_alpha_for_band(
|
||||||
|
X=Xj, lo=lo, hi=hi, T_b=float(T[b]),
|
||||||
|
alpha0=int(alpha[b]),
|
||||||
|
alpha_min=-4096,
|
||||||
|
alpha_max=4096,
|
||||||
|
)
|
||||||
|
|
||||||
|
alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)
|
||||||
|
|
||||||
|
G_arr[0, j] = int(alpha[0])
|
||||||
|
sfc[0, j] = int(alpha[0])
|
||||||
|
for b in range(1, NB):
|
||||||
|
sfc[b, j] = int(alpha[b] - alpha[b - 1])
|
||||||
|
|
||||||
|
Sj = np.zeros((128,), dtype=np.int64)
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b]))
|
||||||
|
|
||||||
|
# Place this subframe in the packed output (column-major subframe layout)
|
||||||
|
S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj
|
||||||
|
|
||||||
|
return S_out, sfc, G_arr.astype(np.float64)
|
||||||
|
|
||||||
|
# Long frames
|
||||||
|
if X.shape == (1024,):
|
||||||
|
Xv = X
|
||||||
|
elif X.shape == (1024, 1):
|
||||||
|
Xv = X[:, 0]
|
||||||
|
else:
|
||||||
|
raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).")
|
||||||
|
|
||||||
|
if SMR.shape == (NB,):
|
||||||
|
SMRv = SMR
|
||||||
|
elif SMR.shape == (NB, 1):
|
||||||
|
SMRv = SMR[:, 0]
|
||||||
|
else:
|
||||||
|
raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")
|
||||||
|
|
||||||
|
T = _threshold_T_from_SMR(Xv, SMRv, bands)
|
||||||
|
|
||||||
|
alpha0 = _alpha_initial_from_frame_max(Xv)
|
||||||
|
alpha = np.full((NB,), alpha0, dtype=np.int64)
|
||||||
|
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
alpha[b] = _best_alpha_for_band(
|
||||||
|
X=Xv, lo=lo, hi=hi, T_b=float(T[b]),
|
||||||
|
alpha0=int(alpha[b]),
|
||||||
|
alpha_min=-4096,
|
||||||
|
alpha_max=4096,
|
||||||
|
)
|
||||||
|
|
||||||
|
alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)
|
||||||
|
|
||||||
|
sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64)
|
||||||
|
sfc[0, 0] = int(alpha[0])
|
||||||
|
for b in range(1, NB):
|
||||||
|
sfc[b, 0] = int(alpha[b] - alpha[b - 1])
|
||||||
|
|
||||||
|
G: float = float(alpha[0])
|
||||||
|
|
||||||
|
S_vec = np.zeros((1024,), dtype=np.int64)
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b]))
|
||||||
|
|
||||||
|
return S_vec.reshape(1024, 1), sfc, G
|
||||||
|
|
||||||
|
|
||||||
|
def aac_i_quantizer(
|
||||||
|
S: QuantizedSymbols,
|
||||||
|
sfc: ScaleFactors,
|
||||||
|
G: GlobalGain,
|
||||||
|
frame_type: FrameType,
|
||||||
|
) -> FrameChannelF:
|
||||||
|
"""
|
||||||
|
Inverse quantizer (iQuantizer) for one channel.
|
||||||
|
|
||||||
|
Reconstructs MDCT coefficients from quantized symbols and DPCM scalefactors.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
S : QuantizedSymbols
|
||||||
|
Quantized symbols, shape (1024, 1) (or any array with 1024 elements).
|
||||||
|
sfc : ScaleFactors
|
||||||
|
DPCM-coded scalefactors.
|
||||||
|
Shapes:
|
||||||
|
- Long: (NB, 1)
|
||||||
|
- ESH: (NB, 8)
|
||||||
|
G : GlobalGain
|
||||||
|
Global gain (not strictly required if sfc includes sfc(0)=alpha(0)).
|
||||||
|
Present for API compatibility with the assignment.
|
||||||
|
frame_type : FrameType
|
||||||
|
AAC frame type.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FrameChannelF
|
||||||
|
Reconstructed MDCT coefficients:
|
||||||
|
- ESH: (128, 8)
|
||||||
|
- Long: (1024, 1)
|
||||||
|
"""
|
||||||
|
bands = _band_slices(frame_type)
|
||||||
|
NB = len(bands)
|
||||||
|
|
||||||
|
S_flat = np.asarray(S, dtype=np.int64).reshape(-1)
|
||||||
|
if S_flat.shape[0] != 1024:
|
||||||
|
raise ValueError("S must contain 1024 symbols.")
|
||||||
|
|
||||||
|
if frame_type == "ESH":
|
||||||
|
sfc = np.asarray(sfc, dtype=np.int64)
|
||||||
|
if sfc.shape != (NB, 8):
|
||||||
|
raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).")
|
||||||
|
|
||||||
|
S_128x8 = _esh_unpack_from_1024(S_flat)
|
||||||
|
|
||||||
|
Xrec = np.zeros((128, 8), dtype=np.float64)
|
||||||
|
|
||||||
|
for j in range(8):
|
||||||
|
alpha = np.zeros((NB,), dtype=np.int64)
|
||||||
|
alpha[0] = int(sfc[0, j])
|
||||||
|
for b in range(1, NB):
|
||||||
|
alpha[b] = int(alpha[b - 1] + sfc[b, j])
|
||||||
|
|
||||||
|
Xj = np.zeros((128,), dtype=np.float64)
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
Xj[lo : hi + 1] = _dequantize_symbol(S_128x8[lo : hi + 1, j].astype(np.int64), float(alpha[b]))
|
||||||
|
|
||||||
|
Xrec[:, j] = Xj
|
||||||
|
|
||||||
|
return Xrec
|
||||||
|
|
||||||
|
sfc = np.asarray(sfc, dtype=np.int64)
|
||||||
|
if sfc.shape != (NB, 1):
|
||||||
|
raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).")
|
||||||
|
|
||||||
|
alpha = np.zeros((NB,), dtype=np.int64)
|
||||||
|
alpha[0] = int(sfc[0, 0])
|
||||||
|
for b in range(1, NB):
|
||||||
|
alpha[b] = int(alpha[b - 1] + sfc[b, 0])
|
||||||
|
|
||||||
|
Xrec = np.zeros((1024,), dtype=np.float64)
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
Xrec[lo : hi + 1] = _dequantize_symbol(S_flat[lo : hi + 1], float(alpha[b]))
|
||||||
|
|
||||||
|
return Xrec.reshape(1024, 1)
|
||||||
@ -176,7 +176,7 @@ def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
|
|||||||
# Public Function prototypes
|
# Public Function prototypes
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
|
def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
|
||||||
"""
|
"""
|
||||||
Sequence Segmentation Control (SSC).
|
Sequence Segmentation Control (SSC).
|
||||||
|
|
||||||
|
|||||||
@ -30,9 +30,6 @@ from __future__ import annotations
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from scipy.io import loadmat
|
|
||||||
|
|
||||||
from core.aac_utils import load_b219_tables
|
from core.aac_utils import load_b219_tables
|
||||||
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
|
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
@ -377,7 +374,9 @@ def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]:
|
|||||||
sw = _compute_sw(x)
|
sw = _compute_sw(x)
|
||||||
|
|
||||||
eps = 1e-12
|
eps = 1e-12
|
||||||
xw = np.where(sw > eps, x / sw, 0.0)
|
xw = np.zeros_like(x, dtype=np.float64)
|
||||||
|
mask = sw > eps
|
||||||
|
np.divide(x, sw, out=xw, where=mask)
|
||||||
|
|
||||||
a = _lpc_coeffs(xw, PRED_ORDER)
|
a = _lpc_coeffs(xw, PRED_ORDER)
|
||||||
a_q = _quantize_coeffs(a)
|
a_q = _quantize_coeffs(a)
|
||||||
|
|||||||
@ -212,6 +212,42 @@ BandValueArray: TypeAlias = FloatArray
|
|||||||
Per-band psychoacoustic values (e.g. Bark position, thresholds).
|
Per-band psychoacoustic values (e.g. Bark position, thresholds).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Quantizer-related semantic aliases
|
||||||
|
|
||||||
|
QuantizedSymbols: TypeAlias = NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
Quantized MDCT symbols S(k).
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols).
|
||||||
|
"""
|
||||||
|
|
||||||
|
ScaleFactors: TypeAlias = NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1).
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Long frames: (NB, 1)
|
||||||
|
- ESH frames: (NB, 8)
|
||||||
|
"""
|
||||||
|
|
||||||
|
GlobalGain: TypeAlias = float | NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
Global gain G = alpha(0).
|
||||||
|
|
||||||
|
- Long frames: scalar float
|
||||||
|
- ESH frames: array shape (1, 8)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Huffman semantic aliases
|
||||||
|
|
||||||
|
HuffmanBitstream: TypeAlias = str
|
||||||
|
"""Huffman-coded bitstream stored as a string of '0'/'1'."""
|
||||||
|
|
||||||
|
HuffmanCodebook: TypeAlias = int
|
||||||
|
"""Huffman codebook id (e.g., 0..11)."""
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Level 1 AAC sequence payload types
|
# Level 1 AAC sequence payload types
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -299,3 +335,77 @@ Level 2 adds:
|
|||||||
and stores:
|
and stores:
|
||||||
- per-channel "frame_F" after applying TNS.
|
- per-channel "frame_F" after applying TNS.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 3 AAC sequence payload types (Quantizer + Huffman)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AACChannelFrameF3(TypedDict):
|
||||||
|
"""
|
||||||
|
Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3).
|
||||||
|
|
||||||
|
Keys
|
||||||
|
----
|
||||||
|
tns_coeffs:
|
||||||
|
Quantized TNS predictor coefficients for ONE channel.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (PRED_ORDER, 8)
|
||||||
|
- else: (PRED_ORDER, 1)
|
||||||
|
|
||||||
|
T:
|
||||||
|
Psychoacoustic thresholds per band.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (NB, 8)
|
||||||
|
- else: (NB, 1)
|
||||||
|
Note: Stored for completeness / debugging; not entropy-coded.
|
||||||
|
|
||||||
|
G:
|
||||||
|
Quantized global gains.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (1, 8) (one per short subframe)
|
||||||
|
- else: scalar (or compatible np scalar)
|
||||||
|
|
||||||
|
sfc:
|
||||||
|
Huffman-coded scalefactor differences (DPCM sequence).
|
||||||
|
|
||||||
|
stream:
|
||||||
|
Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols).
|
||||||
|
|
||||||
|
codebook:
|
||||||
|
Huffman codebook id used for MDCT symbols (stream).
|
||||||
|
(Scalefactors typically use fixed codebook 11 and do not need to store it.)
|
||||||
|
"""
|
||||||
|
tns_coeffs: TnsCoeffs
|
||||||
|
T: FloatArray
|
||||||
|
G: FloatArray | float
|
||||||
|
sfc: HuffmanBitstream
|
||||||
|
stream: HuffmanBitstream
|
||||||
|
codebook: HuffmanCodebook
|
||||||
|
|
||||||
|
|
||||||
|
class AACSeq3Frame(TypedDict):
|
||||||
|
"""
|
||||||
|
One frame dictionary element of aac_seq_3 (Level 3).
|
||||||
|
"""
|
||||||
|
frame_type: FrameType
|
||||||
|
win_type: WinType
|
||||||
|
chl: AACChannelFrameF3
|
||||||
|
chr: AACChannelFrameF3
|
||||||
|
|
||||||
|
|
||||||
|
AACSeq3: TypeAlias = List[AACSeq3Frame]
|
||||||
|
"""
|
||||||
|
AAC sequence for Level 3:
|
||||||
|
List of length K (K = number of frames).
|
||||||
|
|
||||||
|
Each element is a dict with keys:
|
||||||
|
- "frame_type", "win_type", "chl", "chr"
|
||||||
|
|
||||||
|
Level 3 adds (per channel):
|
||||||
|
- "tns_coeffs"
|
||||||
|
- "T" thresholds (not entropy-coded)
|
||||||
|
- "G" global gain(s)
|
||||||
|
- "sfc" Huffman-coded scalefactor differences
|
||||||
|
- "stream" Huffman-coded MDCT quantized symbols
|
||||||
|
- "codebook" Huffman codebook for MDCT symbols
|
||||||
|
"""
|
||||||
|
|||||||
@ -19,10 +19,10 @@ import numpy as np
|
|||||||
import pytest
|
import pytest
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
|
||||||
from core.aac_coder import aac_coder_1, aac_coder_2, aac_read_wav_stereo_48k
|
from core.aac_coder import aac_coder_1, aac_coder_2, aac_coder_3, aac_read_wav_stereo_48k
|
||||||
from core.aac_decoder import aac_decoder_1, aac_decoder_2, aac_remove_padding
|
from core.aac_decoder import aac_decoder_1, aac_decoder_2, aac_decoder_3, aac_remove_padding
|
||||||
from core.aac_types import *
|
|
||||||
from core.aac_utils import snr_db
|
from core.aac_utils import snr_db
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
|
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
|
||||||
@ -223,3 +223,152 @@ def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> No
|
|||||||
|
|
||||||
snr = snr_db(x_ref, x_hat)
|
snr = snr_db(x_ref, x_hat)
|
||||||
assert snr > 80
|
assert snr > 80
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 3 tests (Quantizer + Huffman)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def wav_in_path() -> Path:
|
||||||
|
"""
|
||||||
|
Input WAV used for end-to-end tests.
|
||||||
|
|
||||||
|
This should point to the provided test audio under material/.
|
||||||
|
Adjust this path if your project layout differs.
|
||||||
|
"""
|
||||||
|
# Typical layout in this project:
|
||||||
|
# source/material/LicorDeCalandraca.wav
|
||||||
|
return Path(__file__).resolve().parents[2] / "material" / "LicorDeCalandraca.wav"
|
||||||
|
|
||||||
|
|
||||||
|
def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None:
|
||||||
|
"""
|
||||||
|
Validate Level-3 per-frame schema (keys + basic types only).
|
||||||
|
"""
|
||||||
|
assert "frame_type" in frame
|
||||||
|
assert "win_type" in frame
|
||||||
|
assert "chl" in frame
|
||||||
|
assert "chr" in frame
|
||||||
|
|
||||||
|
for ch_key in ("chl", "chr"):
|
||||||
|
ch = frame[ch_key] # type: ignore[index]
|
||||||
|
assert "tns_coeffs" in ch
|
||||||
|
assert "T" in ch
|
||||||
|
assert "G" in ch
|
||||||
|
assert "sfc" in ch
|
||||||
|
assert "stream" in ch
|
||||||
|
assert "codebook" in ch
|
||||||
|
|
||||||
|
assert isinstance(ch["sfc"], str)
|
||||||
|
assert isinstance(ch["stream"], str)
|
||||||
|
assert isinstance(ch["codebook"], int)
|
||||||
|
|
||||||
|
# Arrays: only check they are numpy arrays with expected dtype categories.
|
||||||
|
assert isinstance(ch["tns_coeffs"], np.ndarray)
|
||||||
|
assert isinstance(ch["T"], np.ndarray)
|
||||||
|
|
||||||
|
# Global gain: long frames may be scalar float, ESH may be ndarray
|
||||||
|
assert np.isscalar(ch["G"]) or isinstance(ch["G"], np.ndarray)
|
||||||
|
|
||||||
|
|
||||||
|
def test_aac_coder_3_seq_schema_and_shapes(wav_in_path: Path, tmp_path: Path) -> None:
|
||||||
|
"""
|
||||||
|
Contract test:
|
||||||
|
- aac_coder_3 returns AACSeq3
|
||||||
|
- Per-frame keys exist and types are consistent
|
||||||
|
- Basic shape expectations hold for ESH vs non-ESH cases
|
||||||
|
|
||||||
|
Note:
|
||||||
|
This test uses a short excerpt (a few frames) to keep runtime bounded.
|
||||||
|
"""
|
||||||
|
# Use only a few frames to avoid long runtimes in the quantizer loop.
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
n_frames = 4
|
||||||
|
n_samples = win + (n_frames - 1) * hop
|
||||||
|
|
||||||
|
x, fs = aac_read_wav_stereo_48k(wav_in_path)
|
||||||
|
x_short = x[:n_samples, :]
|
||||||
|
|
||||||
|
short_wav = tmp_path / "input_short.wav"
|
||||||
|
sf.write(str(short_wav), x_short, fs)
|
||||||
|
|
||||||
|
aac_seq_3: AACSeq3 = aac_coder_3(short_wav)
|
||||||
|
|
||||||
|
assert isinstance(aac_seq_3, list)
|
||||||
|
assert len(aac_seq_3) > 0
|
||||||
|
|
||||||
|
for fr in aac_seq_3:
|
||||||
|
_assert_level3_frame_schema(fr)
|
||||||
|
|
||||||
|
frame_type = fr["frame_type"]
|
||||||
|
for ch_key in ("chl", "chr"):
|
||||||
|
ch = fr[ch_key] # type: ignore[index]
|
||||||
|
|
||||||
|
tns = np.asarray(ch["tns_coeffs"])
|
||||||
|
if frame_type == "ESH":
|
||||||
|
assert tns.ndim == 2
|
||||||
|
assert tns.shape[1] == 8
|
||||||
|
else:
|
||||||
|
assert tns.ndim == 2
|
||||||
|
assert tns.shape[1] == 1
|
||||||
|
|
||||||
|
T = np.asarray(ch["T"])
|
||||||
|
if frame_type == "ESH":
|
||||||
|
assert T.ndim == 2
|
||||||
|
assert T.shape[1] == 8
|
||||||
|
else:
|
||||||
|
assert T.ndim == 2
|
||||||
|
assert T.shape[1] == 1
|
||||||
|
|
||||||
|
G = ch["G"]
|
||||||
|
if frame_type == "ESH":
|
||||||
|
assert isinstance(G, np.ndarray)
|
||||||
|
assert np.asarray(G).shape == (1, 8)
|
||||||
|
else:
|
||||||
|
assert np.isscalar(G)
|
||||||
|
|
||||||
|
assert isinstance(ch["sfc"], str)
|
||||||
|
assert isinstance(ch["stream"], str)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_end_to_end_level_3_high_snr(wav_in_path: Path, tmp_path: Path) -> None:
|
||||||
|
"""
|
||||||
|
End-to-end test for Level 3 (Quantizer + Huffman):
|
||||||
|
|
||||||
|
coder_3 -> decoder_3 should reconstruct a waveform with acceptable SNR.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
- Level 3 includes quantization, so SNR is expected to be lower than Level 1/2.
|
||||||
|
- We intentionally use a short excerpt (few frames) to keep runtime bounded,
|
||||||
|
since the reference quantizer implementation is computationally expensive.
|
||||||
|
"""
|
||||||
|
# Use only a few frames to avoid long runtimes.
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
n_frames = 4
|
||||||
|
n_samples = win + (n_frames - 1) * hop
|
||||||
|
|
||||||
|
x_ref, fs = aac_read_wav_stereo_48k(wav_in_path)
|
||||||
|
x_short = x_ref[:n_samples, :]
|
||||||
|
|
||||||
|
short_wav = tmp_path / "input_short_l3.wav"
|
||||||
|
sf.write(str(short_wav), x_short, fs)
|
||||||
|
|
||||||
|
out_wav = tmp_path / "decoded_level3.wav"
|
||||||
|
|
||||||
|
aac_seq_3: AACSeq3 = aac_coder_3(short_wav)
|
||||||
|
y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav)
|
||||||
|
|
||||||
|
# Align lengths defensively (padding removal may differ by a few samples)
|
||||||
|
n = min(x_short.shape[0], y_hat.shape[0])
|
||||||
|
x2 = x_short[:n, :]
|
||||||
|
y2 = y_hat[:n, :]
|
||||||
|
|
||||||
|
s = snr_db(x2, y2)
|
||||||
|
|
||||||
|
# Conservative threshold: Level 3 is lossy by design.
|
||||||
|
assert s > 10.0
|
||||||
139
source/core/tests/test_huffman.py
Normal file
139
source/core/tests/test_huffman.py
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Huffman Wrapper Tests (Level 3)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Contract tests for the Huffman coding stage, using the provided
|
||||||
|
# Huffman utilities (material/huff_utils.py).
|
||||||
|
#
|
||||||
|
# The Huffman encoder/decoder itself is GIVEN by the assignment and
|
||||||
|
# is not re-implemented here. These tests only verify that:
|
||||||
|
#
|
||||||
|
# - The wrapper functions (aac_encode_huff / aac_decode_huff) expose
|
||||||
|
# the API described in the assignment.
|
||||||
|
# - Forced codebook selection works as expected (e.g. scalefactors).
|
||||||
|
# - Tuple-based Huffman coding semantics are respected.
|
||||||
|
#
|
||||||
|
# Notes on tuple coding:
|
||||||
|
# Huffman coding operates on tuples of symbols. As a result,
|
||||||
|
# decode(encode(x)) may return extra trailing symbols due to padding.
|
||||||
|
# The AAC decoder always knows the true section length (from band limits)
|
||||||
|
# and truncates accordingly. Therefore, these tests only enforce that
|
||||||
|
# the decoded PREFIX matches the original data.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from core.aac_huffman import aac_encode_huff, aac_decode_huff
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Fixtures
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def huff_LUT():
|
||||||
|
"""
|
||||||
|
Load Huffman Look-Up Tables (LUTs) once per test module.
|
||||||
|
|
||||||
|
The LUTs are provided by the assignment (huffCodebooks.mat) via
|
||||||
|
material.huff_utils.load_LUT().
|
||||||
|
"""
|
||||||
|
return load_LUT()
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Roundtrip (prefix) tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"coeff_sec",
|
||||||
|
[
|
||||||
|
np.array([1, -1, 2, -2, 0, 0, 3], dtype=np.int64),
|
||||||
|
np.array([0, 0, 0, 0], dtype=np.int64),
|
||||||
|
np.array([5], dtype=np.int64),
|
||||||
|
np.array([-3, -3, -3, -3], dtype=np.int64),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_huffman_roundtrip_prefix_matches(
|
||||||
|
coeff_sec: np.ndarray,
|
||||||
|
huff_LUT,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Contract test for Huffman encode/decode.
|
||||||
|
|
||||||
|
Guarantees:
|
||||||
|
- Encoding followed by decoding does not crash.
|
||||||
|
- The decoded output has at least as many symbols as the input.
|
||||||
|
- The prefix of the decoded output matches the original coefficients.
|
||||||
|
|
||||||
|
Rationale:
|
||||||
|
Huffman tuple coding may introduce padding, so exact length equality
|
||||||
|
is NOT required or expected.
|
||||||
|
"""
|
||||||
|
huff_sec, cb = aac_encode_huff(coeff_sec, huff_LUT)
|
||||||
|
dec = aac_decode_huff(huff_sec, cb, huff_LUT)
|
||||||
|
|
||||||
|
if cb == 0:
|
||||||
|
# Codebook 0 represents an all-zero section.
|
||||||
|
assert np.all(coeff_sec == 0)
|
||||||
|
assert dec.size == 0
|
||||||
|
return
|
||||||
|
|
||||||
|
assert dec.size >= coeff_sec.size
|
||||||
|
np.testing.assert_array_equal(dec[: coeff_sec.size], coeff_sec)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Forced codebook tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_huffman_force_codebook_returns_requested_codebook(huff_LUT) -> None:
|
||||||
|
"""
|
||||||
|
Verify forced codebook selection.
|
||||||
|
|
||||||
|
According to the assignment, scalefactors must be encoded using
|
||||||
|
Huffman codebook 11. This test checks that:
|
||||||
|
- The requested codebook is actually used.
|
||||||
|
- The decoded prefix matches the original scalefactors.
|
||||||
|
"""
|
||||||
|
scalefactors = np.array([10, -2, 1, 0, -1, 3], dtype=np.int64)
|
||||||
|
|
||||||
|
huff_sec, cb = aac_encode_huff(
|
||||||
|
scalefactors,
|
||||||
|
huff_LUT,
|
||||||
|
force_codebook=11,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert cb == 11
|
||||||
|
assert isinstance(huff_sec, str)
|
||||||
|
|
||||||
|
dec = aac_decode_huff(huff_sec, cb, huff_LUT)
|
||||||
|
|
||||||
|
assert dec.size >= scalefactors.size
|
||||||
|
np.testing.assert_array_equal(dec[: scalefactors.size], scalefactors)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Error handling
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_huffman_invalid_codebook_raises(huff_LUT) -> None:
|
||||||
|
"""
|
||||||
|
Decoding with an invalid Huffman codebook index must raise an error.
|
||||||
|
"""
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
_ = aac_decode_huff(
|
||||||
|
huff_sec="010101",
|
||||||
|
huff_codebook=99,
|
||||||
|
huff_LUT=huff_LUT,
|
||||||
|
)
|
||||||
395
source/core/tests/test_quantizer.py
Normal file
395
source/core/tests/test_quantizer.py
Normal file
@ -0,0 +1,395 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Quantizer Tests
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Tests for Quantizer / iQuantizer module.
|
||||||
|
#
|
||||||
|
# These tests are deliberately "contract-oriented":
|
||||||
|
# - They validate shapes, dtypes and invariants that downstream stages
|
||||||
|
# (e.g., Huffman coding) depend on.
|
||||||
|
# - They do not attempt to validate psychoacoustic optimality (that would
|
||||||
|
# require a reference implementation and careful numerical baselines).
|
||||||
|
#
|
||||||
|
# Validates:
|
||||||
|
# - I/O shapes for long and ESH modes
|
||||||
|
# - DPCM scalefactor coding consistency (sfc)
|
||||||
|
# - ESH packing order of quantized symbols (128x8 <-> 1024)
|
||||||
|
# - Edge cases (zeros / near silence)
|
||||||
|
# - Sanity (finite outputs, no extreme numerical blow-up)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from core.aac_quantizer import aac_quantizer, aac_i_quantizer
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from core.aac_types import FrameType
|
||||||
|
|
||||||
|
|
||||||
|
# Small epsilon to avoid divisions by zero in sanity ratios
|
||||||
|
EPS = 1e-12
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helper utilities
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _nbands(frame_type: FrameType) -> int:
|
||||||
|
"""
|
||||||
|
Return number of scalefactor bands for the given frame type.
|
||||||
|
|
||||||
|
This is derived from TableB219 (psycho tables) via aac_utils helpers,
|
||||||
|
so the tests remain consistent even if tables are updated.
|
||||||
|
"""
|
||||||
|
table, _nfft = get_table(frame_type)
|
||||||
|
wlow, _whigh, _bval, _qthr = band_limits(table)
|
||||||
|
return int(len(wlow))
|
||||||
|
|
||||||
|
|
||||||
|
def _make_smr(frame_type: FrameType, seed: int = 0) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Create a strictly positive SMR array with the correct shape.
|
||||||
|
|
||||||
|
These tests are not about psycho correctness; they only need SMR > 0
|
||||||
|
to avoid division by zero and to make the quantizer's threshold logic
|
||||||
|
behave deterministically.
|
||||||
|
"""
|
||||||
|
rng = np.random.default_rng(seed)
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
|
||||||
|
if frame_type == "ESH":
|
||||||
|
# ESH uses 8 short windows, thus SMR has 8 columns.
|
||||||
|
return (1.0 + np.abs(rng.normal(size=(NB, 8)))).astype(np.float64)
|
||||||
|
|
||||||
|
# Long frames: use a column vector (NB, 1).
|
||||||
|
return (1.0 + np.abs(rng.normal(size=(NB, 1)))).astype(np.float64)
|
||||||
|
|
||||||
|
|
||||||
|
def _reconstruct_alpha_from_sfc(sfc: np.ndarray) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Reconstruct alpha(b) from DPCM-coded scalefactors sfc(b).
|
||||||
|
|
||||||
|
By definition in the assignment:
|
||||||
|
sfc(0) = alpha(0)
|
||||||
|
alpha(b) = alpha(b-1) + sfc(b) for b > 0
|
||||||
|
|
||||||
|
This reconstruction is useful to validate the internal consistency
|
||||||
|
of the produced scalefactor information.
|
||||||
|
"""
|
||||||
|
sfc = np.asarray(sfc, dtype=np.int64)
|
||||||
|
|
||||||
|
# Long frames: sfc shape (NB, 1)
|
||||||
|
if sfc.ndim == 2 and sfc.shape[1] == 1:
|
||||||
|
NB = sfc.shape[0]
|
||||||
|
alpha = np.zeros((NB,), dtype=np.int64)
|
||||||
|
alpha[0] = int(sfc[0, 0])
|
||||||
|
for b in range(1, NB):
|
||||||
|
alpha[b] = int(alpha[b - 1] + sfc[b, 0])
|
||||||
|
return alpha
|
||||||
|
|
||||||
|
# ESH frames: sfc shape (NB, 8)
|
||||||
|
if sfc.ndim == 2 and sfc.shape[1] == 8:
|
||||||
|
NB = sfc.shape[0]
|
||||||
|
alpha = np.zeros((NB, 8), dtype=np.int64)
|
||||||
|
alpha[0, :] = sfc[0, :]
|
||||||
|
for b in range(1, NB):
|
||||||
|
alpha[b, :] = alpha[b - 1, :] + sfc[b, :]
|
||||||
|
return alpha
|
||||||
|
|
||||||
|
raise ValueError("Unsupported sfc shape.")
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Shape / contract tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_quantizer_shapes_long(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Contract test for long frames:
|
||||||
|
- Input: MDCT coefficients shape (1024, 1)
|
||||||
|
- Output S: always (1024, 1)
|
||||||
|
- Output sfc: (NB, 1)
|
||||||
|
- G: scalar float for long frames
|
||||||
|
- iQuantizer output: (1024, 1)
|
||||||
|
"""
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
rng = np.random.default_rng(1)
|
||||||
|
|
||||||
|
X = rng.normal(size=(1024, 1)).astype(np.float64)
|
||||||
|
SMR = _make_smr(frame_type, seed=2)
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
|
||||||
|
assert S.shape == (1024, 1)
|
||||||
|
assert sfc.shape == (NB, 1)
|
||||||
|
assert isinstance(G, (float, np.floating))
|
||||||
|
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
assert Xhat.shape == (1024, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_quantizer_shapes_esh() -> None:
|
||||||
|
"""
|
||||||
|
Contract test for ESH frames:
|
||||||
|
- Input: MDCT coefficients shape (128, 8)
|
||||||
|
- Output S: packed to (1024, 1)
|
||||||
|
- Output sfc: (NB, 8)
|
||||||
|
- G: array shape (1, 8) for ESH (one gain per short window)
|
||||||
|
- iQuantizer output: (128, 8)
|
||||||
|
"""
|
||||||
|
frame_type: FrameType = "ESH"
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
rng = np.random.default_rng(3)
|
||||||
|
|
||||||
|
X = rng.normal(size=(128, 8)).astype(np.float64)
|
||||||
|
SMR = _make_smr(frame_type, seed=4)
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
|
||||||
|
assert S.shape == (1024, 1)
|
||||||
|
assert sfc.shape == (NB, 8)
|
||||||
|
assert isinstance(G, np.ndarray)
|
||||||
|
assert G.shape == (1, 8)
|
||||||
|
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
assert Xhat.shape == (128, 8)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# DPCM consistency tests
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_quantizer_dpcm_reconstructs_alpha_long(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Verify the DPCM coding rule for long frames.
|
||||||
|
|
||||||
|
The quantizer returns:
|
||||||
|
sfc(0) = alpha(0)
|
||||||
|
sfc(b) = alpha(b) - alpha(b-1), b>0
|
||||||
|
|
||||||
|
Reconstruct alpha from sfc and check:
|
||||||
|
alpha(0) == sfc(0) == G
|
||||||
|
"""
|
||||||
|
rng = np.random.default_rng(5)
|
||||||
|
|
||||||
|
X = rng.normal(size=(1024, 1)).astype(np.float64)
|
||||||
|
SMR = _make_smr(frame_type, seed=6)
|
||||||
|
|
||||||
|
_S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
|
||||||
|
alpha = _reconstruct_alpha_from_sfc(sfc)
|
||||||
|
|
||||||
|
assert int(sfc[0, 0]) == int(alpha[0])
|
||||||
|
assert float(alpha[0]) == float(G)
|
||||||
|
|
||||||
|
|
||||||
|
def test_quantizer_dpcm_reconstructs_alpha_esh() -> None:
|
||||||
|
"""
|
||||||
|
Verify the DPCM coding rule for ESH frames.
|
||||||
|
|
||||||
|
For each short window j:
|
||||||
|
sfc(0, j) = alpha(0, j) == G(0, j)
|
||||||
|
"""
|
||||||
|
frame_type: FrameType = "ESH"
|
||||||
|
rng = np.random.default_rng(7)
|
||||||
|
|
||||||
|
X = rng.normal(size=(128, 8)).astype(np.float64)
|
||||||
|
SMR = _make_smr(frame_type, seed=8)
|
||||||
|
|
||||||
|
_S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
|
||||||
|
alpha = _reconstruct_alpha_from_sfc(sfc)
|
||||||
|
|
||||||
|
assert np.all(alpha[0, :] == sfc[0, :])
|
||||||
|
assert np.all(alpha[0, :] == G.reshape(-1))
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# ESH packing order test
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_quantizer_esh_packing_order_matches_iquantizer_layout() -> None:
|
||||||
|
"""
|
||||||
|
Verify ESH packing order.
|
||||||
|
|
||||||
|
The quantizer outputs S in packed shape (1024, 1). The expected packing
|
||||||
|
is column-major concatenation of the 8 short subframes.
|
||||||
|
|
||||||
|
This test constructs a deterministic input where each subframe column
|
||||||
|
has a distinct constant value. After quantize+inverse-quantize, the
|
||||||
|
reconstructed columns should remain distinguishable in the same order.
|
||||||
|
|
||||||
|
This primarily tests ordering, not exact numerical values.
|
||||||
|
"""
|
||||||
|
frame_type: FrameType = "ESH"
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
|
||||||
|
# Create 8 distinct subframes: column j is constant (j+1)
|
||||||
|
X = np.zeros((128, 8), dtype=np.float64)
|
||||||
|
for j in range(8):
|
||||||
|
X[:, j] = float(j + 1)
|
||||||
|
|
||||||
|
# Use very large SMR so thresholds are permissive and alpha changes are
|
||||||
|
# minimal. This helps keep the ordering signal strong.
|
||||||
|
SMR = np.ones((NB, 8), dtype=np.float64) * 1e6
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
|
||||||
|
# The average magnitude per column must be increasing with the original order.
|
||||||
|
col_means = np.mean(Xhat, axis=0)
|
||||||
|
assert np.all(np.diff(col_means) > 0.0)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Edge cases: zeros and near-silence
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_quantizer_zero_input_long_is_finite(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Edge case: zero MDCT coefficients should not produce NaN/Inf.
|
||||||
|
|
||||||
|
We do not require identity here (quantizer is lossy), but we require
|
||||||
|
the pipeline to remain numerically safe and produce finite outputs.
|
||||||
|
"""
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
|
||||||
|
X = np.zeros((1024, 1), dtype=np.float64)
|
||||||
|
SMR = np.ones((NB, 1), dtype=np.float64)
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
assert np.isfinite(S).all()
|
||||||
|
assert np.isfinite(sfc).all()
|
||||||
|
assert isinstance(G, (float, np.floating))
|
||||||
|
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
assert np.isfinite(Xhat).all()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quantizer_zero_input_esh_is_finite() -> None:
|
||||||
|
"""
|
||||||
|
Edge case: same as above, for ESH mode.
|
||||||
|
"""
|
||||||
|
frame_type: FrameType = "ESH"
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
|
||||||
|
X = np.zeros((128, 8), dtype=np.float64)
|
||||||
|
SMR = np.ones((NB, 8), dtype=np.float64)
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
assert np.isfinite(S).all()
|
||||||
|
assert np.isfinite(sfc).all()
|
||||||
|
assert np.isfinite(G).all()
|
||||||
|
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
assert np.isfinite(Xhat).all()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_quantizer_near_silence_long_is_finite(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Edge case: extremely small values.
|
||||||
|
|
||||||
|
This stresses numerical guards (EPS usage) and ensures no invalid operations.
|
||||||
|
"""
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
|
||||||
|
X = (1e-15 * np.ones((1024, 1), dtype=np.float64))
|
||||||
|
SMR = np.ones((NB, 1), dtype=np.float64)
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
assert np.isfinite(S).all()
|
||||||
|
assert np.isfinite(sfc).all()
|
||||||
|
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
assert np.isfinite(Xhat).all()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quantizer_near_silence_esh_is_finite() -> None:
|
||||||
|
"""
|
||||||
|
Edge case: extremely small values, ESH mode.
|
||||||
|
"""
|
||||||
|
frame_type: FrameType = "ESH"
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
|
||||||
|
X = (1e-15 * np.ones((128, 8), dtype=np.float64))
|
||||||
|
SMR = np.ones((NB, 8), dtype=np.float64)
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
assert np.isfinite(S).all()
|
||||||
|
assert np.isfinite(sfc).all()
|
||||||
|
assert np.isfinite(G).all()
|
||||||
|
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
assert np.isfinite(Xhat).all()
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Sanity: avoid catastrophic numerical blow-up
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_quantizer_sanity_no_extreme_blowup_long(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Loose sanity guard.
|
||||||
|
|
||||||
|
The quantizer is lossy, but it should not produce reconstructions with
|
||||||
|
catastrophic peak/energy growth compared to the input.
|
||||||
|
"""
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
rng = np.random.default_rng(11)
|
||||||
|
|
||||||
|
X = rng.normal(size=(1024, 1)).astype(np.float64)
|
||||||
|
SMR = np.ones((NB, 1), dtype=np.float64) * 10.0
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
|
||||||
|
in_peak = float(np.max(np.abs(X)))
|
||||||
|
out_peak = float(np.max(np.abs(Xhat)))
|
||||||
|
peak_ratio = out_peak / (in_peak + EPS)
|
||||||
|
|
||||||
|
in_energy = float(np.sum(X * X))
|
||||||
|
out_energy = float(np.sum(Xhat * Xhat))
|
||||||
|
energy_ratio = out_energy / (in_energy + EPS)
|
||||||
|
|
||||||
|
# Very loose thresholds: only catch severe regressions.
|
||||||
|
assert peak_ratio < 100.0
|
||||||
|
assert energy_ratio < 1e4
|
||||||
|
|
||||||
|
|
||||||
|
def test_quantizer_sanity_no_extreme_blowup_esh() -> None:
|
||||||
|
"""
|
||||||
|
Same loose sanity guard for ESH mode.
|
||||||
|
"""
|
||||||
|
frame_type: FrameType = "ESH"
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
rng = np.random.default_rng(12)
|
||||||
|
|
||||||
|
X = rng.normal(size=(128, 8)).astype(np.float64)
|
||||||
|
SMR = np.ones((NB, 8), dtype=np.float64) * 10.0
|
||||||
|
|
||||||
|
S, sfc, G = aac_quantizer(X, frame_type, SMR)
|
||||||
|
Xhat = aac_i_quantizer(S, sfc, G, frame_type)
|
||||||
|
|
||||||
|
in_peak = float(np.max(np.abs(X)))
|
||||||
|
out_peak = float(np.max(np.abs(Xhat)))
|
||||||
|
peak_ratio = out_peak / (in_peak + EPS)
|
||||||
|
|
||||||
|
in_energy = float(np.sum(X * X))
|
||||||
|
out_energy = float(np.sum(Xhat * Xhat))
|
||||||
|
energy_ratio = out_energy / (in_energy + EPS)
|
||||||
|
|
||||||
|
assert peak_ratio < 100.0
|
||||||
|
assert energy_ratio < 1e4
|
||||||
@ -16,7 +16,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from core.aac_ssc import aac_SSC
|
from core.aac_ssc import aac_ssc
|
||||||
from core.aac_types import FrameT
|
from core.aac_types import FrameT
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -117,10 +117,10 @@ def test_ssc_fixed_cases_prev_lss_and_lps() -> None:
|
|||||||
|
|
||||||
next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
next_attack = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
||||||
|
|
||||||
out1 = aac_SSC(frame_t, next_attack, "LSS")
|
out1 = aac_ssc(frame_t, next_attack, "LSS")
|
||||||
assert out1 == "ESH"
|
assert out1 == "ESH"
|
||||||
|
|
||||||
out2 = aac_SSC(frame_t, next_attack, "LPS")
|
out2 = aac_ssc(frame_t, next_attack, "LPS")
|
||||||
assert out2 == "OLS"
|
assert out2 == "OLS"
|
||||||
|
|
||||||
|
|
||||||
@ -138,7 +138,7 @@ def test_prev_ols_next_not_esh_returns_ols() -> None:
|
|||||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||||
next_t = _next_frame_no_attack()
|
next_t = _next_frame_no_attack()
|
||||||
|
|
||||||
out = aac_SSC(frame_t, next_t, "OLS")
|
out = aac_ssc(frame_t, next_t, "OLS")
|
||||||
assert out == "OLS"
|
assert out == "OLS"
|
||||||
|
|
||||||
|
|
||||||
@ -151,7 +151,7 @@ def test_prev_ols_next_esh_both_channels_returns_lss() -> None:
|
|||||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||||
next_t = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
next_t = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
||||||
|
|
||||||
out = aac_SSC(frame_t, next_t, "OLS")
|
out = aac_ssc(frame_t, next_t, "OLS")
|
||||||
assert out == "LSS"
|
assert out == "LSS"
|
||||||
|
|
||||||
|
|
||||||
@ -165,11 +165,11 @@ def test_prev_ols_next_esh_one_channel_returns_lss() -> None:
|
|||||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||||
|
|
||||||
next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
|
next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
|
||||||
out1 = aac_SSC(frame_t, next1_t, "OLS")
|
out1 = aac_ssc(frame_t, next1_t, "OLS")
|
||||||
assert out1 == "LSS"
|
assert out1 == "LSS"
|
||||||
|
|
||||||
next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
|
next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
|
||||||
out2 = aac_SSC(frame_t, next2_t, "OLS")
|
out2 = aac_ssc(frame_t, next2_t, "OLS")
|
||||||
assert out2 == "LSS"
|
assert out2 == "LSS"
|
||||||
|
|
||||||
|
|
||||||
@ -182,7 +182,7 @@ def test_prev_esh_next_esh_both_channels_returns_esh() -> None:
|
|||||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||||
next_t = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
next_t = _next_frame_strong_attack(attack_left=True, attack_right=True)
|
||||||
|
|
||||||
out = aac_SSC(frame_t, next_t, "ESH")
|
out = aac_ssc(frame_t, next_t, "ESH")
|
||||||
assert out == "ESH"
|
assert out == "ESH"
|
||||||
|
|
||||||
|
|
||||||
@ -195,7 +195,7 @@ def test_prev_esh_next_not_esh_both_channels_returns_lps() -> None:
|
|||||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||||
next_t = _next_frame_no_attack()
|
next_t = _next_frame_no_attack()
|
||||||
|
|
||||||
out = aac_SSC(frame_t, next_t, "ESH")
|
out = aac_ssc(frame_t, next_t, "ESH")
|
||||||
assert out == "LPS"
|
assert out == "LPS"
|
||||||
|
|
||||||
|
|
||||||
@ -209,11 +209,11 @@ def test_prev_esh_next_esh_one_channel_merged_is_esh() -> None:
|
|||||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||||
|
|
||||||
next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
|
next1_t = _next_frame_strong_attack(attack_left=True, attack_right=False)
|
||||||
out1 = aac_SSC(frame_t, next1_t, "ESH")
|
out1 = aac_ssc(frame_t, next1_t, "ESH")
|
||||||
assert out1 == "ESH"
|
assert out1 == "ESH"
|
||||||
|
|
||||||
next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
|
next2_t = _next_frame_strong_attack(attack_left=False, attack_right=True)
|
||||||
out2 = aac_SSC(frame_t, next2_t, "ESH")
|
out2 = aac_ssc(frame_t, next2_t, "ESH")
|
||||||
assert out2 == "ESH"
|
assert out2 == "ESH"
|
||||||
|
|
||||||
|
|
||||||
@ -230,5 +230,5 @@ def test_threshold_s_must_exceed_1e_3() -> None:
|
|||||||
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
frame_t: FrameT = np.zeros((2048, 2), dtype=np.float64)
|
||||||
next_t = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01)
|
next_t = _next_frame_below_s_threshold(left=True, right=True, impulse_amp=0.01)
|
||||||
|
|
||||||
out = aac_SSC(frame_t, next_t, "OLS")
|
out = aac_ssc(frame_t, next_t, "OLS")
|
||||||
assert out == "OLS"
|
assert out == "OLS"
|
||||||
@ -26,6 +26,7 @@ from core.aac_configuration import PRED_ORDER, QUANT_MAX, QUANT_STEP
|
|||||||
from core.aac_tns import aac_tns, aac_i_tns
|
from core.aac_tns import aac_tns, aac_i_tns
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
|
|
||||||
|
EPS = 1e-12
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Helper utilities
|
# Helper utilities
|
||||||
@ -194,3 +195,132 @@ def test_tns_outputs_are_finite() -> None:
|
|||||||
out_esh, coeffs_esh = aac_tns(frame_F_esh, "ESH")
|
out_esh, coeffs_esh = aac_tns(frame_F_esh, "ESH")
|
||||||
assert np.isfinite(out_esh).all()
|
assert np.isfinite(out_esh).all()
|
||||||
assert np.isfinite(coeffs_esh).all()
|
assert np.isfinite(coeffs_esh).all()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_tns_zero_input_is_identity_long(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Edge case: zero MDCT coefficients should remain zero after TNS and iTNS.
|
||||||
|
This checks that no NaN/Inf appears and the pipeline is numerically safe.
|
||||||
|
"""
|
||||||
|
frame_F_in = np.zeros((1024, 1), dtype=np.float64)
|
||||||
|
|
||||||
|
frame_F_tns, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||||
|
assert np.isfinite(frame_F_tns).all()
|
||||||
|
assert np.isfinite(tns_coeffs).all()
|
||||||
|
assert np.all(frame_F_tns == 0.0)
|
||||||
|
|
||||||
|
frame_F_hat = aac_i_tns(frame_F_tns, frame_type, tns_coeffs)
|
||||||
|
assert np.isfinite(frame_F_hat).all()
|
||||||
|
assert np.all(frame_F_hat == 0.0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tns_zero_input_is_identity_esh() -> None:
|
||||||
|
"""
|
||||||
|
Edge case: zero MDCT coefficients should remain zero for ESH too.
|
||||||
|
"""
|
||||||
|
frame_F_in = np.zeros((128, 8), dtype=np.float64)
|
||||||
|
|
||||||
|
frame_F_tns, tns_coeffs = aac_tns(frame_F_in, "ESH")
|
||||||
|
assert np.isfinite(frame_F_tns).all()
|
||||||
|
assert np.isfinite(tns_coeffs).all()
|
||||||
|
assert np.all(frame_F_tns == 0.0)
|
||||||
|
|
||||||
|
frame_F_hat = aac_i_tns(frame_F_tns, "ESH", tns_coeffs)
|
||||||
|
assert np.isfinite(frame_F_hat).all()
|
||||||
|
assert np.all(frame_F_hat == 0.0)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_tns_near_silence_is_finite_and_roundtrips(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Edge case: extremely small values should not cause NaN/Inf,
|
||||||
|
and round-trip should remain close.
|
||||||
|
"""
|
||||||
|
frame_F_in = (1e-15 * np.ones((1024, 1), dtype=np.float64))
|
||||||
|
|
||||||
|
frame_F_tns, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||||
|
assert np.isfinite(frame_F_tns).all()
|
||||||
|
assert np.isfinite(tns_coeffs).all()
|
||||||
|
|
||||||
|
frame_F_hat = aac_i_tns(frame_F_tns, frame_type, tns_coeffs)
|
||||||
|
assert np.isfinite(frame_F_hat).all()
|
||||||
|
|
||||||
|
np.testing.assert_allclose(frame_F_hat, frame_F_in, rtol=1e-6, atol=1e-12)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tns_near_silence_esh_is_finite_and_roundtrips() -> None:
|
||||||
|
"""
|
||||||
|
Near-silence test for ESH mode.
|
||||||
|
"""
|
||||||
|
frame_F_in = (1e-15 * np.ones((128, 8), dtype=np.float64))
|
||||||
|
|
||||||
|
frame_F_tns, tns_coeffs = aac_tns(frame_F_in, "ESH")
|
||||||
|
assert np.isfinite(frame_F_tns).all()
|
||||||
|
assert np.isfinite(tns_coeffs).all()
|
||||||
|
|
||||||
|
frame_F_hat = aac_i_tns(frame_F_tns, "ESH", tns_coeffs)
|
||||||
|
assert np.isfinite(frame_F_hat).all()
|
||||||
|
|
||||||
|
np.testing.assert_allclose(frame_F_hat, frame_F_in, rtol=1e-6, atol=1e-12)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_tns_accepts_flat_vector_shape_long(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Contract test: for non-ESH, aac_tns must accept input shape (1024,)
|
||||||
|
in addition to (1024, 1), and preserve the shape convention.
|
||||||
|
"""
|
||||||
|
rng = np.random.default_rng(7)
|
||||||
|
frame_F_in = rng.normal(size=(1024,)).astype(np.float64)
|
||||||
|
|
||||||
|
frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||||
|
|
||||||
|
assert frame_F_out.shape == (1024,)
|
||||||
|
assert tns_coeffs.shape == (PRED_ORDER, 1)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("frame_type", ["OLS", "LSS", "LPS"])
|
||||||
|
def test_tns_does_not_explode_peak_or_energy_long(frame_type: FrameType) -> None:
|
||||||
|
"""
|
||||||
|
Sanity: TNS should not cause extreme peak/energy blow-up on typical inputs.
|
||||||
|
This is a loose guard to catch regressions.
|
||||||
|
"""
|
||||||
|
rng = np.random.default_rng(8)
|
||||||
|
frame_F_in = rng.normal(size=(1024, 1)).astype(np.float64)
|
||||||
|
|
||||||
|
in_peak = float(np.max(np.abs(frame_F_in)))
|
||||||
|
in_energy = float(np.sum(frame_F_in * frame_F_in))
|
||||||
|
|
||||||
|
frame_F_out, _ = aac_tns(frame_F_in, frame_type)
|
||||||
|
|
||||||
|
out_peak = float(np.max(np.abs(frame_F_out)))
|
||||||
|
out_energy = float(np.sum(frame_F_out * frame_F_out))
|
||||||
|
|
||||||
|
peak_ratio = out_peak / (in_peak + EPS)
|
||||||
|
energy_ratio = out_energy / (in_energy + EPS)
|
||||||
|
|
||||||
|
assert peak_ratio < 50.0
|
||||||
|
assert energy_ratio < 2500.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_tns_does_not_explode_peak_or_energy_esh() -> None:
|
||||||
|
"""
|
||||||
|
Sanity: same blow-up guard for ESH mode.
|
||||||
|
"""
|
||||||
|
rng = np.random.default_rng(9)
|
||||||
|
frame_F_in = rng.normal(size=(128, 8)).astype(np.float64)
|
||||||
|
|
||||||
|
in_peak = float(np.max(np.abs(frame_F_in)))
|
||||||
|
in_energy = float(np.sum(frame_F_in * frame_F_in))
|
||||||
|
|
||||||
|
frame_F_out, _ = aac_tns(frame_F_in, "ESH")
|
||||||
|
|
||||||
|
out_peak = float(np.max(np.abs(frame_F_out)))
|
||||||
|
out_energy = float(np.sum(frame_F_out * frame_F_out))
|
||||||
|
|
||||||
|
peak_ratio = out_peak / (in_peak + EPS)
|
||||||
|
energy_ratio = out_energy / (in_energy + EPS)
|
||||||
|
|
||||||
|
assert peak_ratio < 50.0
|
||||||
|
assert energy_ratio < 2500.0
|
||||||
|
|||||||
@ -9,8 +9,16 @@
|
|||||||
# cchoutou@ece.auth.gr
|
# cchoutou@ece.auth.gr
|
||||||
#
|
#
|
||||||
# Description:
|
# Description:
|
||||||
# - Level 1 AAC encoder orchestration.
|
# Level 1 AAC encoder orchestration.
|
||||||
# - Level 2 AAC encoder orchestration.
|
# Keeps the same functional behavior as the original level_1 implementation:
|
||||||
|
# - Reads WAV via soundfile
|
||||||
|
# - Validates stereo and 48 kHz
|
||||||
|
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
|
||||||
|
# - SSC decision uses next-frame attack detection
|
||||||
|
# - Filterbank analysis (MDCT)
|
||||||
|
# - Stores per-channel spectra in AACSeq1 schema:
|
||||||
|
# * ESH: (128, 8)
|
||||||
|
# * else: (1024, 1)
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@ -18,14 +26,106 @@ from pathlib import Path
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
from scipy.io import savemat
|
||||||
|
|
||||||
from core.aac_configuration import WIN_TYPE
|
from core.aac_configuration import WIN_TYPE
|
||||||
from core.aac_filterbank import aac_filter_bank
|
from core.aac_filterbank import aac_filter_bank
|
||||||
from core.aac_ssc import aac_SSC
|
from core.aac_ssc import aac_ssc
|
||||||
from core.aac_tns import aac_tns
|
from core.aac_tns import aac_tns
|
||||||
|
from core.aac_psycho import aac_psycho
|
||||||
|
from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py
|
||||||
|
from core.aac_huffman import aac_encode_huff
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
|
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helpers for thresholds (T(b))
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]:
|
||||||
|
"""
|
||||||
|
Return inclusive (lo, hi) band slices derived from TableB219.
|
||||||
|
"""
|
||||||
|
table, _ = get_table(frame_type)
|
||||||
|
wlow, whigh, _bval, _qthr_db = band_limits(table)
|
||||||
|
return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)]
|
||||||
|
|
||||||
|
|
||||||
|
def _thresholds_from_smr(
|
||||||
|
frame_F_ch: FrameChannelF,
|
||||||
|
frame_type: FrameType,
|
||||||
|
SMR: FloatArray,
|
||||||
|
) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Compute thresholds T(b) = P(b) / SMR(b), where P(b) is band energy.
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Long: returns (NB, 1)
|
||||||
|
- ESH: returns (NB, 8)
|
||||||
|
"""
|
||||||
|
bands = _band_slices_from_table(frame_type)
|
||||||
|
NB = len(bands)
|
||||||
|
|
||||||
|
X = np.asarray(frame_F_ch, dtype=np.float64)
|
||||||
|
SMR = np.asarray(SMR, dtype=np.float64)
|
||||||
|
|
||||||
|
if frame_type == "ESH":
|
||||||
|
if X.shape != (128, 8):
|
||||||
|
raise ValueError("For ESH, frame_F_ch must have shape (128, 8).")
|
||||||
|
if SMR.shape != (NB, 8):
|
||||||
|
raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")
|
||||||
|
|
||||||
|
T = np.zeros((NB, 8), dtype=np.float64)
|
||||||
|
for j in range(8):
|
||||||
|
Xj = X[:, j]
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
P = float(np.sum(Xj[lo : hi + 1] ** 2))
|
||||||
|
smr = float(SMR[b, j])
|
||||||
|
T[b, j] = 0.0 if smr <= 1e-12 else (P / smr)
|
||||||
|
return T
|
||||||
|
|
||||||
|
# Long
|
||||||
|
if X.shape == (1024,):
|
||||||
|
Xv = X
|
||||||
|
elif X.shape == (1024, 1):
|
||||||
|
Xv = X[:, 0]
|
||||||
|
else:
|
||||||
|
raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).")
|
||||||
|
|
||||||
|
if SMR.shape == (NB,):
|
||||||
|
SMRv = SMR
|
||||||
|
elif SMR.shape == (NB, 1):
|
||||||
|
SMRv = SMR[:, 0]
|
||||||
|
else:
|
||||||
|
raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).")
|
||||||
|
|
||||||
|
T = np.zeros((NB, 1), dtype=np.float64)
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
P = float(np.sum(Xv[lo : hi + 1] ** 2))
|
||||||
|
smr = float(SMRv[b])
|
||||||
|
T[b, 0] = 0.0 if smr <= 1e-12 else (P / smr)
|
||||||
|
|
||||||
|
return T
|
||||||
|
|
||||||
|
def _normalize_global_gain(G: GlobalGain) -> float | FloatArray:
|
||||||
|
"""
|
||||||
|
Normalize GlobalGain to match AACChannelFrameF3["G"] type:
|
||||||
|
- long: return float
|
||||||
|
- ESH: return float64 ndarray of shape (1, 8)
|
||||||
|
"""
|
||||||
|
if np.isscalar(G):
|
||||||
|
return float(G)
|
||||||
|
|
||||||
|
G_arr = np.asarray(G)
|
||||||
|
if G_arr.size == 1:
|
||||||
|
return float(G_arr.reshape(-1)[0])
|
||||||
|
|
||||||
|
return np.asarray(G_arr, dtype=np.float64)
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public helpers (useful for level_x demo wrappers)
|
# Public helpers (useful for level_x demo wrappers)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -174,7 +274,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
||||||
|
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
||||||
@ -191,10 +291,6 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Level 2 encoder
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC encoder (Level 1 + TNS).
|
Level-2 AAC encoder (Level 1 + TNS).
|
||||||
@ -246,7 +342,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
# Level 1 analysis (packed stereo container)
|
# Level 1 analysis (packed stereo container)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
@ -278,3 +374,180 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
|
def aac_coder_3(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
filename_aac_coded: Union[str, Path] | None = None,
|
||||||
|
) -> AACSeq3:
|
||||||
|
"""
|
||||||
|
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filename_in : Union[str, Path]
|
||||||
|
Input WAV filename (stereo, 48 kHz).
|
||||||
|
filename_aac_coded : Union[str, Path] | None
|
||||||
|
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
AACSeq3
|
||||||
|
Encoded AAC sequence (Level 3 payload schema).
|
||||||
|
"""
|
||||||
|
filename_in = Path(filename_in)
|
||||||
|
|
||||||
|
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||||
|
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
|
||||||
|
pad_pre = np.zeros((hop, 2), dtype=np.float64)
|
||||||
|
pad_post = np.zeros((hop, 2), dtype=np.float64)
|
||||||
|
x_pad = np.vstack([pad_pre, x, pad_post])
|
||||||
|
|
||||||
|
K = int((x_pad.shape[0] - win) // hop + 1)
|
||||||
|
if K <= 0:
|
||||||
|
raise ValueError("Input too short for framing.")
|
||||||
|
|
||||||
|
# Load Huffman LUTs once.
|
||||||
|
huff_LUT_list = load_LUT()
|
||||||
|
|
||||||
|
aac_seq: AACSeq3 = []
|
||||||
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
|
# Pin win_type to the WinType literal for type checkers.
|
||||||
|
win_type: WinType = WIN_TYPE
|
||||||
|
|
||||||
|
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
||||||
|
prev1_L = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev2_L = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev1_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev2_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
|
||||||
|
for i in range(K):
|
||||||
|
start = i * hop
|
||||||
|
|
||||||
|
frame_t: FrameT = x_pad[start : start + win, :]
|
||||||
|
if frame_t.shape != (win, 2):
|
||||||
|
raise ValueError("Internal framing error: frame_t has wrong shape.")
|
||||||
|
|
||||||
|
next_t = x_pad[start + hop : start + hop + win, :]
|
||||||
|
if next_t.shape[0] < win:
|
||||||
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
|
# Analysis filterbank (stereo packed)
|
||||||
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type)
|
||||||
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
|
|
||||||
|
# TNS per channel
|
||||||
|
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||||
|
chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)
|
||||||
|
|
||||||
|
# Psychoacoustic model per channel (time-domain)
|
||||||
|
frame_L = np.asarray(frame_t[:, 0], dtype=np.float64)
|
||||||
|
frame_R = np.asarray(frame_t[:, 1], dtype=np.float64)
|
||||||
|
|
||||||
|
SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L)
|
||||||
|
SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R)
|
||||||
|
|
||||||
|
# Thresholds T(b) (stored, not entropy-coded)
|
||||||
|
T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L)
|
||||||
|
T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R)
|
||||||
|
|
||||||
|
# Quantizer per channel
|
||||||
|
S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L)
|
||||||
|
S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R)
|
||||||
|
|
||||||
|
# Normalize G types for AACSeq3 schema (float | float64 ndarray).
|
||||||
|
G_Ln = _normalize_global_gain(G_L)
|
||||||
|
G_Rn = _normalize_global_gain(G_R)
|
||||||
|
|
||||||
|
# Huffman-code ONLY the DPCM differences for b>0.
|
||||||
|
# sfc[0] corresponds to alpha(0)=G and is stored separately in the frame.
|
||||||
|
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
||||||
|
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
||||||
|
|
||||||
|
# Codebook 11:
|
||||||
|
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
||||||
|
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
||||||
|
sf_cb = 11
|
||||||
|
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
||||||
|
|
||||||
|
sfc_L_dpcm = np.clip(
|
||||||
|
sfc_L_dpcm,
|
||||||
|
-sf_max_abs,
|
||||||
|
sf_max_abs,
|
||||||
|
).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
sfc_R_dpcm = np.clip(
|
||||||
|
sfc_R_dpcm,
|
||||||
|
-sf_max_abs,
|
||||||
|
sf_max_abs,
|
||||||
|
).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
sfc_L_stream, _ = aac_encode_huff(
|
||||||
|
sfc_L_dpcm.reshape(-1, order="F"),
|
||||||
|
huff_LUT_list,
|
||||||
|
force_codebook=sf_cb,
|
||||||
|
)
|
||||||
|
sfc_R_stream, _ = aac_encode_huff(
|
||||||
|
sfc_R_dpcm.reshape(-1, order="F"),
|
||||||
|
huff_LUT_list,
|
||||||
|
force_codebook=sf_cb,
|
||||||
|
)
|
||||||
|
|
||||||
|
mdct_L_stream, cb_L = aac_encode_huff(
|
||||||
|
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
||||||
|
huff_LUT_list,
|
||||||
|
)
|
||||||
|
mdct_R_stream, cb_R = aac_encode_huff(
|
||||||
|
np.asarray(S_R, dtype=np.int64).reshape(-1),
|
||||||
|
huff_LUT_list,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
|
frame_out: AACSeq3Frame = {
|
||||||
|
"frame_type": frame_type,
|
||||||
|
"win_type": win_type,
|
||||||
|
"chl": {
|
||||||
|
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
||||||
|
"T": np.asarray(T_L, dtype=np.float64),
|
||||||
|
"G": G_Ln,
|
||||||
|
"sfc": sfc_L_stream,
|
||||||
|
"stream": mdct_L_stream,
|
||||||
|
"codebook": int(cb_L),
|
||||||
|
},
|
||||||
|
"chr": {
|
||||||
|
"tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64),
|
||||||
|
"T": np.asarray(T_R, dtype=np.float64),
|
||||||
|
"G": G_Rn,
|
||||||
|
"sfc": sfc_R_stream,
|
||||||
|
"stream": mdct_R_stream,
|
||||||
|
"codebook": int(cb_R),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
aac_seq.append(frame_out)
|
||||||
|
|
||||||
|
# Update psycho history (shift register)
|
||||||
|
prev2_L = prev1_L
|
||||||
|
prev1_L = frame_L
|
||||||
|
prev2_R = prev1_R
|
||||||
|
prev1_R = frame_R
|
||||||
|
|
||||||
|
prev_frame_type = frame_type
|
||||||
|
|
||||||
|
# Optional: store to .mat for the assignment wrapper
|
||||||
|
if filename_aac_coded is not None:
|
||||||
|
filename_aac_coded = Path(filename_aac_coded)
|
||||||
|
savemat(
|
||||||
|
str(filename_aac_coded),
|
||||||
|
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
||||||
|
do_compression=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
return aac_seq
|
||||||
|
|
||||||
|
|||||||
@ -22,9 +22,22 @@ import soundfile as sf
|
|||||||
|
|
||||||
from core.aac_filterbank import aac_i_filter_bank
|
from core.aac_filterbank import aac_i_filter_bank
|
||||||
from core.aac_tns import aac_i_tns
|
from core.aac_tns import aac_i_tns
|
||||||
|
from core.aac_quantizer import aac_i_quantizer
|
||||||
|
from core.aac_huffman import aac_decode_huff
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helper for NB
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _nbands(frame_type: FrameType) -> int:
|
||||||
|
table, _ = get_table(frame_type)
|
||||||
|
wlow, _whigh, _bval, _qthr_db = band_limits(table)
|
||||||
|
return int(len(wlow))
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public helpers
|
# Public helpers
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -252,3 +265,144 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal:
|
||||||
|
"""
|
||||||
|
Level-3 AAC decoder (inverse of aac_coder_3).
|
||||||
|
|
||||||
|
Steps per frame:
|
||||||
|
- Huffman decode scalefactors (sfc) using codebook 11
|
||||||
|
- Huffman decode MDCT symbols (stream) using stored codebook
|
||||||
|
- iQuantizer -> MDCT coefficients after TNS
|
||||||
|
- iTNS using stored predictor coefficients
|
||||||
|
- IMDCT filterbank -> time domain
|
||||||
|
- Overlap-add, remove padding, write WAV
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
aac_seq_3 : AACSeq3
|
||||||
|
Encoded sequence as produced by aac_coder_3.
|
||||||
|
filename_out : Union[str, Path]
|
||||||
|
Output WAV filename.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
StereoSignal
|
||||||
|
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||||
|
"""
|
||||||
|
filename_out = Path(filename_out)
|
||||||
|
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
K = len(aac_seq_3)
|
||||||
|
|
||||||
|
if K <= 0:
|
||||||
|
raise ValueError("aac_seq_3 must contain at least one frame.")
|
||||||
|
|
||||||
|
# Load Huffman LUTs once.
|
||||||
|
huff_LUT_list = load_LUT()
|
||||||
|
|
||||||
|
n_pad = (K - 1) * hop + win
|
||||||
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
for i, fr in enumerate(aac_seq_3):
|
||||||
|
frame_type: FrameType = fr["frame_type"]
|
||||||
|
win_type: WinType = fr["win_type"]
|
||||||
|
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
# We store G separately, so Huffman stream contains only (NB-1) DPCM differences.
|
||||||
|
sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1)
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Left channel
|
||||||
|
# -------------------------
|
||||||
|
tns_L = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
|
||||||
|
G_L = fr["chl"]["G"]
|
||||||
|
sfc_bits_L = fr["chl"]["sfc"]
|
||||||
|
mdct_bits_L = fr["chl"]["stream"]
|
||||||
|
cb_L = int(fr["chl"]["codebook"])
|
||||||
|
|
||||||
|
sfc_dec_L = aac_decode_huff(sfc_bits_L, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False)
|
||||||
|
if frame_type == "ESH":
|
||||||
|
sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 8, order="F")
|
||||||
|
sfc_L = np.zeros((NB, 8), dtype=np.int64)
|
||||||
|
Gv = np.asarray(G_L, dtype=np.float64).reshape(1, 8)
|
||||||
|
sfc_L[0, :] = Gv[0, :].astype(np.int64)
|
||||||
|
sfc_L[1:, :] = sfc_dpcm_L
|
||||||
|
else:
|
||||||
|
sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 1, order="F")
|
||||||
|
sfc_L = np.zeros((NB, 1), dtype=np.int64)
|
||||||
|
sfc_L[0, 0] = int(float(G_L))
|
||||||
|
sfc_L[1:, :] = sfc_dpcm_L
|
||||||
|
|
||||||
|
# MDCT symbols: codebook 0 means "all-zero section"
|
||||||
|
if cb_L == 0:
|
||||||
|
S_dec_L = np.zeros((1024,), dtype=np.int64)
|
||||||
|
else:
|
||||||
|
S_tmp_L = aac_decode_huff(mdct_bits_L, cb_L, huff_LUT_list).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
# Tuple coding may produce extra trailing symbols; caller knows the true length (1024).
|
||||||
|
# Also guard against short outputs by zero-padding.
|
||||||
|
if S_tmp_L.size < 1024:
|
||||||
|
S_dec_L = np.zeros((1024,), dtype=np.int64)
|
||||||
|
S_dec_L[: S_tmp_L.size] = S_tmp_L
|
||||||
|
else:
|
||||||
|
S_dec_L = S_tmp_L[:1024]
|
||||||
|
|
||||||
|
S_L = S_dec_L.reshape(1024, 1)
|
||||||
|
|
||||||
|
Xq_L = aac_i_quantizer(S_L, sfc_L, G_L, frame_type)
|
||||||
|
X_L = aac_i_tns(Xq_L, frame_type, tns_L)
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Right channel
|
||||||
|
# -------------------------
|
||||||
|
tns_R = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)
|
||||||
|
G_R = fr["chr"]["G"]
|
||||||
|
sfc_bits_R = fr["chr"]["sfc"]
|
||||||
|
mdct_bits_R = fr["chr"]["stream"]
|
||||||
|
cb_R = int(fr["chr"]["codebook"])
|
||||||
|
|
||||||
|
sfc_dec_R = aac_decode_huff(sfc_bits_R, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False)
|
||||||
|
if frame_type == "ESH":
|
||||||
|
sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 8, order="F")
|
||||||
|
sfc_R = np.zeros((NB, 8), dtype=np.int64)
|
||||||
|
Gv = np.asarray(G_R, dtype=np.float64).reshape(1, 8)
|
||||||
|
sfc_R[0, :] = Gv[0, :].astype(np.int64)
|
||||||
|
sfc_R[1:, :] = sfc_dpcm_R
|
||||||
|
else:
|
||||||
|
sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 1, order="F")
|
||||||
|
sfc_R = np.zeros((NB, 1), dtype=np.int64)
|
||||||
|
sfc_R[0, 0] = int(float(G_R))
|
||||||
|
sfc_R[1:, :] = sfc_dpcm_R
|
||||||
|
|
||||||
|
if cb_R == 0:
|
||||||
|
S_dec_R = np.zeros((1024,), dtype=np.int64)
|
||||||
|
else:
|
||||||
|
S_tmp_R = aac_decode_huff(mdct_bits_R, cb_R, huff_LUT_list).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
if S_tmp_R.size < 1024:
|
||||||
|
S_dec_R = np.zeros((1024,), dtype=np.int64)
|
||||||
|
S_dec_R[: S_tmp_R.size] = S_tmp_R
|
||||||
|
else:
|
||||||
|
S_dec_R = S_tmp_R[:1024]
|
||||||
|
|
||||||
|
S_R = S_dec_R.reshape(1024, 1)
|
||||||
|
|
||||||
|
Xq_R = aac_i_quantizer(S_R, sfc_R, G_R, frame_type)
|
||||||
|
X_R = aac_i_tns(Xq_R, frame_type, tns_R)
|
||||||
|
|
||||||
|
# Re-pack to stereo container and inverse filterbank
|
||||||
|
frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R))
|
||||||
|
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
|
||||||
|
|
||||||
|
start = i * hop
|
||||||
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
|
||||||
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
|
||||||
|
sf.write(str(filename_out), y, 48000)
|
||||||
|
return y
|
||||||
|
|
||||||
|
|||||||
@ -176,7 +176,7 @@ def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
|
|||||||
# Public Function prototypes
|
# Public Function prototypes
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
|
def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
|
||||||
"""
|
"""
|
||||||
Sequence Segmentation Control (SSC).
|
Sequence Segmentation Control (SSC).
|
||||||
|
|
||||||
|
|||||||
@ -212,6 +212,42 @@ BandValueArray: TypeAlias = FloatArray
|
|||||||
Per-band psychoacoustic values (e.g. Bark position, thresholds).
|
Per-band psychoacoustic values (e.g. Bark position, thresholds).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Quantizer-related semantic aliases
|
||||||
|
|
||||||
|
QuantizedSymbols: TypeAlias = NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
Quantized MDCT symbols S(k).
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols).
|
||||||
|
"""
|
||||||
|
|
||||||
|
ScaleFactors: TypeAlias = NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1).
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Long frames: (NB, 1)
|
||||||
|
- ESH frames: (NB, 8)
|
||||||
|
"""
|
||||||
|
|
||||||
|
GlobalGain: TypeAlias = float | NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
Global gain G = alpha(0).
|
||||||
|
|
||||||
|
- Long frames: scalar float
|
||||||
|
- ESH frames: array shape (1, 8)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Huffman semantic aliases
|
||||||
|
|
||||||
|
HuffmanBitstream: TypeAlias = str
|
||||||
|
"""Huffman-coded bitstream stored as a string of '0'/'1'."""
|
||||||
|
|
||||||
|
HuffmanCodebook: TypeAlias = int
|
||||||
|
"""Huffman codebook id (e.g., 0..11)."""
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Level 1 AAC sequence payload types
|
# Level 1 AAC sequence payload types
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -299,3 +335,77 @@ Level 2 adds:
|
|||||||
and stores:
|
and stores:
|
||||||
- per-channel "frame_F" after applying TNS.
|
- per-channel "frame_F" after applying TNS.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 3 AAC sequence payload types (Quantizer + Huffman)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AACChannelFrameF3(TypedDict):
|
||||||
|
"""
|
||||||
|
Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3).
|
||||||
|
|
||||||
|
Keys
|
||||||
|
----
|
||||||
|
tns_coeffs:
|
||||||
|
Quantized TNS predictor coefficients for ONE channel.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (PRED_ORDER, 8)
|
||||||
|
- else: (PRED_ORDER, 1)
|
||||||
|
|
||||||
|
T:
|
||||||
|
Psychoacoustic thresholds per band.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (NB, 8)
|
||||||
|
- else: (NB, 1)
|
||||||
|
Note: Stored for completeness / debugging; not entropy-coded.
|
||||||
|
|
||||||
|
G:
|
||||||
|
Quantized global gains.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (1, 8) (one per short subframe)
|
||||||
|
- else: scalar (or compatible np scalar)
|
||||||
|
|
||||||
|
sfc:
|
||||||
|
Huffman-coded scalefactor differences (DPCM sequence).
|
||||||
|
|
||||||
|
stream:
|
||||||
|
Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols).
|
||||||
|
|
||||||
|
codebook:
|
||||||
|
Huffman codebook id used for MDCT symbols (stream).
|
||||||
|
(Scalefactors typically use fixed codebook 11 and do not need to store it.)
|
||||||
|
"""
|
||||||
|
tns_coeffs: TnsCoeffs
|
||||||
|
T: FloatArray
|
||||||
|
G: FloatArray | float
|
||||||
|
sfc: HuffmanBitstream
|
||||||
|
stream: HuffmanBitstream
|
||||||
|
codebook: HuffmanCodebook
|
||||||
|
|
||||||
|
|
||||||
|
class AACSeq3Frame(TypedDict):
|
||||||
|
"""
|
||||||
|
One frame dictionary element of aac_seq_3 (Level 3).
|
||||||
|
"""
|
||||||
|
frame_type: FrameType
|
||||||
|
win_type: WinType
|
||||||
|
chl: AACChannelFrameF3
|
||||||
|
chr: AACChannelFrameF3
|
||||||
|
|
||||||
|
|
||||||
|
AACSeq3: TypeAlias = List[AACSeq3Frame]
|
||||||
|
"""
|
||||||
|
AAC sequence for Level 3:
|
||||||
|
List of length K (K = number of frames).
|
||||||
|
|
||||||
|
Each element is a dict with keys:
|
||||||
|
- "frame_type", "win_type", "chl", "chr"
|
||||||
|
|
||||||
|
Level 3 adds (per channel):
|
||||||
|
- "tns_coeffs"
|
||||||
|
- "T" thresholds (not entropy-coded)
|
||||||
|
- "G" global gain(s)
|
||||||
|
- "sfc" Huffman-coded scalefactor differences
|
||||||
|
- "stream" Huffman-coded MDCT quantized symbols
|
||||||
|
- "codebook" Huffman codebook for MDCT symbols
|
||||||
|
"""
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@ -1,400 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
import scipy.io as sio
|
|
||||||
import os
|
|
||||||
|
|
||||||
# ------------------ LOAD LUT ------------------
|
|
||||||
|
|
||||||
def load_LUT(mat_filename=None):
|
|
||||||
"""
|
|
||||||
Loads the list of Huffman Codebooks (LUTs)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
huffLUT : list (index 1..11 used, index 0 unused)
|
|
||||||
"""
|
|
||||||
if mat_filename is None:
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
mat_filename = os.path.join(current_dir, "huffCodebooks.mat")
|
|
||||||
|
|
||||||
mat = sio.loadmat(mat_filename)
|
|
||||||
|
|
||||||
|
|
||||||
huffCodebooks_raw = mat['huffCodebooks'].squeeze()
|
|
||||||
|
|
||||||
huffCodebooks = []
|
|
||||||
for i in range(11):
|
|
||||||
huffCodebooks.append(np.array(huffCodebooks_raw[i]))
|
|
||||||
|
|
||||||
# Build inverse VLC tables
|
|
||||||
invTable = [None] * 11
|
|
||||||
|
|
||||||
for i in range(11):
|
|
||||||
h = huffCodebooks[i][:, 2].astype(int) # column 3
|
|
||||||
hlength = huffCodebooks[i][:, 1].astype(int) # column 2
|
|
||||||
|
|
||||||
hbin = []
|
|
||||||
for j in range(len(h)):
|
|
||||||
hbin.append(format(h[j], f'0{hlength[j]}b'))
|
|
||||||
|
|
||||||
invTable[i] = vlc_table(hbin)
|
|
||||||
|
|
||||||
# Build Huffman LUT dicts
|
|
||||||
huffLUT = [None] * 12 # index 0 unused
|
|
||||||
params = [
|
|
||||||
(4, 1, True),
|
|
||||||
(4, 1, True),
|
|
||||||
(4, 2, False),
|
|
||||||
(4, 2, False),
|
|
||||||
(2, 4, True),
|
|
||||||
(2, 4, True),
|
|
||||||
(2, 7, False),
|
|
||||||
(2, 7, False),
|
|
||||||
(2, 12, False),
|
|
||||||
(2, 12, False),
|
|
||||||
(2, 16, False),
|
|
||||||
]
|
|
||||||
|
|
||||||
for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1):
|
|
||||||
huffLUT[i] = {
|
|
||||||
'LUT': huffCodebooks[i-1],
|
|
||||||
'invTable': invTable[i-1],
|
|
||||||
'codebook': i,
|
|
||||||
'nTupleSize': nTupleSize,
|
|
||||||
'maxAbsCodeVal': maxAbs,
|
|
||||||
'signedValues': signed
|
|
||||||
}
|
|
||||||
|
|
||||||
return huffLUT
|
|
||||||
|
|
||||||
def vlc_table(code_array):
|
|
||||||
"""
|
|
||||||
codeArray: list of strings, each string is a Huffman codeword (e.g. '0101')
|
|
||||||
returns:
|
|
||||||
h : NumPy array of shape (num_nodes, 3)
|
|
||||||
columns:
|
|
||||||
[ next_if_0 , next_if_1 , symbol_index ]
|
|
||||||
"""
|
|
||||||
h = np.zeros((1, 3), dtype=int)
|
|
||||||
|
|
||||||
for code_index, code in enumerate(code_array, start=1):
|
|
||||||
word = [int(bit) for bit in code]
|
|
||||||
h_index = 0
|
|
||||||
|
|
||||||
for bit in word:
|
|
||||||
k = bit
|
|
||||||
next_node = h[h_index, k]
|
|
||||||
if next_node == 0:
|
|
||||||
h = np.vstack([h, [0, 0, 0]])
|
|
||||||
new_index = h.shape[0] - 1
|
|
||||||
h[h_index, k] = new_index
|
|
||||||
h_index = new_index
|
|
||||||
else:
|
|
||||||
h_index = next_node
|
|
||||||
|
|
||||||
h[h_index, 2] = code_index
|
|
||||||
|
|
||||||
return h
|
|
||||||
|
|
||||||
# ------------------ ENCODE ------------------
|
|
||||||
|
|
||||||
def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None):
|
|
||||||
"""
|
|
||||||
Huffman-encode a sequence of quantized coefficients.
|
|
||||||
|
|
||||||
This function selects the appropriate Huffman codebook based on the
|
|
||||||
maximum absolute value of the input coefficients, encodes the coefficients
|
|
||||||
into a binary Huffman bitstream, and returns both the bitstream and the
|
|
||||||
selected codebook index.
|
|
||||||
|
|
||||||
This is the Python equivalent of the MATLAB `encodeHuff.m` function used
|
|
||||||
in audio/image coding (e.g., scale factor band encoding). The input
|
|
||||||
coefficient sequence is grouped into fixed-size tuples as defined by
|
|
||||||
the chosen Huffman LUT. Zero-padding may be applied internally.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
coeff_sec : array_like of int
|
|
||||||
1-D array of quantized integer coefficients to encode.
|
|
||||||
Typically corresponds to a "section" or scale-factor band.
|
|
||||||
|
|
||||||
huff_LUT_list : list
|
|
||||||
List of Huffman lookup-table dictionaries as returned by `loadLUT()`.
|
|
||||||
Index 1..11 correspond to valid Huffman codebooks.
|
|
||||||
Index 0 is unused.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
huffSec : str
|
|
||||||
Huffman-encoded bitstream represented as a string of '0' and '1'
|
|
||||||
characters.
|
|
||||||
|
|
||||||
huffCodebook : int
|
|
||||||
Index (1..11) of the Huffman codebook used for encoding.
|
|
||||||
A value of 0 indicates a special all-zero section.
|
|
||||||
"""
|
|
||||||
if force_codebook is not None:
|
|
||||||
return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec)
|
|
||||||
|
|
||||||
maxAbsVal = np.max(np.abs(coeff_sec))
|
|
||||||
|
|
||||||
if maxAbsVal == 0:
|
|
||||||
huffCodebook = 0
|
|
||||||
huffSec = huff_LUT_code_0()
|
|
||||||
|
|
||||||
elif maxAbsVal == 1:
|
|
||||||
candidates = [1, 2]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal == 2:
|
|
||||||
candidates = [3, 4]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal in (3, 4):
|
|
||||||
candidates = [5, 6]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal in (5, 6, 7):
|
|
||||||
candidates = [7, 8]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal in (8, 9, 10, 11, 12):
|
|
||||||
candidates = [9, 10]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal in (13, 14, 15):
|
|
||||||
huffCodebook = 11
|
|
||||||
huffSec = huff_LUT_code_1(huff_LUT_list[huffCodebook], coeff_sec)
|
|
||||||
|
|
||||||
else:
|
|
||||||
huffCodebook = 11
|
|
||||||
huffSec = huff_LUT_code_ESC(huff_LUT_list[huffCodebook], coeff_sec)
|
|
||||||
|
|
||||||
return huffSec, huffCodebook
|
|
||||||
|
|
||||||
def huff_LUT_code_1(huff_LUT, coeff_sec):
|
|
||||||
LUT = huff_LUT['LUT']
|
|
||||||
nTupleSize = huff_LUT['nTupleSize']
|
|
||||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
|
||||||
signedValues = huff_LUT['signedValues']
|
|
||||||
|
|
||||||
numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
|
|
||||||
|
|
||||||
if signedValues:
|
|
||||||
coeff = coeff_sec + maxAbsCodeVal
|
|
||||||
base = 2 * maxAbsCodeVal + 1
|
|
||||||
else:
|
|
||||||
coeff = coeff_sec
|
|
||||||
base = maxAbsCodeVal + 1
|
|
||||||
|
|
||||||
coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
|
|
||||||
coeffPad[:len(coeff)] = coeff
|
|
||||||
|
|
||||||
huffSec = []
|
|
||||||
|
|
||||||
powers = base ** np.arange(nTupleSize - 1, -1, -1)
|
|
||||||
|
|
||||||
for i in range(numTuples):
|
|
||||||
nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
|
|
||||||
huffIndex = int(np.abs(nTuple) @ powers)
|
|
||||||
|
|
||||||
hexVal = LUT[huffIndex, 2]
|
|
||||||
huffLen = LUT[huffIndex, 1]
|
|
||||||
|
|
||||||
bits = format(int(hexVal), f'0{int(huffLen)}b')
|
|
||||||
|
|
||||||
if signedValues:
|
|
||||||
huffSec.append(bits)
|
|
||||||
else:
|
|
||||||
signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
|
|
||||||
huffSec.append(bits + signBits)
|
|
||||||
|
|
||||||
return ''.join(huffSec)
|
|
||||||
|
|
||||||
def huff_LUT_code_0():
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def huff_LUT_code_ESC(huff_LUT, coeff_sec):
|
|
||||||
LUT = huff_LUT['LUT']
|
|
||||||
nTupleSize = huff_LUT['nTupleSize']
|
|
||||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
|
||||||
|
|
||||||
numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
|
|
||||||
base = maxAbsCodeVal + 1
|
|
||||||
|
|
||||||
coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
|
|
||||||
coeffPad[:len(coeff_sec)] = coeff_sec
|
|
||||||
|
|
||||||
huffSec = []
|
|
||||||
powers = base ** np.arange(nTupleSize - 1, -1, -1)
|
|
||||||
|
|
||||||
for i in range(numTuples):
|
|
||||||
nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
|
|
||||||
|
|
||||||
lnTuple = nTuple.astype(float)
|
|
||||||
lnTuple[lnTuple == 0] = np.finfo(float).eps
|
|
||||||
|
|
||||||
N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int))
|
|
||||||
N = np.maximum(0, N4 - 4)
|
|
||||||
esc = np.abs(nTuple) > 15
|
|
||||||
|
|
||||||
nTupleESC = nTuple.copy()
|
|
||||||
nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16
|
|
||||||
|
|
||||||
huffIndex = int(np.abs(nTupleESC) @ powers)
|
|
||||||
|
|
||||||
hexVal = LUT[huffIndex, 2]
|
|
||||||
huffLen = LUT[huffIndex, 1]
|
|
||||||
|
|
||||||
bits = format(int(hexVal), f'0{int(huffLen)}b')
|
|
||||||
|
|
||||||
escSeq = ''
|
|
||||||
for k in range(nTupleSize):
|
|
||||||
if esc[k]:
|
|
||||||
escSeq += '1' * N[k]
|
|
||||||
escSeq += '0'
|
|
||||||
escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b')
|
|
||||||
|
|
||||||
signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
|
|
||||||
huffSec.append(bits + signBits + escSeq)
|
|
||||||
|
|
||||||
return ''.join(huffSec)
|
|
||||||
|
|
||||||
# ------------------ DECODE ------------------
|
|
||||||
|
|
||||||
def decode_huff(huff_sec, huff_LUT):
|
|
||||||
"""
|
|
||||||
Decode a Huffman-encoded stream.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
huff_sec : array-like of int or str
|
|
||||||
Huffman encoded stream as a sequence of 0 and 1 (string or list/array).
|
|
||||||
huff_LUT : dict
|
|
||||||
Huffman lookup table with keys:
|
|
||||||
- 'invTable': inverse table (numpy array)
|
|
||||||
- 'codebook': codebook number
|
|
||||||
- 'nTupleSize': tuple size
|
|
||||||
- 'maxAbsCodeVal': maximum absolute code value
|
|
||||||
- 'signedValues': True/False
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
decCoeffs : list of int
|
|
||||||
Decoded quantized coefficients.
|
|
||||||
"""
|
|
||||||
|
|
||||||
h = huff_LUT['invTable']
|
|
||||||
huffCodebook = huff_LUT['codebook']
|
|
||||||
nTupleSize = huff_LUT['nTupleSize']
|
|
||||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
|
||||||
signedValues = huff_LUT['signedValues']
|
|
||||||
|
|
||||||
# Convert string to array of ints
|
|
||||||
if isinstance(huff_sec, str):
|
|
||||||
huff_sec = np.array([int(b) for b in huff_sec])
|
|
||||||
|
|
||||||
eos = False
|
|
||||||
decCoeffs = []
|
|
||||||
streamIndex = 0
|
|
||||||
|
|
||||||
while not eos:
|
|
||||||
wordbit = 0
|
|
||||||
r = 0 # start at root
|
|
||||||
|
|
||||||
# Decode Huffman word using inverse table
|
|
||||||
while True:
|
|
||||||
b = huff_sec[streamIndex + wordbit]
|
|
||||||
wordbit += 1
|
|
||||||
rOld = r
|
|
||||||
r = h[rOld, b]
|
|
||||||
if h[r, 0] == 0 and h[r, 1] == 0:
|
|
||||||
symbolIndex = h[r, 2] - 1 # zero-based
|
|
||||||
streamIndex += wordbit
|
|
||||||
break
|
|
||||||
|
|
||||||
# Decode n-tuple magnitudes
|
|
||||||
if signedValues:
|
|
||||||
base = 2 * maxAbsCodeVal + 1
|
|
||||||
nTupleDec = []
|
|
||||||
tmp = symbolIndex
|
|
||||||
for p in reversed(range(nTupleSize)):
|
|
||||||
val = tmp // (base ** p)
|
|
||||||
nTupleDec.append(val - maxAbsCodeVal)
|
|
||||||
tmp = tmp % (base ** p)
|
|
||||||
nTupleDec = np.array(nTupleDec)
|
|
||||||
else:
|
|
||||||
base = maxAbsCodeVal + 1
|
|
||||||
nTupleDec = []
|
|
||||||
tmp = symbolIndex
|
|
||||||
for p in reversed(range(nTupleSize)):
|
|
||||||
val = tmp // (base ** p)
|
|
||||||
nTupleDec.append(val)
|
|
||||||
tmp = tmp % (base ** p)
|
|
||||||
nTupleDec = np.array(nTupleDec)
|
|
||||||
|
|
||||||
# Apply sign bits
|
|
||||||
nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize]
|
|
||||||
nTupleSign = -(np.sign(nTupleSignBits - 0.5))
|
|
||||||
streamIndex += nTupleSize
|
|
||||||
nTupleDec = nTupleDec * nTupleSign
|
|
||||||
|
|
||||||
# Handle escape sequences
|
|
||||||
escIndex = np.where(np.abs(nTupleDec) == 16)[0]
|
|
||||||
if huffCodebook == 11 and escIndex.size > 0:
|
|
||||||
for idx in escIndex:
|
|
||||||
N = 0
|
|
||||||
b = huff_sec[streamIndex]
|
|
||||||
while b:
|
|
||||||
N += 1
|
|
||||||
b = huff_sec[streamIndex + N]
|
|
||||||
streamIndex += N
|
|
||||||
N4 = N + 4
|
|
||||||
escape_word = huff_sec[streamIndex:streamIndex + N4]
|
|
||||||
escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2)
|
|
||||||
nTupleDec[idx] = escape_value
|
|
||||||
streamIndex += N4 + 1
|
|
||||||
# Apply signs again
|
|
||||||
nTupleDec[escIndex] *= nTupleSign[escIndex]
|
|
||||||
|
|
||||||
decCoeffs.extend(nTupleDec.tolist())
|
|
||||||
|
|
||||||
if streamIndex >= len(huff_sec):
|
|
||||||
eos = True
|
|
||||||
|
|
||||||
return decCoeffs
|
|
||||||
|
|
||||||
|
|
||||||
@ -9,8 +9,16 @@
|
|||||||
# cchoutou@ece.auth.gr
|
# cchoutou@ece.auth.gr
|
||||||
#
|
#
|
||||||
# Description:
|
# Description:
|
||||||
# - Level 1 AAC encoder orchestration.
|
# Level 1 AAC encoder orchestration.
|
||||||
# - Level 2 AAC encoder orchestration.
|
# Keeps the same functional behavior as the original level_1 implementation:
|
||||||
|
# - Reads WAV via soundfile
|
||||||
|
# - Validates stereo and 48 kHz
|
||||||
|
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
|
||||||
|
# - SSC decision uses next-frame attack detection
|
||||||
|
# - Filterbank analysis (MDCT)
|
||||||
|
# - Stores per-channel spectra in AACSeq1 schema:
|
||||||
|
# * ESH: (128, 8)
|
||||||
|
# * else: (1024, 1)
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@ -18,14 +26,106 @@ from pathlib import Path
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
from scipy.io import savemat
|
||||||
|
|
||||||
from core.aac_configuration import WIN_TYPE
|
from core.aac_configuration import WIN_TYPE
|
||||||
from core.aac_filterbank import aac_filter_bank
|
from core.aac_filterbank import aac_filter_bank
|
||||||
from core.aac_ssc import aac_SSC
|
from core.aac_ssc import aac_ssc
|
||||||
from core.aac_tns import aac_tns
|
from core.aac_tns import aac_tns
|
||||||
|
from core.aac_psycho import aac_psycho
|
||||||
|
from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py
|
||||||
|
from core.aac_huffman import aac_encode_huff
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
|
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helpers for thresholds (T(b))
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]:
|
||||||
|
"""
|
||||||
|
Return inclusive (lo, hi) band slices derived from TableB219.
|
||||||
|
"""
|
||||||
|
table, _ = get_table(frame_type)
|
||||||
|
wlow, whigh, _bval, _qthr_db = band_limits(table)
|
||||||
|
return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)]
|
||||||
|
|
||||||
|
|
||||||
|
def _thresholds_from_smr(
|
||||||
|
frame_F_ch: FrameChannelF,
|
||||||
|
frame_type: FrameType,
|
||||||
|
SMR: FloatArray,
|
||||||
|
) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Compute thresholds T(b) = P(b) / SMR(b), where P(b) is band energy.
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Long: returns (NB, 1)
|
||||||
|
- ESH: returns (NB, 8)
|
||||||
|
"""
|
||||||
|
bands = _band_slices_from_table(frame_type)
|
||||||
|
NB = len(bands)
|
||||||
|
|
||||||
|
X = np.asarray(frame_F_ch, dtype=np.float64)
|
||||||
|
SMR = np.asarray(SMR, dtype=np.float64)
|
||||||
|
|
||||||
|
if frame_type == "ESH":
|
||||||
|
if X.shape != (128, 8):
|
||||||
|
raise ValueError("For ESH, frame_F_ch must have shape (128, 8).")
|
||||||
|
if SMR.shape != (NB, 8):
|
||||||
|
raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")
|
||||||
|
|
||||||
|
T = np.zeros((NB, 8), dtype=np.float64)
|
||||||
|
for j in range(8):
|
||||||
|
Xj = X[:, j]
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
P = float(np.sum(Xj[lo : hi + 1] ** 2))
|
||||||
|
smr = float(SMR[b, j])
|
||||||
|
T[b, j] = 0.0 if smr <= 1e-12 else (P / smr)
|
||||||
|
return T
|
||||||
|
|
||||||
|
# Long
|
||||||
|
if X.shape == (1024,):
|
||||||
|
Xv = X
|
||||||
|
elif X.shape == (1024, 1):
|
||||||
|
Xv = X[:, 0]
|
||||||
|
else:
|
||||||
|
raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).")
|
||||||
|
|
||||||
|
if SMR.shape == (NB,):
|
||||||
|
SMRv = SMR
|
||||||
|
elif SMR.shape == (NB, 1):
|
||||||
|
SMRv = SMR[:, 0]
|
||||||
|
else:
|
||||||
|
raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).")
|
||||||
|
|
||||||
|
T = np.zeros((NB, 1), dtype=np.float64)
|
||||||
|
for b, (lo, hi) in enumerate(bands):
|
||||||
|
P = float(np.sum(Xv[lo : hi + 1] ** 2))
|
||||||
|
smr = float(SMRv[b])
|
||||||
|
T[b, 0] = 0.0 if smr <= 1e-12 else (P / smr)
|
||||||
|
|
||||||
|
return T
|
||||||
|
|
||||||
|
def _normalize_global_gain(G: GlobalGain) -> float | FloatArray:
|
||||||
|
"""
|
||||||
|
Normalize GlobalGain to match AACChannelFrameF3["G"] type:
|
||||||
|
- long: return float
|
||||||
|
- ESH: return float64 ndarray of shape (1, 8)
|
||||||
|
"""
|
||||||
|
if np.isscalar(G):
|
||||||
|
return float(G)
|
||||||
|
|
||||||
|
G_arr = np.asarray(G)
|
||||||
|
if G_arr.size == 1:
|
||||||
|
return float(G_arr.reshape(-1)[0])
|
||||||
|
|
||||||
|
return np.asarray(G_arr, dtype=np.float64)
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public helpers (useful for level_x demo wrappers)
|
# Public helpers (useful for level_x demo wrappers)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -174,7 +274,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
||||||
|
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
||||||
@ -191,10 +291,6 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Level 2 encoder
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC encoder (Level 1 + TNS).
|
Level-2 AAC encoder (Level 1 + TNS).
|
||||||
@ -246,7 +342,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_SSC(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
# Level 1 analysis (packed stereo container)
|
# Level 1 analysis (packed stereo container)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
@ -278,3 +374,180 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
|
def aac_coder_3(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
filename_aac_coded: Union[str, Path] | None = None,
|
||||||
|
) -> AACSeq3:
|
||||||
|
"""
|
||||||
|
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filename_in : Union[str, Path]
|
||||||
|
Input WAV filename (stereo, 48 kHz).
|
||||||
|
filename_aac_coded : Union[str, Path] | None
|
||||||
|
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
AACSeq3
|
||||||
|
Encoded AAC sequence (Level 3 payload schema).
|
||||||
|
"""
|
||||||
|
filename_in = Path(filename_in)
|
||||||
|
|
||||||
|
x, _ = aac_read_wav_stereo_48k(filename_in)
|
||||||
|
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
|
||||||
|
pad_pre = np.zeros((hop, 2), dtype=np.float64)
|
||||||
|
pad_post = np.zeros((hop, 2), dtype=np.float64)
|
||||||
|
x_pad = np.vstack([pad_pre, x, pad_post])
|
||||||
|
|
||||||
|
K = int((x_pad.shape[0] - win) // hop + 1)
|
||||||
|
if K <= 0:
|
||||||
|
raise ValueError("Input too short for framing.")
|
||||||
|
|
||||||
|
# Load Huffman LUTs once.
|
||||||
|
huff_LUT_list = load_LUT()
|
||||||
|
|
||||||
|
aac_seq: AACSeq3 = []
|
||||||
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
|
# Pin win_type to the WinType literal for type checkers.
|
||||||
|
win_type: WinType = WIN_TYPE
|
||||||
|
|
||||||
|
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
||||||
|
prev1_L = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev2_L = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev1_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
prev2_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
|
||||||
|
for i in range(K):
|
||||||
|
start = i * hop
|
||||||
|
|
||||||
|
frame_t: FrameT = x_pad[start : start + win, :]
|
||||||
|
if frame_t.shape != (win, 2):
|
||||||
|
raise ValueError("Internal framing error: frame_t has wrong shape.")
|
||||||
|
|
||||||
|
next_t = x_pad[start + hop : start + hop + win, :]
|
||||||
|
if next_t.shape[0] < win:
|
||||||
|
tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
|
||||||
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
|
# Analysis filterbank (stereo packed)
|
||||||
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type)
|
||||||
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
|
|
||||||
|
# TNS per channel
|
||||||
|
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||||
|
chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)
|
||||||
|
|
||||||
|
# Psychoacoustic model per channel (time-domain)
|
||||||
|
frame_L = np.asarray(frame_t[:, 0], dtype=np.float64)
|
||||||
|
frame_R = np.asarray(frame_t[:, 1], dtype=np.float64)
|
||||||
|
|
||||||
|
SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L)
|
||||||
|
SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R)
|
||||||
|
|
||||||
|
# Thresholds T(b) (stored, not entropy-coded)
|
||||||
|
T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L)
|
||||||
|
T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R)
|
||||||
|
|
||||||
|
# Quantizer per channel
|
||||||
|
S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L)
|
||||||
|
S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R)
|
||||||
|
|
||||||
|
# Normalize G types for AACSeq3 schema (float | float64 ndarray).
|
||||||
|
G_Ln = _normalize_global_gain(G_L)
|
||||||
|
G_Rn = _normalize_global_gain(G_R)
|
||||||
|
|
||||||
|
# Huffman-code ONLY the DPCM differences for b>0.
|
||||||
|
# sfc[0] corresponds to alpha(0)=G and is stored separately in the frame.
|
||||||
|
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
||||||
|
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
||||||
|
|
||||||
|
# Codebook 11:
|
||||||
|
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
||||||
|
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
||||||
|
sf_cb = 11
|
||||||
|
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
||||||
|
|
||||||
|
sfc_L_dpcm = np.clip(
|
||||||
|
sfc_L_dpcm,
|
||||||
|
-sf_max_abs,
|
||||||
|
sf_max_abs,
|
||||||
|
).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
sfc_R_dpcm = np.clip(
|
||||||
|
sfc_R_dpcm,
|
||||||
|
-sf_max_abs,
|
||||||
|
sf_max_abs,
|
||||||
|
).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
sfc_L_stream, _ = aac_encode_huff(
|
||||||
|
sfc_L_dpcm.reshape(-1, order="F"),
|
||||||
|
huff_LUT_list,
|
||||||
|
force_codebook=sf_cb,
|
||||||
|
)
|
||||||
|
sfc_R_stream, _ = aac_encode_huff(
|
||||||
|
sfc_R_dpcm.reshape(-1, order="F"),
|
||||||
|
huff_LUT_list,
|
||||||
|
force_codebook=sf_cb,
|
||||||
|
)
|
||||||
|
|
||||||
|
mdct_L_stream, cb_L = aac_encode_huff(
|
||||||
|
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
||||||
|
huff_LUT_list,
|
||||||
|
)
|
||||||
|
mdct_R_stream, cb_R = aac_encode_huff(
|
||||||
|
np.asarray(S_R, dtype=np.int64).reshape(-1),
|
||||||
|
huff_LUT_list,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
|
frame_out: AACSeq3Frame = {
|
||||||
|
"frame_type": frame_type,
|
||||||
|
"win_type": win_type,
|
||||||
|
"chl": {
|
||||||
|
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
||||||
|
"T": np.asarray(T_L, dtype=np.float64),
|
||||||
|
"G": G_Ln,
|
||||||
|
"sfc": sfc_L_stream,
|
||||||
|
"stream": mdct_L_stream,
|
||||||
|
"codebook": int(cb_L),
|
||||||
|
},
|
||||||
|
"chr": {
|
||||||
|
"tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64),
|
||||||
|
"T": np.asarray(T_R, dtype=np.float64),
|
||||||
|
"G": G_Rn,
|
||||||
|
"sfc": sfc_R_stream,
|
||||||
|
"stream": mdct_R_stream,
|
||||||
|
"codebook": int(cb_R),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
aac_seq.append(frame_out)
|
||||||
|
|
||||||
|
# Update psycho history (shift register)
|
||||||
|
prev2_L = prev1_L
|
||||||
|
prev1_L = frame_L
|
||||||
|
prev2_R = prev1_R
|
||||||
|
prev1_R = frame_R
|
||||||
|
|
||||||
|
prev_frame_type = frame_type
|
||||||
|
|
||||||
|
# Optional: store to .mat for the assignment wrapper
|
||||||
|
if filename_aac_coded is not None:
|
||||||
|
filename_aac_coded = Path(filename_aac_coded)
|
||||||
|
savemat(
|
||||||
|
str(filename_aac_coded),
|
||||||
|
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
||||||
|
do_compression=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
return aac_seq
|
||||||
|
|
||||||
|
|||||||
@ -22,9 +22,22 @@ import soundfile as sf
|
|||||||
|
|
||||||
from core.aac_filterbank import aac_i_filter_bank
|
from core.aac_filterbank import aac_i_filter_bank
|
||||||
from core.aac_tns import aac_i_tns
|
from core.aac_tns import aac_i_tns
|
||||||
|
from core.aac_quantizer import aac_i_quantizer
|
||||||
|
from core.aac_huffman import aac_decode_huff
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helper for NB
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _nbands(frame_type: FrameType) -> int:
|
||||||
|
table, _ = get_table(frame_type)
|
||||||
|
wlow, _whigh, _bval, _qthr_db = band_limits(table)
|
||||||
|
return int(len(wlow))
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public helpers
|
# Public helpers
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -252,3 +265,144 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal:
|
||||||
|
"""
|
||||||
|
Level-3 AAC decoder (inverse of aac_coder_3).
|
||||||
|
|
||||||
|
Steps per frame:
|
||||||
|
- Huffman decode scalefactors (sfc) using codebook 11
|
||||||
|
- Huffman decode MDCT symbols (stream) using stored codebook
|
||||||
|
- iQuantizer -> MDCT coefficients after TNS
|
||||||
|
- iTNS using stored predictor coefficients
|
||||||
|
- IMDCT filterbank -> time domain
|
||||||
|
- Overlap-add, remove padding, write WAV
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
aac_seq_3 : AACSeq3
|
||||||
|
Encoded sequence as produced by aac_coder_3.
|
||||||
|
filename_out : Union[str, Path]
|
||||||
|
Output WAV filename.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
StereoSignal
|
||||||
|
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||||
|
"""
|
||||||
|
filename_out = Path(filename_out)
|
||||||
|
|
||||||
|
hop = 1024
|
||||||
|
win = 2048
|
||||||
|
K = len(aac_seq_3)
|
||||||
|
|
||||||
|
if K <= 0:
|
||||||
|
raise ValueError("aac_seq_3 must contain at least one frame.")
|
||||||
|
|
||||||
|
# Load Huffman LUTs once.
|
||||||
|
huff_LUT_list = load_LUT()
|
||||||
|
|
||||||
|
n_pad = (K - 1) * hop + win
|
||||||
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
for i, fr in enumerate(aac_seq_3):
|
||||||
|
frame_type: FrameType = fr["frame_type"]
|
||||||
|
win_type: WinType = fr["win_type"]
|
||||||
|
|
||||||
|
NB = _nbands(frame_type)
|
||||||
|
# We store G separately, so Huffman stream contains only (NB-1) DPCM differences.
|
||||||
|
sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1)
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Left channel
|
||||||
|
# -------------------------
|
||||||
|
tns_L = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
|
||||||
|
G_L = fr["chl"]["G"]
|
||||||
|
sfc_bits_L = fr["chl"]["sfc"]
|
||||||
|
mdct_bits_L = fr["chl"]["stream"]
|
||||||
|
cb_L = int(fr["chl"]["codebook"])
|
||||||
|
|
||||||
|
sfc_dec_L = aac_decode_huff(sfc_bits_L, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False)
|
||||||
|
if frame_type == "ESH":
|
||||||
|
sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 8, order="F")
|
||||||
|
sfc_L = np.zeros((NB, 8), dtype=np.int64)
|
||||||
|
Gv = np.asarray(G_L, dtype=np.float64).reshape(1, 8)
|
||||||
|
sfc_L[0, :] = Gv[0, :].astype(np.int64)
|
||||||
|
sfc_L[1:, :] = sfc_dpcm_L
|
||||||
|
else:
|
||||||
|
sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 1, order="F")
|
||||||
|
sfc_L = np.zeros((NB, 1), dtype=np.int64)
|
||||||
|
sfc_L[0, 0] = int(float(G_L))
|
||||||
|
sfc_L[1:, :] = sfc_dpcm_L
|
||||||
|
|
||||||
|
# MDCT symbols: codebook 0 means "all-zero section"
|
||||||
|
if cb_L == 0:
|
||||||
|
S_dec_L = np.zeros((1024,), dtype=np.int64)
|
||||||
|
else:
|
||||||
|
S_tmp_L = aac_decode_huff(mdct_bits_L, cb_L, huff_LUT_list).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
# Tuple coding may produce extra trailing symbols; caller knows the true length (1024).
|
||||||
|
# Also guard against short outputs by zero-padding.
|
||||||
|
if S_tmp_L.size < 1024:
|
||||||
|
S_dec_L = np.zeros((1024,), dtype=np.int64)
|
||||||
|
S_dec_L[: S_tmp_L.size] = S_tmp_L
|
||||||
|
else:
|
||||||
|
S_dec_L = S_tmp_L[:1024]
|
||||||
|
|
||||||
|
S_L = S_dec_L.reshape(1024, 1)
|
||||||
|
|
||||||
|
Xq_L = aac_i_quantizer(S_L, sfc_L, G_L, frame_type)
|
||||||
|
X_L = aac_i_tns(Xq_L, frame_type, tns_L)
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Right channel
|
||||||
|
# -------------------------
|
||||||
|
tns_R = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)
|
||||||
|
G_R = fr["chr"]["G"]
|
||||||
|
sfc_bits_R = fr["chr"]["sfc"]
|
||||||
|
mdct_bits_R = fr["chr"]["stream"]
|
||||||
|
cb_R = int(fr["chr"]["codebook"])
|
||||||
|
|
||||||
|
sfc_dec_R = aac_decode_huff(sfc_bits_R, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False)
|
||||||
|
if frame_type == "ESH":
|
||||||
|
sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 8, order="F")
|
||||||
|
sfc_R = np.zeros((NB, 8), dtype=np.int64)
|
||||||
|
Gv = np.asarray(G_R, dtype=np.float64).reshape(1, 8)
|
||||||
|
sfc_R[0, :] = Gv[0, :].astype(np.int64)
|
||||||
|
sfc_R[1:, :] = sfc_dpcm_R
|
||||||
|
else:
|
||||||
|
sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 1, order="F")
|
||||||
|
sfc_R = np.zeros((NB, 1), dtype=np.int64)
|
||||||
|
sfc_R[0, 0] = int(float(G_R))
|
||||||
|
sfc_R[1:, :] = sfc_dpcm_R
|
||||||
|
|
||||||
|
if cb_R == 0:
|
||||||
|
S_dec_R = np.zeros((1024,), dtype=np.int64)
|
||||||
|
else:
|
||||||
|
S_tmp_R = aac_decode_huff(mdct_bits_R, cb_R, huff_LUT_list).astype(np.int64, copy=False)
|
||||||
|
|
||||||
|
if S_tmp_R.size < 1024:
|
||||||
|
S_dec_R = np.zeros((1024,), dtype=np.int64)
|
||||||
|
S_dec_R[: S_tmp_R.size] = S_tmp_R
|
||||||
|
else:
|
||||||
|
S_dec_R = S_tmp_R[:1024]
|
||||||
|
|
||||||
|
S_R = S_dec_R.reshape(1024, 1)
|
||||||
|
|
||||||
|
Xq_R = aac_i_quantizer(S_R, sfc_R, G_R, frame_type)
|
||||||
|
X_R = aac_i_tns(Xq_R, frame_type, tns_R)
|
||||||
|
|
||||||
|
# Re-pack to stereo container and inverse filterbank
|
||||||
|
frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R))
|
||||||
|
frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
|
||||||
|
|
||||||
|
start = i * hop
|
||||||
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
|
||||||
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
|
||||||
|
sf.write(str(filename_out), y, 48000)
|
||||||
|
return y
|
||||||
|
|
||||||
|
|||||||
@ -176,7 +176,7 @@ def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
|
|||||||
# Public Function prototypes
|
# Public Function prototypes
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
|
def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
|
||||||
"""
|
"""
|
||||||
Sequence Segmentation Control (SSC).
|
Sequence Segmentation Control (SSC).
|
||||||
|
|
||||||
|
|||||||
@ -30,9 +30,6 @@ from __future__ import annotations
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from scipy.io import loadmat
|
|
||||||
|
|
||||||
from core.aac_utils import load_b219_tables
|
from core.aac_utils import load_b219_tables
|
||||||
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
|
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
|
||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
@ -377,7 +374,9 @@ def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]:
|
|||||||
sw = _compute_sw(x)
|
sw = _compute_sw(x)
|
||||||
|
|
||||||
eps = 1e-12
|
eps = 1e-12
|
||||||
xw = np.where(sw > eps, x / sw, 0.0)
|
xw = np.zeros_like(x, dtype=np.float64)
|
||||||
|
mask = sw > eps
|
||||||
|
np.divide(x, sw, out=xw, where=mask)
|
||||||
|
|
||||||
a = _lpc_coeffs(xw, PRED_ORDER)
|
a = _lpc_coeffs(xw, PRED_ORDER)
|
||||||
a_q = _quantize_coeffs(a)
|
a_q = _quantize_coeffs(a)
|
||||||
|
|||||||
@ -212,6 +212,42 @@ BandValueArray: TypeAlias = FloatArray
|
|||||||
Per-band psychoacoustic values (e.g. Bark position, thresholds).
|
Per-band psychoacoustic values (e.g. Bark position, thresholds).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Quantizer-related semantic aliases
|
||||||
|
|
||||||
|
QuantizedSymbols: TypeAlias = NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
Quantized MDCT symbols S(k).
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols).
|
||||||
|
"""
|
||||||
|
|
||||||
|
ScaleFactors: TypeAlias = NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1).
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Long frames: (NB, 1)
|
||||||
|
- ESH frames: (NB, 8)
|
||||||
|
"""
|
||||||
|
|
||||||
|
GlobalGain: TypeAlias = float | NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
Global gain G = alpha(0).
|
||||||
|
|
||||||
|
- Long frames: scalar float
|
||||||
|
- ESH frames: array shape (1, 8)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Huffman semantic aliases
|
||||||
|
|
||||||
|
HuffmanBitstream: TypeAlias = str
|
||||||
|
"""Huffman-coded bitstream stored as a string of '0'/'1'."""
|
||||||
|
|
||||||
|
HuffmanCodebook: TypeAlias = int
|
||||||
|
"""Huffman codebook id (e.g., 0..11)."""
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Level 1 AAC sequence payload types
|
# Level 1 AAC sequence payload types
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -299,3 +335,77 @@ Level 2 adds:
|
|||||||
and stores:
|
and stores:
|
||||||
- per-channel "frame_F" after applying TNS.
|
- per-channel "frame_F" after applying TNS.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 3 AAC sequence payload types (Quantizer + Huffman)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AACChannelFrameF3(TypedDict):
|
||||||
|
"""
|
||||||
|
Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3).
|
||||||
|
|
||||||
|
Keys
|
||||||
|
----
|
||||||
|
tns_coeffs:
|
||||||
|
Quantized TNS predictor coefficients for ONE channel.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (PRED_ORDER, 8)
|
||||||
|
- else: (PRED_ORDER, 1)
|
||||||
|
|
||||||
|
T:
|
||||||
|
Psychoacoustic thresholds per band.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (NB, 8)
|
||||||
|
- else: (NB, 1)
|
||||||
|
Note: Stored for completeness / debugging; not entropy-coded.
|
||||||
|
|
||||||
|
G:
|
||||||
|
Quantized global gains.
|
||||||
|
Shapes:
|
||||||
|
- ESH: (1, 8) (one per short subframe)
|
||||||
|
- else: scalar (or compatible np scalar)
|
||||||
|
|
||||||
|
sfc:
|
||||||
|
Huffman-coded scalefactor differences (DPCM sequence).
|
||||||
|
|
||||||
|
stream:
|
||||||
|
Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols).
|
||||||
|
|
||||||
|
codebook:
|
||||||
|
Huffman codebook id used for MDCT symbols (stream).
|
||||||
|
(Scalefactors typically use fixed codebook 11 and do not need to store it.)
|
||||||
|
"""
|
||||||
|
tns_coeffs: TnsCoeffs
|
||||||
|
T: FloatArray
|
||||||
|
G: FloatArray | float
|
||||||
|
sfc: HuffmanBitstream
|
||||||
|
stream: HuffmanBitstream
|
||||||
|
codebook: HuffmanCodebook
|
||||||
|
|
||||||
|
|
||||||
|
class AACSeq3Frame(TypedDict):
|
||||||
|
"""
|
||||||
|
One frame dictionary element of aac_seq_3 (Level 3).
|
||||||
|
"""
|
||||||
|
frame_type: FrameType
|
||||||
|
win_type: WinType
|
||||||
|
chl: AACChannelFrameF3
|
||||||
|
chr: AACChannelFrameF3
|
||||||
|
|
||||||
|
|
||||||
|
AACSeq3: TypeAlias = List[AACSeq3Frame]
|
||||||
|
"""
|
||||||
|
AAC sequence for Level 3:
|
||||||
|
List of length K (K = number of frames).
|
||||||
|
|
||||||
|
Each element is a dict with keys:
|
||||||
|
- "frame_type", "win_type", "chl", "chr"
|
||||||
|
|
||||||
|
Level 3 adds (per channel):
|
||||||
|
- "tns_coeffs"
|
||||||
|
- "T" thresholds (not entropy-coded)
|
||||||
|
- "G" global gain(s)
|
||||||
|
- "sfc" Huffman-coded scalefactor differences
|
||||||
|
- "stream" Huffman-coded MDCT quantized symbols
|
||||||
|
- "codebook" Huffman codebook for MDCT symbols
|
||||||
|
"""
|
||||||
|
|||||||
Binary file not shown.
@ -1,400 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
import scipy.io as sio
|
|
||||||
import os
|
|
||||||
|
|
||||||
# ------------------ LOAD LUT ------------------
|
|
||||||
|
|
||||||
def load_LUT(mat_filename=None):
|
|
||||||
"""
|
|
||||||
Loads the list of Huffman Codebooks (LUTs)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
huffLUT : list (index 1..11 used, index 0 unused)
|
|
||||||
"""
|
|
||||||
if mat_filename is None:
|
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
mat_filename = os.path.join(current_dir, "huffCodebooks.mat")
|
|
||||||
|
|
||||||
mat = sio.loadmat(mat_filename)
|
|
||||||
|
|
||||||
|
|
||||||
huffCodebooks_raw = mat['huffCodebooks'].squeeze()
|
|
||||||
|
|
||||||
huffCodebooks = []
|
|
||||||
for i in range(11):
|
|
||||||
huffCodebooks.append(np.array(huffCodebooks_raw[i]))
|
|
||||||
|
|
||||||
# Build inverse VLC tables
|
|
||||||
invTable = [None] * 11
|
|
||||||
|
|
||||||
for i in range(11):
|
|
||||||
h = huffCodebooks[i][:, 2].astype(int) # column 3
|
|
||||||
hlength = huffCodebooks[i][:, 1].astype(int) # column 2
|
|
||||||
|
|
||||||
hbin = []
|
|
||||||
for j in range(len(h)):
|
|
||||||
hbin.append(format(h[j], f'0{hlength[j]}b'))
|
|
||||||
|
|
||||||
invTable[i] = vlc_table(hbin)
|
|
||||||
|
|
||||||
# Build Huffman LUT dicts
|
|
||||||
huffLUT = [None] * 12 # index 0 unused
|
|
||||||
params = [
|
|
||||||
(4, 1, True),
|
|
||||||
(4, 1, True),
|
|
||||||
(4, 2, False),
|
|
||||||
(4, 2, False),
|
|
||||||
(2, 4, True),
|
|
||||||
(2, 4, True),
|
|
||||||
(2, 7, False),
|
|
||||||
(2, 7, False),
|
|
||||||
(2, 12, False),
|
|
||||||
(2, 12, False),
|
|
||||||
(2, 16, False),
|
|
||||||
]
|
|
||||||
|
|
||||||
for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1):
|
|
||||||
huffLUT[i] = {
|
|
||||||
'LUT': huffCodebooks[i-1],
|
|
||||||
'invTable': invTable[i-1],
|
|
||||||
'codebook': i,
|
|
||||||
'nTupleSize': nTupleSize,
|
|
||||||
'maxAbsCodeVal': maxAbs,
|
|
||||||
'signedValues': signed
|
|
||||||
}
|
|
||||||
|
|
||||||
return huffLUT
|
|
||||||
|
|
||||||
def vlc_table(code_array):
|
|
||||||
"""
|
|
||||||
codeArray: list of strings, each string is a Huffman codeword (e.g. '0101')
|
|
||||||
returns:
|
|
||||||
h : NumPy array of shape (num_nodes, 3)
|
|
||||||
columns:
|
|
||||||
[ next_if_0 , next_if_1 , symbol_index ]
|
|
||||||
"""
|
|
||||||
h = np.zeros((1, 3), dtype=int)
|
|
||||||
|
|
||||||
for code_index, code in enumerate(code_array, start=1):
|
|
||||||
word = [int(bit) for bit in code]
|
|
||||||
h_index = 0
|
|
||||||
|
|
||||||
for bit in word:
|
|
||||||
k = bit
|
|
||||||
next_node = h[h_index, k]
|
|
||||||
if next_node == 0:
|
|
||||||
h = np.vstack([h, [0, 0, 0]])
|
|
||||||
new_index = h.shape[0] - 1
|
|
||||||
h[h_index, k] = new_index
|
|
||||||
h_index = new_index
|
|
||||||
else:
|
|
||||||
h_index = next_node
|
|
||||||
|
|
||||||
h[h_index, 2] = code_index
|
|
||||||
|
|
||||||
return h
|
|
||||||
|
|
||||||
# ------------------ ENCODE ------------------
|
|
||||||
|
|
||||||
def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None):
|
|
||||||
"""
|
|
||||||
Huffman-encode a sequence of quantized coefficients.
|
|
||||||
|
|
||||||
This function selects the appropriate Huffman codebook based on the
|
|
||||||
maximum absolute value of the input coefficients, encodes the coefficients
|
|
||||||
into a binary Huffman bitstream, and returns both the bitstream and the
|
|
||||||
selected codebook index.
|
|
||||||
|
|
||||||
This is the Python equivalent of the MATLAB `encodeHuff.m` function used
|
|
||||||
in audio/image coding (e.g., scale factor band encoding). The input
|
|
||||||
coefficient sequence is grouped into fixed-size tuples as defined by
|
|
||||||
the chosen Huffman LUT. Zero-padding may be applied internally.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
coeff_sec : array_like of int
|
|
||||||
1-D array of quantized integer coefficients to encode.
|
|
||||||
Typically corresponds to a "section" or scale-factor band.
|
|
||||||
|
|
||||||
huff_LUT_list : list
|
|
||||||
List of Huffman lookup-table dictionaries as returned by `loadLUT()`.
|
|
||||||
Index 1..11 correspond to valid Huffman codebooks.
|
|
||||||
Index 0 is unused.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
huffSec : str
|
|
||||||
Huffman-encoded bitstream represented as a string of '0' and '1'
|
|
||||||
characters.
|
|
||||||
|
|
||||||
huffCodebook : int
|
|
||||||
Index (1..11) of the Huffman codebook used for encoding.
|
|
||||||
A value of 0 indicates a special all-zero section.
|
|
||||||
"""
|
|
||||||
if force_codebook is not None:
|
|
||||||
return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec)
|
|
||||||
|
|
||||||
maxAbsVal = np.max(np.abs(coeff_sec))
|
|
||||||
|
|
||||||
if maxAbsVal == 0:
|
|
||||||
huffCodebook = 0
|
|
||||||
huffSec = huff_LUT_code_0()
|
|
||||||
|
|
||||||
elif maxAbsVal == 1:
|
|
||||||
candidates = [1, 2]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal == 2:
|
|
||||||
candidates = [3, 4]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal in (3, 4):
|
|
||||||
candidates = [5, 6]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal in (5, 6, 7):
|
|
||||||
candidates = [7, 8]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal in (8, 9, 10, 11, 12):
|
|
||||||
candidates = [9, 10]
|
|
||||||
huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec)
|
|
||||||
huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec)
|
|
||||||
if len(huffSec1) <= len(huffSec2):
|
|
||||||
huffSec = huffSec1
|
|
||||||
huffCodebook = candidates[0]
|
|
||||||
else:
|
|
||||||
huffSec = huffSec2
|
|
||||||
huffCodebook = candidates[1]
|
|
||||||
|
|
||||||
elif maxAbsVal in (13, 14, 15):
|
|
||||||
huffCodebook = 11
|
|
||||||
huffSec = huff_LUT_code_1(huff_LUT_list[huffCodebook], coeff_sec)
|
|
||||||
|
|
||||||
else:
|
|
||||||
huffCodebook = 11
|
|
||||||
huffSec = huff_LUT_code_ESC(huff_LUT_list[huffCodebook], coeff_sec)
|
|
||||||
|
|
||||||
return huffSec, huffCodebook
|
|
||||||
|
|
||||||
def huff_LUT_code_1(huff_LUT, coeff_sec):
|
|
||||||
LUT = huff_LUT['LUT']
|
|
||||||
nTupleSize = huff_LUT['nTupleSize']
|
|
||||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
|
||||||
signedValues = huff_LUT['signedValues']
|
|
||||||
|
|
||||||
numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
|
|
||||||
|
|
||||||
if signedValues:
|
|
||||||
coeff = coeff_sec + maxAbsCodeVal
|
|
||||||
base = 2 * maxAbsCodeVal + 1
|
|
||||||
else:
|
|
||||||
coeff = coeff_sec
|
|
||||||
base = maxAbsCodeVal + 1
|
|
||||||
|
|
||||||
coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
|
|
||||||
coeffPad[:len(coeff)] = coeff
|
|
||||||
|
|
||||||
huffSec = []
|
|
||||||
|
|
||||||
powers = base ** np.arange(nTupleSize - 1, -1, -1)
|
|
||||||
|
|
||||||
for i in range(numTuples):
|
|
||||||
nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
|
|
||||||
huffIndex = int(np.abs(nTuple) @ powers)
|
|
||||||
|
|
||||||
hexVal = LUT[huffIndex, 2]
|
|
||||||
huffLen = LUT[huffIndex, 1]
|
|
||||||
|
|
||||||
bits = format(int(hexVal), f'0{int(huffLen)}b')
|
|
||||||
|
|
||||||
if signedValues:
|
|
||||||
huffSec.append(bits)
|
|
||||||
else:
|
|
||||||
signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
|
|
||||||
huffSec.append(bits + signBits)
|
|
||||||
|
|
||||||
return ''.join(huffSec)
|
|
||||||
|
|
||||||
def huff_LUT_code_0():
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def huff_LUT_code_ESC(huff_LUT, coeff_sec):
|
|
||||||
LUT = huff_LUT['LUT']
|
|
||||||
nTupleSize = huff_LUT['nTupleSize']
|
|
||||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
|
||||||
|
|
||||||
numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
|
|
||||||
base = maxAbsCodeVal + 1
|
|
||||||
|
|
||||||
coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
|
|
||||||
coeffPad[:len(coeff_sec)] = coeff_sec
|
|
||||||
|
|
||||||
huffSec = []
|
|
||||||
powers = base ** np.arange(nTupleSize - 1, -1, -1)
|
|
||||||
|
|
||||||
for i in range(numTuples):
|
|
||||||
nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
|
|
||||||
|
|
||||||
lnTuple = nTuple.astype(float)
|
|
||||||
lnTuple[lnTuple == 0] = np.finfo(float).eps
|
|
||||||
|
|
||||||
N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int))
|
|
||||||
N = np.maximum(0, N4 - 4)
|
|
||||||
esc = np.abs(nTuple) > 15
|
|
||||||
|
|
||||||
nTupleESC = nTuple.copy()
|
|
||||||
nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16
|
|
||||||
|
|
||||||
huffIndex = int(np.abs(nTupleESC) @ powers)
|
|
||||||
|
|
||||||
hexVal = LUT[huffIndex, 2]
|
|
||||||
huffLen = LUT[huffIndex, 1]
|
|
||||||
|
|
||||||
bits = format(int(hexVal), f'0{int(huffLen)}b')
|
|
||||||
|
|
||||||
escSeq = ''
|
|
||||||
for k in range(nTupleSize):
|
|
||||||
if esc[k]:
|
|
||||||
escSeq += '1' * N[k]
|
|
||||||
escSeq += '0'
|
|
||||||
escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b')
|
|
||||||
|
|
||||||
signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
|
|
||||||
huffSec.append(bits + signBits + escSeq)
|
|
||||||
|
|
||||||
return ''.join(huffSec)
|
|
||||||
|
|
||||||
# ------------------ DECODE ------------------
|
|
||||||
|
|
||||||
def decode_huff(huff_sec, huff_LUT):
|
|
||||||
"""
|
|
||||||
Decode a Huffman-encoded stream.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
huff_sec : array-like of int or str
|
|
||||||
Huffman encoded stream as a sequence of 0 and 1 (string or list/array).
|
|
||||||
huff_LUT : dict
|
|
||||||
Huffman lookup table with keys:
|
|
||||||
- 'invTable': inverse table (numpy array)
|
|
||||||
- 'codebook': codebook number
|
|
||||||
- 'nTupleSize': tuple size
|
|
||||||
- 'maxAbsCodeVal': maximum absolute code value
|
|
||||||
- 'signedValues': True/False
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
decCoeffs : list of int
|
|
||||||
Decoded quantized coefficients.
|
|
||||||
"""
|
|
||||||
|
|
||||||
h = huff_LUT['invTable']
|
|
||||||
huffCodebook = huff_LUT['codebook']
|
|
||||||
nTupleSize = huff_LUT['nTupleSize']
|
|
||||||
maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
|
|
||||||
signedValues = huff_LUT['signedValues']
|
|
||||||
|
|
||||||
# Convert string to array of ints
|
|
||||||
if isinstance(huff_sec, str):
|
|
||||||
huff_sec = np.array([int(b) for b in huff_sec])
|
|
||||||
|
|
||||||
eos = False
|
|
||||||
decCoeffs = []
|
|
||||||
streamIndex = 0
|
|
||||||
|
|
||||||
while not eos:
|
|
||||||
wordbit = 0
|
|
||||||
r = 0 # start at root
|
|
||||||
|
|
||||||
# Decode Huffman word using inverse table
|
|
||||||
while True:
|
|
||||||
b = huff_sec[streamIndex + wordbit]
|
|
||||||
wordbit += 1
|
|
||||||
rOld = r
|
|
||||||
r = h[rOld, b]
|
|
||||||
if h[r, 0] == 0 and h[r, 1] == 0:
|
|
||||||
symbolIndex = h[r, 2] - 1 # zero-based
|
|
||||||
streamIndex += wordbit
|
|
||||||
break
|
|
||||||
|
|
||||||
# Decode n-tuple magnitudes
|
|
||||||
if signedValues:
|
|
||||||
base = 2 * maxAbsCodeVal + 1
|
|
||||||
nTupleDec = []
|
|
||||||
tmp = symbolIndex
|
|
||||||
for p in reversed(range(nTupleSize)):
|
|
||||||
val = tmp // (base ** p)
|
|
||||||
nTupleDec.append(val - maxAbsCodeVal)
|
|
||||||
tmp = tmp % (base ** p)
|
|
||||||
nTupleDec = np.array(nTupleDec)
|
|
||||||
else:
|
|
||||||
base = maxAbsCodeVal + 1
|
|
||||||
nTupleDec = []
|
|
||||||
tmp = symbolIndex
|
|
||||||
for p in reversed(range(nTupleSize)):
|
|
||||||
val = tmp // (base ** p)
|
|
||||||
nTupleDec.append(val)
|
|
||||||
tmp = tmp % (base ** p)
|
|
||||||
nTupleDec = np.array(nTupleDec)
|
|
||||||
|
|
||||||
# Apply sign bits
|
|
||||||
nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize]
|
|
||||||
nTupleSign = -(np.sign(nTupleSignBits - 0.5))
|
|
||||||
streamIndex += nTupleSize
|
|
||||||
nTupleDec = nTupleDec * nTupleSign
|
|
||||||
|
|
||||||
# Handle escape sequences
|
|
||||||
escIndex = np.where(np.abs(nTupleDec) == 16)[0]
|
|
||||||
if huffCodebook == 11 and escIndex.size > 0:
|
|
||||||
for idx in escIndex:
|
|
||||||
N = 0
|
|
||||||
b = huff_sec[streamIndex]
|
|
||||||
while b:
|
|
||||||
N += 1
|
|
||||||
b = huff_sec[streamIndex + N]
|
|
||||||
streamIndex += N
|
|
||||||
N4 = N + 4
|
|
||||||
escape_word = huff_sec[streamIndex:streamIndex + N4]
|
|
||||||
escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2)
|
|
||||||
nTupleDec[idx] = escape_value
|
|
||||||
streamIndex += N4 + 1
|
|
||||||
# Apply signs again
|
|
||||||
nTupleDec[escIndex] *= nTupleSign[escIndex]
|
|
||||||
|
|
||||||
decCoeffs.extend(nTupleDec.tolist())
|
|
||||||
|
|
||||||
if streamIndex >= len(huff_sec):
|
|
||||||
eos = True
|
|
||||||
|
|
||||||
return decCoeffs
|
|
||||||
|
|
||||||
|
|
||||||
@ -2,3 +2,6 @@
|
|||||||
pythonpath = .
|
pythonpath = .
|
||||||
testpaths =
|
testpaths =
|
||||||
core/tests
|
core/tests
|
||||||
|
|
||||||
|
filterwarnings =
|
||||||
|
error::RuntimeWarning
|
||||||
Loading…
x
Reference in New Issue
Block a user