# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 1 AAC encoder orchestration.
# Keeps the same functional behavior as the original level_1 implementation:
# - Reads WAV via soundfile
# - Validates stereo and 48 kHz
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
# - SSC decision uses next-frame attack detection
# - Filterbank analysis (MDCT)
# - Stores per-channel spectra in AACSeq1 schema:
# * ESH: (128, 8)
# * else: (1024, 1)
# ------------------------------------------------------------
from __future__ import annotations

from pathlib import Path
from typing import Union

import numpy as np
import soundfile as sf
from scipy.io import savemat

from core.aac_configuration import WIN_TYPE
from core.aac_filterbank import aac_filter_bank
from core.aac_ssc import aac_ssc
from core.aac_tns import aac_tns
from core.aac_psycho import aac_psycho
from core.aac_quantizer import aac_quantizer  # assumes your quantizer file is core/aac_quantizer.py
from core.aac_huffman import aac_encode_huff
from core.aac_utils import get_table, band_limits
from material.huff_utils import load_LUT
from core.aac_types import *
# -----------------------------------------------------------------------------
# Helpers for thresholds (T(b))
# -----------------------------------------------------------------------------
def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]:
    """
    Return the inclusive (lo, hi) band index slices derived from TableB219.

    The table selected depends on the frame type (short vs. long windows).
    """
    psy_table, _ = get_table(frame_type)
    lows, highs, _bval, _qthr_db = band_limits(psy_table)
    slices: list[tuple[int, int]] = []
    for lo, hi in zip(lows, highs):
        slices.append((int(lo), int(hi)))
    return slices
def _thresholds_from_smr(
    frame_F_ch: FrameChannelF,
    frame_type: FrameType,
    SMR: FloatArray,
) -> FloatArray:
    """
    Compute psychoacoustic thresholds T(b) = P(b) / SMR(b).

    P(b) is the spectral energy of band b (inclusive TableB219 limits).
    An SMR at or below 1e-12 yields T = 0 to avoid a division blow-up.

    Returns
    -------
    FloatArray
        (NB, 8) for ESH frames, (NB, 1) otherwise.

    Raises
    ------
    ValueError
        If frame_F_ch or SMR does not match the expected shape for frame_type.
    """
    bands = _band_slices_from_table(frame_type)
    NB = len(bands)
    X = np.asarray(frame_F_ch, dtype=np.float64)
    smr_arr = np.asarray(SMR, dtype=np.float64)

    def _ratio(power: float, smr_val: float) -> float:
        # Guard against zero/negative SMR values.
        return 0.0 if smr_val <= 1e-12 else power / smr_val

    if frame_type == "ESH":
        if X.shape != (128, 8):
            raise ValueError("For ESH, frame_F_ch must have shape (128, 8).")
        if smr_arr.shape != (NB, 8):
            raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")
        T = np.zeros((NB, 8), dtype=np.float64)
        for b, (lo, hi) in enumerate(bands):
            for j in range(8):
                power = float(np.sum(X[lo : hi + 1, j] ** 2))
                T[b, j] = _ratio(power, float(smr_arr[b, j]))
        return T

    # Long windows: accept either a flat (1024,) vector or a (1024, 1) column.
    if X.shape == (1024,):
        spectrum = X
    elif X.shape == (1024, 1):
        spectrum = X[:, 0]
    else:
        raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).")
    if smr_arr.shape == (NB,):
        smr_vec = smr_arr
    elif smr_arr.shape == (NB, 1):
        smr_vec = smr_arr[:, 0]
    else:
        raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).")
    T = np.zeros((NB, 1), dtype=np.float64)
    for b, (lo, hi) in enumerate(bands):
        power = float(np.sum(spectrum[lo : hi + 1] ** 2))
        T[b, 0] = _ratio(power, float(smr_vec[b]))
    return T
def _normalize_global_gain(G: GlobalGain) -> float | FloatArray:
"""
Normalize GlobalGain to match AACChannelFrameF3["G"] type:
- long: return float
- ESH: return float64 ndarray of shape (1, 8)
"""
if np.isscalar(G):
return float(G)
G_arr = np.asarray(G)
if G_arr.size == 1:
return float(G_arr.reshape(-1)[0])
return np.asarray(G_arr, dtype=np.float64)
# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
    """
    Load a WAV file via soundfile and enforce the Level-1 input contract.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Path of the WAV file to read.

    Returns
    -------
    x : StereoSignal (np.ndarray)
        float64 samples, shape (N, 2).
    fs : int
        Sampling rate in Hz (always 48000 on success).

    Raises
    ------
    ValueError
        When the file is not 2-channel or is not sampled at 48 kHz.
    """
    path = Path(filename_in)
    samples, rate = sf.read(str(path), always_2d=True)
    samples = np.asarray(samples, dtype=np.float64)
    if samples.shape[1] != 2:
        raise ValueError("Input must be stereo (2 channels).")
    rate = int(rate)
    if rate != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")
    return samples, rate
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split the packed stereo filterbank output into per-channel arrays
    following the Level-1 AACSeq1 schema.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        Output of aac_filter_bank():
        - (1024, 2) for long windows
        - (128, 16) for ESH, interleaved as [L0 R0 L1 R1 ... L7 R7]

    Returns
    -------
    chl_f, chr_f : FrameChannelF
        Left/right coefficients: (128, 8) for ESH, (1024, 1) otherwise.

    Raises
    ------
    ValueError
        If frame_f does not match the expected shape for frame_type.
    """
    if frame_type == "ESH":
        if frame_f.shape != (128, 16):
            raise ValueError("For ESH, frame_f must have shape (128, 16).")
        # Even columns carry the left sub-blocks, odd columns the right ones.
        # np.array(...) forces fresh (copied) float64 arrays, matching the
        # original per-column copy loop.
        left = np.array(frame_f[:, 0::2], dtype=np.float64)
        right = np.array(frame_f[:, 1::2], dtype=np.float64)
        return left, right
    # Non-ESH: keep the (1024, 1) column shape required by the Level-1 schema.
    if frame_f.shape != (1024, 2):
        raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")
    left = frame_f[:, 0:1].astype(np.float64, copy=False)
    right = frame_f[:, 1:2].astype(np.float64, copy=False)
    return left, right
# -----------------------------------------------------------------------------
# Level 1 encoder
# -----------------------------------------------------------------------------
def aac_coder_1(
    filename_in: Union[str, Path],
    verbose: bool = False
) -> AACSeq1:
    """
    Level-1 AAC encoder.

    This function preserves the behavior of the original level_1 implementation:
      - Read stereo 48 kHz WAV
      - Pad hop samples at start and hop samples at end
      - Frame with win=2048, hop=1024
      - Use SSC with next-frame lookahead
      - Apply filterbank analysis
      - Store per-channel coefficients using AACSeq1 schema

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.
        Assumption: stereo audio, sampling rate 48 kHz.
    verbose : bool
        Optional argument to print encoding status

    Returns
    -------
    AACSeq1
        List of encoded frames (Level 1 schema).

    Raises
    ------
    ValueError
        If the input is not stereo/48 kHz, or too short to frame.
    """
    x, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment assumes 48 kHz
    hop = 1024
    win = 2048
    # Pad at the beginning to support the first overlap region.
    # Tail padding is kept minimal; next-frame is padded on-the-fly when needed.
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])
    # Number of frames such that current frame fits; next frame will be padded if needed.
    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")
    aac_seq: AACSeq1 = []
    prev_frame_type: FrameType = "OLS"
    # BUGFIX: K // 20 is 0 for short inputs (K < 20), so "i % (K // 20)"
    # raised ZeroDivisionError when verbose=True. Clamp the step to >= 1.
    progress_step = max(1, K // 20)
    if verbose:
        print("Encoding ", end="", flush=True)
    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            # This should not happen due to K definition, but keep it explicit.
            raise ValueError("Internal framing error: frame_t has wrong shape.")
        next_t = x_pad[start + hop:start + hop + win, :]
        # Ensure next_t is always (2048, 2) by zero-padding at the tail.
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])
        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
        frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
        aac_seq.append({
            "frame_type": frame_type,
            "win_type": WIN_TYPE,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        })
        prev_frame_type = frame_type
        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)
    if verbose:
        print(" done")
    return aac_seq
def aac_coder_2(
    filename_in: Union[str, Path],
    verbose: bool = False
) -> AACSeq2:
    """
    Level-2 AAC encoder (Level 1 + TNS).

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    verbose : bool
        Optional argument to print encoding status

    Returns
    -------
    AACSeq2
        Encoded AAC sequence (Level 2 payload schema).
        For each frame i:
          - "frame_type": FrameType
          - "win_type": WinType
          - "chl"/"chr":
              - "frame_F": FrameChannelF (after TNS)
              - "tns_coeffs": TnsCoeffs

    Raises
    ------
    ValueError
        If the input is not stereo/48 kHz, or too short to frame.
    """
    filename_in = Path(filename_in)
    x, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment assumes 48 kHz
    hop = 1024
    win = 2048
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])
    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")
    aac_seq: AACSeq2 = []
    prev_frame_type: FrameType = "OLS"
    # BUGFIX: K // 20 is 0 for short inputs (K < 20), so "i % (K // 20)"
    # raised ZeroDivisionError when verbose=True. Clamp the step to >= 1.
    progress_step = max(1, K // 20)
    if verbose:
        print("Encoding ", end="", flush=True)
    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start : start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")
        next_t = x_pad[start + hop : start + hop + win, :]
        # Ensure next_t is always (2048, 2) by zero-padding at the tail.
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])
        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
        # Level 1 analysis (packed stereo container)
        frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
        # Level 2: apply TNS per channel
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)
        aac_seq.append(
            {
                "frame_type": frame_type,
                "win_type": WIN_TYPE,
                "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
                "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
            }
        )
        prev_frame_type = frame_type
        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)
    if verbose:
        print(" done")
    return aac_seq
def aac_coder_3(
    filename_in: Union[str, Path],
    filename_aac_coded: Union[str, Path] | None = None,
    verbose: bool = False,
) -> AACSeq3:
    """
    Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    filename_aac_coded : Union[str, Path] | None
        Optional .mat filename to store aac_seq_3 (assignment convenience).
    verbose : bool
        Optional argument to print encoding status

    Returns
    -------
    AACSeq3
        Encoded AAC sequence (Level 3 payload schema).

    Raises
    ------
    ValueError
        If the input is not stereo/48 kHz, or too short to frame.
    """
    filename_in = Path(filename_in)
    x, _ = aac_read_wav_stereo_48k(filename_in)
    hop = 1024
    win = 2048
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])
    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")
    # Load Huffman LUTs once.
    huff_LUT_list = load_LUT()
    aac_seq: AACSeq3 = []
    prev_frame_type: FrameType = "OLS"
    # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
    prev1_L = np.zeros((2048,), dtype=np.float64)
    prev2_L = np.zeros((2048,), dtype=np.float64)
    prev1_R = np.zeros((2048,), dtype=np.float64)
    prev2_R = np.zeros((2048,), dtype=np.float64)
    # BUGFIX: K // 20 is 0 for short inputs (K < 20), so "i % (K // 20)"
    # raised ZeroDivisionError when verbose=True. Clamp the step to >= 1.
    progress_step = max(1, K // 20)
    if verbose:
        print("Encoding ", end="", flush=True)
    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start : start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")
        next_t = x_pad[start + hop : start + hop + win, :]
        # Ensure next_t is always (2048, 2) by zero-padding at the tail.
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])
        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
        # Analysis filterbank (stereo packed)
        frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
        # TNS per channel
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)
        # Psychoacoustic model per channel (time-domain)
        frame_L = np.asarray(frame_t[:, 0], dtype=np.float64)
        frame_R = np.asarray(frame_t[:, 1], dtype=np.float64)
        SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L)
        SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R)
        # Thresholds T(b) (stored, not entropy-coded)
        T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L)
        T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R)
        # Quantizer per channel
        S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L)
        S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R)
        # Normalize G types for AACSeq3 schema (float | float64 ndarray).
        G_Ln = _normalize_global_gain(G_L)
        G_Rn = _normalize_global_gain(G_R)
        # Huffman-code ONLY the DPCM differences for b>0.
        # sfc[0] corresponds to alpha(0)=G and is stored separately in the frame.
        sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
        sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
        # NOTE(review): codebook 11 reserves maxAbsCodeVal = 16 for ESCAPE.
        # A previous revision clipped the DPCM values to [-15, 15] here (and
        # forced codebook 11) to avoid escape decoding; that clipping is
        # currently disabled, and the check below only reports deviations.
        sfc_L_stream, cb_sfc_L = aac_encode_huff(
            sfc_L_dpcm.reshape(-1, order="F"),
            huff_LUT_list,
        )
        sfc_R_stream, cb_sfc_R = aac_encode_huff(
            sfc_R_dpcm.reshape(-1, order="F"),
            huff_LUT_list,
        )
        # Diagnostic: scale-factor streams are expected to use codebook 11.
        if cb_sfc_L != 11 or cb_sfc_R != 11:
            print(f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
        mdct_L_stream, cb_L = aac_encode_huff(
            np.asarray(S_L, dtype=np.int64).reshape(-1),
            huff_LUT_list,
        )
        mdct_R_stream, cb_R = aac_encode_huff(
            np.asarray(S_R, dtype=np.int64).reshape(-1),
            huff_LUT_list,
        )
        # Typed dict construction helps static analyzers validate the schema.
        frame_out: AACSeq3Frame = {
            "frame_type": frame_type,
            "win_type": WIN_TYPE,
            "chl": {
                "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
                "T": np.asarray(T_L, dtype=np.float64),
                "G": G_Ln,
                "sfc": sfc_L_stream,
                "stream": mdct_L_stream,
                "codebook": int(cb_L),
            },
            "chr": {
                "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64),
                "T": np.asarray(T_R, dtype=np.float64),
                "G": G_Rn,
                "sfc": sfc_R_stream,
                "stream": mdct_R_stream,
                "codebook": int(cb_R),
            },
        }
        aac_seq.append(frame_out)
        # Update psycho history (shift register)
        prev2_L = prev1_L
        prev1_L = frame_L
        prev2_R = prev1_R
        prev1_R = frame_R
        prev_frame_type = frame_type
        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)
    if verbose:
        print(" done")
    # Optional: store to .mat for the assignment wrapper
    if filename_aac_coded is not None:
        filename_aac_coded = Path(filename_aac_coded)
        savemat(
            str(filename_aac_coded),
            {"aac_seq_3": np.array(aac_seq, dtype=object)},
            do_compression=True,
        )
    return aac_seq