Level 3: First positive SNR version

This commit is contained in:
Christos Choutouridis 2026-02-15 21:16:54 +02:00
parent 4ebee28e4e
commit cd2b89bd73
28 changed files with 5043 additions and 275 deletions

View File

@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) ->
# Level 1 encoder # Level 1 encoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: def aac_coder_1(
filename_in: Union[str, Path],
verbose: bool = False
) -> AACSeq1:
""" """
Level-1 AAC encoder. Level-1 AAC encoder.
@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
filename_in : Union[str, Path] filename_in : Union[str, Path]
Input WAV filename. Input WAV filename.
Assumption: stereo audio, sampling rate 48 kHz. Assumption: stereo audio, sampling rate 48 kHz.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
aac_seq: AACSeq1 = [] aac_seq: AACSeq1 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
win_type: WinType = WIN_TYPE if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
next_t = np.vstack([next_t, tail]) next_t = np.vstack([next_t, tail])
frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
frame_f = aac_filter_bank(frame_t, frame_type, win_type) frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
aac_seq.append({ aac_seq.append({
"frame_type": frame_type, "frame_type": frame_type,
"win_type": win_type, "win_type": WIN_TYPE,
"chl": {"frame_F": chl_f}, "chl": {"frame_F": chl_f},
"chr": {"frame_F": chr_f}, "chr": {"frame_F": chr_f},
}) })
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
return aac_seq return aac_seq
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: def aac_coder_2(
filename_in: Union[str, Path],
verbose: bool = False
) -> AACSeq2:
""" """
Level-2 AAC encoder (Level 1 + TNS). Level-2 AAC encoder (Level 1 + TNS).
@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
---------- ----------
filename_in : Union[str, Path] filename_in : Union[str, Path]
Input WAV filename (stereo, 48 kHz). Input WAV filename (stereo, 48 kHz).
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
aac_seq: AACSeq2 = [] aac_seq: AACSeq2 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
# Level 1 analysis (packed stereo container) # Level 1 analysis (packed stereo container)
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
# Unpack to per-channel (as you already do in Level 1) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
if frame_type == "ESH":
chl_f = np.empty((128, 8), dtype=np.float64)
chr_f = np.empty((128, 8), dtype=np.float64)
for j in range(8):
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
else:
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
# Level 2: apply TNS per channel # Level 2: apply TNS per channel
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
@ -370,8 +378,12 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
} }
) )
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
return aac_seq return aac_seq
@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
def aac_coder_3( def aac_coder_3(
filename_in: Union[str, Path], filename_in: Union[str, Path],
filename_aac_coded: Union[str, Path] | None = None, filename_aac_coded: Union[str, Path] | None = None,
verbose: bool = False,
) -> AACSeq3: ) -> AACSeq3:
""" """
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
@ -389,6 +402,8 @@ def aac_coder_3(
Input WAV filename (stereo, 48 kHz). Input WAV filename (stereo, 48 kHz).
filename_aac_coded : Union[str, Path] | None filename_aac_coded : Union[str, Path] | None
Optional .mat filename to store aac_seq_3 (assignment convenience). Optional .mat filename to store aac_seq_3 (assignment convenience).
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -416,15 +431,14 @@ def aac_coder_3(
aac_seq: AACSeq3 = [] aac_seq: AACSeq3 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
# Pin win_type to the WinType literal for type checkers.
win_type: WinType = WIN_TYPE
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
prev1_L = np.zeros((2048,), dtype=np.float64) prev1_L = np.zeros((2048,), dtype=np.float64)
prev2_L = np.zeros((2048,), dtype=np.float64) prev2_L = np.zeros((2048,), dtype=np.float64)
prev1_R = np.zeros((2048,), dtype=np.float64) prev1_R = np.zeros((2048,), dtype=np.float64)
prev2_R = np.zeros((2048,), dtype=np.float64) prev2_R = np.zeros((2048,), dtype=np.float64)
if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -440,7 +454,7 @@ def aac_coder_3(
frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
# Analysis filterbank (stereo packed) # Analysis filterbank (stereo packed)
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
# TNS per channel # TNS per channel
@ -474,32 +488,35 @@ def aac_coder_3(
# Codebook 11: # Codebook 11:
# maxAbsCodeVal = 16 is RESERVED for ESCAPE. # maxAbsCodeVal = 16 is RESERVED for ESCAPE.
# We must stay strictly within [-15, +15] to avoid escape decoding. # We must stay strictly within [-15, +15] to avoid escape decoding.
sf_cb = 11 # sf_cb = 11
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
#
# sfc_L_dpcm = np.clip(
# sfc_L_dpcm,
# -sf_max_abs,
# sf_max_abs,
# ).astype(np.int64, copy=False)
#
# sfc_R_dpcm = np.clip(
# sfc_R_dpcm,
# -sf_max_abs,
# sf_max_abs,
# ).astype(np.int64, copy=False)
sfc_L_dpcm = np.clip( sfc_L_stream, cb_sfc_L = aac_encode_huff(
sfc_L_dpcm,
-sf_max_abs,
sf_max_abs,
).astype(np.int64, copy=False)
sfc_R_dpcm = np.clip(
sfc_R_dpcm,
-sf_max_abs,
sf_max_abs,
).astype(np.int64, copy=False)
sfc_L_stream, _ = aac_encode_huff(
sfc_L_dpcm.reshape(-1, order="F"), sfc_L_dpcm.reshape(-1, order="F"),
huff_LUT_list, huff_LUT_list,
force_codebook=sf_cb, # force_codebook=11,
) )
sfc_R_stream, _ = aac_encode_huff( sfc_R_stream, cb_sfc_R = aac_encode_huff(
sfc_R_dpcm.reshape(-1, order="F"), sfc_R_dpcm.reshape(-1, order="F"),
huff_LUT_list, huff_LUT_list,
force_codebook=sf_cb, # force_codebook=11,
) )
if cb_sfc_L != 11 or cb_sfc_R != 11:
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
mdct_L_stream, cb_L = aac_encode_huff( mdct_L_stream, cb_L = aac_encode_huff(
np.asarray(S_L, dtype=np.int64).reshape(-1), np.asarray(S_L, dtype=np.int64).reshape(-1),
huff_LUT_list, huff_LUT_list,
@ -512,7 +529,7 @@ def aac_coder_3(
# Typed dict construction helps static analyzers validate the schema. # Typed dict construction helps static analyzers validate the schema.
frame_out: AACSeq3Frame = { frame_out: AACSeq3Frame = {
"frame_type": frame_type, "frame_type": frame_type,
"win_type": win_type, "win_type": WIN_TYPE,
"chl": { "chl": {
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
"T": np.asarray(T_L, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64),
@ -539,6 +556,11 @@ def aac_coder_3(
prev1_R = frame_R prev1_R = frame_R
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
# Optional: store to .mat for the assignment wrapper # Optional: store to .mat for the assignment wrapper
if filename_aac_coded is not None: if filename_aac_coded is not None:
@ -548,6 +570,5 @@ def aac_coder_3(
{"aac_seq_3": np.array(aac_seq, dtype=object)}, {"aac_seq_3": np.array(aac_seq, dtype=object)},
do_compression=True, do_compression=True,
) )
return aac_seq return aac_seq

View File

@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
# Level 1 decoder # Level 1 decoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_1(
aac_seq_1: AACSeq1,
filename_out: Union[str, Path],
verbose: bool = False
) -> StereoSignal:
""" """
Level-1 AAC decoder (inverse of aac_coder_1()). Level-1 AAC decoder (inverse of aac_coder_1()).
@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_1(). Encoded sequence as produced by aac_coder_1().
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Assumption: 48 kHz, stereo. Output WAV filename. Assumption: 48 kHz, stereo.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_1): for i, fr in enumerate(aac_seq_1):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start:start + win, :] += frame_t_hat y_pad[start:start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y: StereoSignal = aac_remove_padding(y_pad, hop=hop) y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
# Level 1 assumption: 48 kHz output. # Level 1 assumption: 48 kHz output.
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y
@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
# Level 2 decoder # Level 2 decoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_2(
aac_seq_2: AACSeq2,
filename_out: Union[str, Path],
verbose: bool = False
) -> StereoSignal:
""" """
Level-2 AAC decoder (inverse of aac_coder_2). Level-2 AAC decoder (inverse of aac_coder_2).
@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_2(). Encoded sequence as produced by aac_coder_2().
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Output WAV filename.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad = np.zeros((n_pad, 2), dtype=np.float64) y_pad = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_2): for i, fr in enumerate(aac_seq_2):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start : start + win, :] += frame_t_hat y_pad[start : start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y = aac_remove_padding(y_pad, hop=hop) y = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_3(
aac_seq_3: AACSeq3,
filename_out: Union[str, Path],
verbose: bool = False,
) -> StereoSignal:
""" """
Level-3 AAC decoder (inverse of aac_coder_3). Level-3 AAC decoder (inverse of aac_coder_3).
@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_3. Encoded sequence as produced by aac_coder_3.
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Output WAV filename.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad = np.zeros((n_pad, 2), dtype=np.float64) y_pad = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_3): for i, fr in enumerate(aac_seq_3):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start : start + win, :] += frame_t_hat y_pad[start : start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y = aac_remove_padding(y_pad, hop=hop) y = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y

View File

@ -316,8 +316,8 @@ def _psycho_one_window(
nb = en * bc nb = en * bc
# Threshold in quiet (convert from dB to power domain): # Threshold in quiet (convert from dB to power domain):
# qthr_power = (N/2) * 10^(qthr_db/10) # qthr_power = eps * (N/2) * 10^(qthr_db/10)
qthr_power = (N / 2.0) * (10.0 ** (qthr_db / 10.0)) qthr_power = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0))
# Final masking threshold per band: # Final masking threshold per band:
# np(b) = max(nb(b), qthr(b)) # np(b) = max(nb(b), qthr(b))

View File

@ -25,38 +25,64 @@ from core.aac_utils import snr_db
from core.aac_types import * from core.aac_types import *
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1 # -----------------------------------------------------------------------------
# Fixtures (small wav logic)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@pytest.fixture(scope="session")
def wav_in_path() -> Path:
"""
Provided input WAV used for end-to-end tests.
Expected project layout:
source/material/LicorDeCalandraca.wav
"""
return Path(__file__).resolve().parents[2] / "material" / "LicorDeCalandraca.wav"
@pytest.fixture() @pytest.fixture()
def tmp_stereo_wav(tmp_path: Path) -> Path: def mk_random_stereo_wav(tmp_path: Path, request: pytest.FixtureRequest) -> Path:
""" """
Create a temporary 48 kHz stereo WAV with random samples. Create a temporary 48 kHz stereo WAV with random samples.
Length (in seconds) must be provided via indirect parametrization.
""" """
length = float(request.param)
rng = np.random.default_rng(123) rng = np.random.default_rng(123)
fs = 48000 fs = 48000
# ~1 second of audio (kept small for test speed). n = int(fs * length)
n = fs
x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64) x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64)
wav_path = tmp_path / "in.wav" wav_path = tmp_path / "in_random.wav"
sf.write(str(wav_path), x, fs) sf.write(str(wav_path), x, fs)
return wav_path return wav_path
@pytest.fixture()
def mk_actual_stereo_wav(tmp_path: Path, wav_in_path: Path, request: pytest.FixtureRequest) -> Path:
"""
Create a temporary 48 kHz stereo WAV by chopping from the provided material WAV.
Length can be overridden via indirect parametrization.
"""
length = float(getattr(request, "param", 0.25)) # seconds (default: small)
x, fs = aac_read_wav_stereo_48k(wav_in_path)
n = int(fs * length)
x_short = x[:n, :]
wav_path = tmp_path / "in_actual.wav"
sf.write(str(wav_path), x_short, fs)
return wav_path
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Helper-function tests # Helper-function tests
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None: @pytest.mark.parametrize("mk_random_stereo_wav", [2.0], indirect=True)
""" def test_aac_read_wav_stereo_48k_roundtrip(mk_random_stereo_wav: Path) -> None:
Contract test for aac_read_wav_stereo_48k(): x, fs = aac_read_wav_stereo_48k(mk_random_stereo_wav)
- Reads stereo WAV
- Returns float64 array with shape (N,2)
- Returns fs = 48000
"""
x, fs = aac_read_wav_stereo_48k(tmp_stereo_wav)
assert int(fs) == 48000 assert int(fs) == 48000
assert isinstance(x, np.ndarray) assert isinstance(x, np.ndarray)
@ -67,10 +93,6 @@ def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None:
def test_aac_remove_padding_removes_hop_from_both_ends() -> None: def test_aac_remove_padding_removes_hop_from_both_ends() -> None:
"""
Contract test for aac_remove_padding():
- Removes 'hop' samples from start and end.
"""
hop = 1024 hop = 1024
n = 10000 n = 10000
@ -82,9 +104,6 @@ def test_aac_remove_padding_removes_hop_from_both_ends() -> None:
def test_aac_remove_padding_errors_on_too_short_input() -> None: def test_aac_remove_padding_errors_on_too_short_input() -> None:
"""
aac_remove_padding must raise if y_pad is shorter than 2*hop.
"""
hop = 1024 hop = 1024
y_pad: StereoSignal = np.zeros((2 * hop - 1, 2), dtype=np.float64) y_pad: StereoSignal = np.zeros((2 * hop - 1, 2), dtype=np.float64)
@ -95,12 +114,10 @@ def test_aac_remove_padding_errors_on_too_short_input() -> None:
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Level 1 tests # Level 1 tests
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
""" @pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True)
Module-level contract test: def test_aac_coder_seq_schema_and_shapes(mk_random_stereo_wav: Path) -> None:
Ensure aac_seq_1 follows the expected schema and per-frame shapes. aac_seq: AACSeq1 = aac_coder_1(mk_random_stereo_wav)
"""
aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav)
assert isinstance(aac_seq, list) assert isinstance(aac_seq, list)
assert len(aac_seq) > 0 assert len(aac_seq) > 0
@ -108,7 +125,6 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
for fr in aac_seq: for fr in aac_seq:
assert isinstance(fr, dict) assert isinstance(fr, dict)
# Required keys
assert "frame_type" in fr assert "frame_type" in fr
assert "win_type" in fr assert "win_type" in fr
assert "chl" in fr assert "chl" in fr
@ -136,41 +152,32 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
assert chr_f.shape == (1024, 1) assert chr_f.shape == (1024, 1)
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None: @pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True)
""" def test_end_to_end_aac_coder_decoder_high_snr(mk_random_stereo_wav: Path, tmp_path: Path) -> None:
End-to-end test: x_ref, fs = sf.read(str(mk_random_stereo_wav), always_2d=True)
Encode + decode and check SNR is very high (numerical-noise only).
The threshold is intentionally loose to avoid fragility across platforms/BLAS.
"""
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
x_ref = np.asarray(x_ref, dtype=np.float64) x_ref = np.asarray(x_ref, dtype=np.float64)
assert int(fs) == 48000 assert int(fs) == 48000
out_wav = tmp_path / "out.wav" out_wav = tmp_path / "out.wav"
aac_seq = aac_coder_1(tmp_stereo_wav) aac_seq = aac_coder_1(mk_random_stereo_wav)
x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav) x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav)
# Basic sanity: output file exists and is readable
assert out_wav.exists() assert out_wav.exists()
x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True) _, fs_hat = sf.read(str(out_wav), always_2d=True)
assert int(fs_hat) == 48000 assert int(fs_hat) == 48000
# SNR against returned array (file should match closely, but we do not require it here).
snr = snr_db(x_ref, x_hat) snr = snr_db(x_ref, x_hat)
assert snr > 80.0 assert snr > 80.0
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Level 2 tests (new) # Level 2 tests
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: @pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True)
""" def test_aac_coder_2_seq_schema_and_shapes(mk_random_stereo_wav: Path) -> None:
Module-level contract test (Level 2): aac_seq: AACSeq2 = aac_coder_2(mk_random_stereo_wav)
Ensure aac_seq_2 follows the expected schema and per-frame shapes, including tns_coeffs.
"""
aac_seq: AACSeq2 = aac_coder_2(tmp_stereo_wav)
assert isinstance(aac_seq, list) assert isinstance(aac_seq, list)
assert len(aac_seq) > 0 assert len(aac_seq) > 0
@ -194,27 +201,20 @@ def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
if frame_type == "ESH": if frame_type == "ESH":
assert frame_f.shape == (128, 8) assert frame_f.shape == (128, 8)
assert coeffs.shape[0] == 4 assert coeffs.shape == (4, 8)
assert coeffs.shape[1] == 8
else: else:
assert frame_f.shape == (1024, 1) assert frame_f.shape == (1024, 1)
assert coeffs.shape == (4, 1) assert coeffs.shape == (4, 1)
def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None: @pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True)
""" def test_end_to_end_level_2_high_snr(mk_random_stereo_wav: Path, tmp_path: Path) -> None:
End-to-end test (Level 2): x_ref, fs = sf.read(str(mk_random_stereo_wav), always_2d=True)
Encode + decode and check SNR remains very high.
Level 2 is still floating-point (TNS is reversible), so reconstruction
should remain numerical-noise only.
"""
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
x_ref = np.asarray(x_ref, dtype=np.float64) x_ref = np.asarray(x_ref, dtype=np.float64)
assert int(fs) == 48000 assert int(fs) == 48000
out_wav = tmp_path / "out_l2.wav" out_wav = tmp_path / "out_l2.wav"
aac_seq = aac_coder_2(tmp_stereo_wav) aac_seq = aac_coder_2(mk_random_stereo_wav)
x_hat: StereoSignal = aac_decoder_2(aac_seq, out_wav) x_hat: StereoSignal = aac_decoder_2(aac_seq, out_wav)
assert out_wav.exists() assert out_wav.exists()
@ -222,30 +222,14 @@ def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> No
assert int(fs_hat) == 48000 assert int(fs_hat) == 48000
snr = snr_db(x_ref, x_hat) snr = snr_db(x_ref, x_hat)
assert snr > 80 assert snr > 80.0
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Level 3 tests (Quantizer + Huffman) # Level 3 tests (Quantizer + Huffman)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@pytest.fixture(scope="module")
def wav_in_path() -> Path:
"""
Input WAV used for end-to-end tests.
This should point to the provided test audio under material/.
Adjust this path if your project layout differs.
"""
# Typical layout in this project:
# source/material/LicorDeCalandraca.wav
return Path(__file__).resolve().parents[2] / "material" / "LicorDeCalandraca.wav"
def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None: def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None:
"""
Validate Level-3 per-frame schema (keys + basic types only).
"""
assert "frame_type" in frame assert "frame_type" in frame
assert "win_type" in frame assert "win_type" in frame
assert "chl" in frame assert "chl" in frame
@ -264,37 +248,18 @@ def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None:
assert isinstance(ch["stream"], str) assert isinstance(ch["stream"], str)
assert isinstance(ch["codebook"], int) assert isinstance(ch["codebook"], int)
# Arrays: only check they are numpy arrays with expected dtype categories.
assert isinstance(ch["tns_coeffs"], np.ndarray) assert isinstance(ch["tns_coeffs"], np.ndarray)
assert isinstance(ch["T"], np.ndarray) assert isinstance(ch["T"], np.ndarray)
# Global gain: long frames may be scalar float, ESH may be ndarray
assert np.isscalar(ch["G"]) or isinstance(ch["G"], np.ndarray) assert np.isscalar(ch["G"]) or isinstance(ch["G"], np.ndarray)
def test_aac_coder_3_seq_schema_and_shapes(wav_in_path: Path, tmp_path: Path) -> None: @pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True)
def test_aac_coder_3_seq_schema_and_shapes(mk_actual_stereo_wav: Path) -> None:
""" """
Contract test: Uses a short WAV excerpt produced by mk_actual_stereo_wav.
- aac_coder_3 returns AACSeq3 0.11s is enough for a few frames at 48 kHz.
- Per-frame keys exist and types are consistent
- Basic shape expectations hold for ESH vs non-ESH cases
Note:
This test uses a short excerpt (a few frames) to keep runtime bounded.
""" """
# Use only a few frames to avoid long runtimes in the quantizer loop. aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav)
hop = 1024
win = 2048
n_frames = 4
n_samples = win + (n_frames - 1) * hop
x, fs = aac_read_wav_stereo_48k(wav_in_path)
x_short = x[:n_samples, :]
short_wav = tmp_path / "input_short.wav"
sf.write(str(short_wav), x_short, fs)
aac_seq_3: AACSeq3 = aac_coder_3(short_wav)
assert isinstance(aac_seq_3, list) assert isinstance(aac_seq_3, list)
assert len(aac_seq_3) > 0 assert len(aac_seq_3) > 0
@ -329,46 +294,21 @@ def test_aac_coder_3_seq_schema_and_shapes(wav_in_path: Path, tmp_path: Path) ->
else: else:
assert np.isscalar(G) assert np.isscalar(G)
assert isinstance(ch["sfc"], str)
assert isinstance(ch["stream"], str)
@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True)
def test_end_to_end_level_3_high_snr(mk_actual_stereo_wav: Path, tmp_path: Path) -> None:
def test_end_to_end_level_3_high_snr(wav_in_path: Path, tmp_path: Path) -> None:
""" """
End-to-end test for Level 3 (Quantizer + Huffman): End-to-end Level 3 using a small WAV excerpt produced by mk_actual_stereo_wav.
coder_3 -> decoder_3 should reconstruct a waveform with acceptable SNR.
Notes
-----
- Level 3 includes quantization, so SNR is expected to be lower than Level 1/2.
- We intentionally use a short excerpt (few frames) to keep runtime bounded,
since the reference quantizer implementation is computationally expensive.
""" """
# Use only a few frames to avoid long runtimes. x_ref, fs = aac_read_wav_stereo_48k(mk_actual_stereo_wav)
hop = 1024 assert int(fs) == 48000
win = 2048
n_frames = 4
n_samples = win + (n_frames - 1) * hop
x_ref, fs = aac_read_wav_stereo_48k(wav_in_path)
x_short = x_ref[:n_samples, :]
short_wav = tmp_path / "input_short_l3.wav"
sf.write(str(short_wav), x_short, fs)
out_wav = tmp_path / "decoded_level3.wav" out_wav = tmp_path / "decoded_level3.wav"
aac_seq_3: AACSeq3 = aac_coder_3(short_wav) aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav)
y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav) y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav)
# Align lengths defensively (padding removal may differ by a few samples) n = min(x_ref.shape[0], y_hat.shape[0])
n = min(x_short.shape[0], y_hat.shape[0]) s = snr_db(x_ref[:n, :], y_hat[:n, :])
x2 = x_short[:n, :] print(f"SNR={s}")
y2 = y_hat[:n, :] assert s > 2.0
s = snr_db(x2, y2)
# Conservative threshold: Level 3 is lossy by design.
assert s > 10.0

View File

@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) ->
# Level 1 encoder # Level 1 encoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: def aac_coder_1(
filename_in: Union[str, Path],
verbose: bool = False
) -> AACSeq1:
""" """
Level-1 AAC encoder. Level-1 AAC encoder.
@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
filename_in : Union[str, Path] filename_in : Union[str, Path]
Input WAV filename. Input WAV filename.
Assumption: stereo audio, sampling rate 48 kHz. Assumption: stereo audio, sampling rate 48 kHz.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
aac_seq: AACSeq1 = [] aac_seq: AACSeq1 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
win_type: WinType = WIN_TYPE if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
next_t = np.vstack([next_t, tail]) next_t = np.vstack([next_t, tail])
frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
frame_f = aac_filter_bank(frame_t, frame_type, win_type) frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
aac_seq.append({ aac_seq.append({
"frame_type": frame_type, "frame_type": frame_type,
"win_type": win_type, "win_type": WIN_TYPE,
"chl": {"frame_F": chl_f}, "chl": {"frame_F": chl_f},
"chr": {"frame_F": chr_f}, "chr": {"frame_F": chr_f},
}) })
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
return aac_seq return aac_seq
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: def aac_coder_2(
filename_in: Union[str, Path],
verbose: bool = False
) -> AACSeq2:
""" """
Level-2 AAC encoder (Level 1 + TNS). Level-2 AAC encoder (Level 1 + TNS).
@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
---------- ----------
filename_in : Union[str, Path] filename_in : Union[str, Path]
Input WAV filename (stereo, 48 kHz). Input WAV filename (stereo, 48 kHz).
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
aac_seq: AACSeq2 = [] aac_seq: AACSeq2 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
# Level 1 analysis (packed stereo container) # Level 1 analysis (packed stereo container)
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
# Unpack to per-channel (as you already do in Level 1) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
if frame_type == "ESH":
chl_f = np.empty((128, 8), dtype=np.float64)
chr_f = np.empty((128, 8), dtype=np.float64)
for j in range(8):
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
else:
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
# Level 2: apply TNS per channel # Level 2: apply TNS per channel
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
@ -370,8 +378,12 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
} }
) )
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
return aac_seq return aac_seq
@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
def aac_coder_3( def aac_coder_3(
filename_in: Union[str, Path], filename_in: Union[str, Path],
filename_aac_coded: Union[str, Path] | None = None, filename_aac_coded: Union[str, Path] | None = None,
verbose: bool = False,
) -> AACSeq3: ) -> AACSeq3:
""" """
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
@ -389,6 +402,8 @@ def aac_coder_3(
Input WAV filename (stereo, 48 kHz). Input WAV filename (stereo, 48 kHz).
filename_aac_coded : Union[str, Path] | None filename_aac_coded : Union[str, Path] | None
Optional .mat filename to store aac_seq_3 (assignment convenience). Optional .mat filename to store aac_seq_3 (assignment convenience).
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -416,15 +431,14 @@ def aac_coder_3(
aac_seq: AACSeq3 = [] aac_seq: AACSeq3 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
# Pin win_type to the WinType literal for type checkers.
win_type: WinType = WIN_TYPE
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
prev1_L = np.zeros((2048,), dtype=np.float64) prev1_L = np.zeros((2048,), dtype=np.float64)
prev2_L = np.zeros((2048,), dtype=np.float64) prev2_L = np.zeros((2048,), dtype=np.float64)
prev1_R = np.zeros((2048,), dtype=np.float64) prev1_R = np.zeros((2048,), dtype=np.float64)
prev2_R = np.zeros((2048,), dtype=np.float64) prev2_R = np.zeros((2048,), dtype=np.float64)
if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -440,7 +454,7 @@ def aac_coder_3(
frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
# Analysis filterbank (stereo packed) # Analysis filterbank (stereo packed)
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
# TNS per channel # TNS per channel
@ -474,32 +488,35 @@ def aac_coder_3(
# Codebook 11: # Codebook 11:
# maxAbsCodeVal = 16 is RESERVED for ESCAPE. # maxAbsCodeVal = 16 is RESERVED for ESCAPE.
# We must stay strictly within [-15, +15] to avoid escape decoding. # We must stay strictly within [-15, +15] to avoid escape decoding.
sf_cb = 11 # sf_cb = 11
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
#
# sfc_L_dpcm = np.clip(
# sfc_L_dpcm,
# -sf_max_abs,
# sf_max_abs,
# ).astype(np.int64, copy=False)
#
# sfc_R_dpcm = np.clip(
# sfc_R_dpcm,
# -sf_max_abs,
# sf_max_abs,
# ).astype(np.int64, copy=False)
sfc_L_dpcm = np.clip( sfc_L_stream, cb_sfc_L = aac_encode_huff(
sfc_L_dpcm,
-sf_max_abs,
sf_max_abs,
).astype(np.int64, copy=False)
sfc_R_dpcm = np.clip(
sfc_R_dpcm,
-sf_max_abs,
sf_max_abs,
).astype(np.int64, copy=False)
sfc_L_stream, _ = aac_encode_huff(
sfc_L_dpcm.reshape(-1, order="F"), sfc_L_dpcm.reshape(-1, order="F"),
huff_LUT_list, huff_LUT_list,
force_codebook=sf_cb, # force_codebook=11,
) )
sfc_R_stream, _ = aac_encode_huff( sfc_R_stream, cb_sfc_R = aac_encode_huff(
sfc_R_dpcm.reshape(-1, order="F"), sfc_R_dpcm.reshape(-1, order="F"),
huff_LUT_list, huff_LUT_list,
force_codebook=sf_cb, # force_codebook=11,
) )
if cb_sfc_L != 11 or cb_sfc_R != 11:
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
mdct_L_stream, cb_L = aac_encode_huff( mdct_L_stream, cb_L = aac_encode_huff(
np.asarray(S_L, dtype=np.int64).reshape(-1), np.asarray(S_L, dtype=np.int64).reshape(-1),
huff_LUT_list, huff_LUT_list,
@ -512,7 +529,7 @@ def aac_coder_3(
# Typed dict construction helps static analyzers validate the schema. # Typed dict construction helps static analyzers validate the schema.
frame_out: AACSeq3Frame = { frame_out: AACSeq3Frame = {
"frame_type": frame_type, "frame_type": frame_type,
"win_type": win_type, "win_type": WIN_TYPE,
"chl": { "chl": {
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
"T": np.asarray(T_L, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64),
@ -539,6 +556,11 @@ def aac_coder_3(
prev1_R = frame_R prev1_R = frame_R
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
# Optional: store to .mat for the assignment wrapper # Optional: store to .mat for the assignment wrapper
if filename_aac_coded is not None: if filename_aac_coded is not None:
@ -548,6 +570,5 @@ def aac_coder_3(
{"aac_seq_3": np.array(aac_seq, dtype=object)}, {"aac_seq_3": np.array(aac_seq, dtype=object)},
do_compression=True, do_compression=True,
) )
return aac_seq return aac_seq

View File

@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
# Level 1 decoder # Level 1 decoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_1(
aac_seq_1: AACSeq1,
filename_out: Union[str, Path],
verbose: bool = False
) -> StereoSignal:
""" """
Level-1 AAC decoder (inverse of aac_coder_1()). Level-1 AAC decoder (inverse of aac_coder_1()).
@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_1(). Encoded sequence as produced by aac_coder_1().
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Assumption: 48 kHz, stereo. Output WAV filename. Assumption: 48 kHz, stereo.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_1): for i, fr in enumerate(aac_seq_1):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start:start + win, :] += frame_t_hat y_pad[start:start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y: StereoSignal = aac_remove_padding(y_pad, hop=hop) y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
# Level 1 assumption: 48 kHz output. # Level 1 assumption: 48 kHz output.
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y
@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
# Level 2 decoder # Level 2 decoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_2(
aac_seq_2: AACSeq2,
filename_out: Union[str, Path],
verbose: bool = False
) -> StereoSignal:
""" """
Level-2 AAC decoder (inverse of aac_coder_2). Level-2 AAC decoder (inverse of aac_coder_2).
@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_2(). Encoded sequence as produced by aac_coder_2().
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Output WAV filename.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad = np.zeros((n_pad, 2), dtype=np.float64) y_pad = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_2): for i, fr in enumerate(aac_seq_2):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start : start + win, :] += frame_t_hat y_pad[start : start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y = aac_remove_padding(y_pad, hop=hop) y = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_3(
aac_seq_3: AACSeq3,
filename_out: Union[str, Path],
verbose: bool = False,
) -> StereoSignal:
""" """
Level-3 AAC decoder (inverse of aac_coder_3). Level-3 AAC decoder (inverse of aac_coder_3).
@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_3. Encoded sequence as produced by aac_coder_3.
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Output WAV filename.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad = np.zeros((n_pad, 2), dtype=np.float64) y_pad = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_3): for i, fr in enumerate(aac_seq_3):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start : start + win, :] += frame_t_hat y_pad[start : start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y = aac_remove_padding(y_pad, hop=hop) y = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y

View File

@ -20,6 +20,7 @@
from __future__ import annotations from __future__ import annotations
from pathlib import Path from pathlib import Path
from tabnanny import verbose
from typing import Union from typing import Union
import soundfile as sf import soundfile as sf
@ -52,7 +53,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
AACSeq1 AACSeq1
List of encoded frames (Level 1 schema). List of encoded frames (Level 1 schema).
""" """
return core_aac_coder_1(filename_in) return core_aac_coder_1(filename_in, verbose=True)
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
@ -73,7 +74,7 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
StereoSignal StereoSignal
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
""" """
return core_aac_decoder_1(aac_seq_1, filename_out) return core_aac_decoder_1(aac_seq_1, filename_out, verbose=True)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------

View File

@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) ->
# Level 1 encoder # Level 1 encoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: def aac_coder_1(
filename_in: Union[str, Path],
verbose: bool = False
) -> AACSeq1:
""" """
Level-1 AAC encoder. Level-1 AAC encoder.
@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
filename_in : Union[str, Path] filename_in : Union[str, Path]
Input WAV filename. Input WAV filename.
Assumption: stereo audio, sampling rate 48 kHz. Assumption: stereo audio, sampling rate 48 kHz.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
aac_seq: AACSeq1 = [] aac_seq: AACSeq1 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
win_type: WinType = WIN_TYPE if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
next_t = np.vstack([next_t, tail]) next_t = np.vstack([next_t, tail])
frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
frame_f = aac_filter_bank(frame_t, frame_type, win_type) frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
aac_seq.append({ aac_seq.append({
"frame_type": frame_type, "frame_type": frame_type,
"win_type": win_type, "win_type": WIN_TYPE,
"chl": {"frame_F": chl_f}, "chl": {"frame_F": chl_f},
"chr": {"frame_F": chr_f}, "chr": {"frame_F": chr_f},
}) })
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
return aac_seq return aac_seq
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: def aac_coder_2(
filename_in: Union[str, Path],
verbose: bool = False
) -> AACSeq2:
""" """
Level-2 AAC encoder (Level 1 + TNS). Level-2 AAC encoder (Level 1 + TNS).
@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
---------- ----------
filename_in : Union[str, Path] filename_in : Union[str, Path]
Input WAV filename (stereo, 48 kHz). Input WAV filename (stereo, 48 kHz).
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
aac_seq: AACSeq2 = [] aac_seq: AACSeq2 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
# Level 1 analysis (packed stereo container) # Level 1 analysis (packed stereo container)
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
# Unpack to per-channel (as you already do in Level 1) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
if frame_type == "ESH":
chl_f = np.empty((128, 8), dtype=np.float64)
chr_f = np.empty((128, 8), dtype=np.float64)
for j in range(8):
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
else:
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
# Level 2: apply TNS per channel # Level 2: apply TNS per channel
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
@ -370,8 +378,12 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
} }
) )
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
return aac_seq return aac_seq
@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
def aac_coder_3( def aac_coder_3(
filename_in: Union[str, Path], filename_in: Union[str, Path],
filename_aac_coded: Union[str, Path] | None = None, filename_aac_coded: Union[str, Path] | None = None,
verbose: bool = False,
) -> AACSeq3: ) -> AACSeq3:
""" """
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
@ -389,6 +402,8 @@ def aac_coder_3(
Input WAV filename (stereo, 48 kHz). Input WAV filename (stereo, 48 kHz).
filename_aac_coded : Union[str, Path] | None filename_aac_coded : Union[str, Path] | None
Optional .mat filename to store aac_seq_3 (assignment convenience). Optional .mat filename to store aac_seq_3 (assignment convenience).
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -416,15 +431,14 @@ def aac_coder_3(
aac_seq: AACSeq3 = [] aac_seq: AACSeq3 = []
prev_frame_type: FrameType = "OLS" prev_frame_type: FrameType = "OLS"
# Pin win_type to the WinType literal for type checkers.
win_type: WinType = WIN_TYPE
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
prev1_L = np.zeros((2048,), dtype=np.float64) prev1_L = np.zeros((2048,), dtype=np.float64)
prev2_L = np.zeros((2048,), dtype=np.float64) prev2_L = np.zeros((2048,), dtype=np.float64)
prev1_R = np.zeros((2048,), dtype=np.float64) prev1_R = np.zeros((2048,), dtype=np.float64)
prev2_R = np.zeros((2048,), dtype=np.float64) prev2_R = np.zeros((2048,), dtype=np.float64)
if verbose:
print("Encoding ", end="", flush=True)
for i in range(K): for i in range(K):
start = i * hop start = i * hop
@ -440,7 +454,7 @@ def aac_coder_3(
frame_type = aac_ssc(frame_t, next_t, prev_frame_type) frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
# Analysis filterbank (stereo packed) # Analysis filterbank (stereo packed)
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
# TNS per channel # TNS per channel
@ -474,32 +488,35 @@ def aac_coder_3(
# Codebook 11: # Codebook 11:
# maxAbsCodeVal = 16 is RESERVED for ESCAPE. # maxAbsCodeVal = 16 is RESERVED for ESCAPE.
# We must stay strictly within [-15, +15] to avoid escape decoding. # We must stay strictly within [-15, +15] to avoid escape decoding.
sf_cb = 11 # sf_cb = 11
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
#
# sfc_L_dpcm = np.clip(
# sfc_L_dpcm,
# -sf_max_abs,
# sf_max_abs,
# ).astype(np.int64, copy=False)
#
# sfc_R_dpcm = np.clip(
# sfc_R_dpcm,
# -sf_max_abs,
# sf_max_abs,
# ).astype(np.int64, copy=False)
sfc_L_dpcm = np.clip( sfc_L_stream, cb_sfc_L = aac_encode_huff(
sfc_L_dpcm,
-sf_max_abs,
sf_max_abs,
).astype(np.int64, copy=False)
sfc_R_dpcm = np.clip(
sfc_R_dpcm,
-sf_max_abs,
sf_max_abs,
).astype(np.int64, copy=False)
sfc_L_stream, _ = aac_encode_huff(
sfc_L_dpcm.reshape(-1, order="F"), sfc_L_dpcm.reshape(-1, order="F"),
huff_LUT_list, huff_LUT_list,
force_codebook=sf_cb, # force_codebook=11,
) )
sfc_R_stream, _ = aac_encode_huff( sfc_R_stream, cb_sfc_R = aac_encode_huff(
sfc_R_dpcm.reshape(-1, order="F"), sfc_R_dpcm.reshape(-1, order="F"),
huff_LUT_list, huff_LUT_list,
force_codebook=sf_cb, # force_codebook=11,
) )
if cb_sfc_L != 11 or cb_sfc_R != 11:
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
mdct_L_stream, cb_L = aac_encode_huff( mdct_L_stream, cb_L = aac_encode_huff(
np.asarray(S_L, dtype=np.int64).reshape(-1), np.asarray(S_L, dtype=np.int64).reshape(-1),
huff_LUT_list, huff_LUT_list,
@ -512,7 +529,7 @@ def aac_coder_3(
# Typed dict construction helps static analyzers validate the schema. # Typed dict construction helps static analyzers validate the schema.
frame_out: AACSeq3Frame = { frame_out: AACSeq3Frame = {
"frame_type": frame_type, "frame_type": frame_type,
"win_type": win_type, "win_type": WIN_TYPE,
"chl": { "chl": {
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
"T": np.asarray(T_L, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64),
@ -539,6 +556,11 @@ def aac_coder_3(
prev1_R = frame_R prev1_R = frame_R
prev_frame_type = frame_type prev_frame_type = frame_type
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
if verbose:
print(" done")
# Optional: store to .mat for the assignment wrapper # Optional: store to .mat for the assignment wrapper
if filename_aac_coded is not None: if filename_aac_coded is not None:
@ -548,6 +570,5 @@ def aac_coder_3(
{"aac_seq_3": np.array(aac_seq, dtype=object)}, {"aac_seq_3": np.array(aac_seq, dtype=object)},
do_compression=True, do_compression=True,
) )
return aac_seq return aac_seq

View File

@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
# Level 1 decoder # Level 1 decoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_1(
aac_seq_1: AACSeq1,
filename_out: Union[str, Path],
verbose: bool = False
) -> StereoSignal:
""" """
Level-1 AAC decoder (inverse of aac_coder_1()). Level-1 AAC decoder (inverse of aac_coder_1()).
@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_1(). Encoded sequence as produced by aac_coder_1().
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Assumption: 48 kHz, stereo. Output WAV filename. Assumption: 48 kHz, stereo.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_1): for i, fr in enumerate(aac_seq_1):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start:start + win, :] += frame_t_hat y_pad[start:start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y: StereoSignal = aac_remove_padding(y_pad, hop=hop) y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
# Level 1 assumption: 48 kHz output. # Level 1 assumption: 48 kHz output.
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y
@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
# Level 2 decoder # Level 2 decoder
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_2(
aac_seq_2: AACSeq2,
filename_out: Union[str, Path],
verbose: bool = False
) -> StereoSignal:
""" """
Level-2 AAC decoder (inverse of aac_coder_2). Level-2 AAC decoder (inverse of aac_coder_2).
@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_2(). Encoded sequence as produced by aac_coder_2().
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Output WAV filename.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad = np.zeros((n_pad, 2), dtype=np.float64) y_pad = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_2): for i, fr in enumerate(aac_seq_2):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start : start + win, :] += frame_t_hat y_pad[start : start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y = aac_remove_padding(y_pad, hop=hop) y = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: def aac_decoder_3(
aac_seq_3: AACSeq3,
filename_out: Union[str, Path],
verbose: bool = False,
) -> StereoSignal:
""" """
Level-3 AAC decoder (inverse of aac_coder_3). Level-3 AAC decoder (inverse of aac_coder_3).
@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
Encoded sequence as produced by aac_coder_3. Encoded sequence as produced by aac_coder_3.
filename_out : Union[str, Path] filename_out : Union[str, Path]
Output WAV filename. Output WAV filename.
verbose : bool
Optional argument to print encoding status
Returns Returns
------- -------
@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
n_pad = (K - 1) * hop + win n_pad = (K - 1) * hop + win
y_pad = np.zeros((n_pad, 2), dtype=np.float64) y_pad = np.zeros((n_pad, 2), dtype=np.float64)
if verbose:
print("Decoding ", end="", flush=True)
for i, fr in enumerate(aac_seq_3): for i, fr in enumerate(aac_seq_3):
frame_type: FrameType = fr["frame_type"] frame_type: FrameType = fr["frame_type"]
win_type: WinType = fr["win_type"] win_type: WinType = fr["win_type"]
@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
start = i * hop start = i * hop
y_pad[start : start + win, :] += frame_t_hat y_pad[start : start + win, :] += frame_t_hat
if verbose and (i % (K//20)) == 0:
print(".", end="", flush=True)
y = aac_remove_padding(y_pad, hop=hop) y = aac_remove_padding(y_pad, hop=hop)
if verbose:
print(" done")
sf.write(str(filename_out), y, 48000) sf.write(str(filename_out), y, 48000)
return y return y

View File

@ -51,7 +51,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
AACSeq2 AACSeq2
List of encoded frames (Level 2 schema). List of encoded frames (Level 2 schema).
""" """
return core_aac_coder_2(filename_in) return core_aac_coder_2(filename_in, verbose=True)
def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
@ -72,7 +72,7 @@ def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
StereoSignal StereoSignal
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
""" """
return core_aac_decoder_2(aac_seq_2, filename_out) return core_aac_decoder_2(aac_seq_2, filename_out, verbose=True)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------

View File

@ -0,0 +1,574 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
#   AAC encoder orchestration (Levels 1-3: filterbank, TNS, psycho, quantizer, Huffman).
# Keeps the same functional behavior as the original level_1 implementation:
# - Reads WAV via soundfile
# - Validates stereo and 48 kHz
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
# - SSC decision uses next-frame attack detection
# - Filterbank analysis (MDCT)
# - Stores per-channel spectra in AACSeq1 schema:
# * ESH: (128, 8)
# * else: (1024, 1)
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from scipy.io import savemat
from core.aac_configuration import WIN_TYPE
from core.aac_filterbank import aac_filter_bank
from core.aac_ssc import aac_ssc
from core.aac_tns import aac_tns
from core.aac_psycho import aac_psycho
from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py
from core.aac_huffman import aac_encode_huff
from core.aac_utils import get_table, band_limits
from material.huff_utils import load_LUT
from core.aac_types import *
# -----------------------------------------------------------------------------
# Helpers for thresholds (T(b))
# -----------------------------------------------------------------------------
def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]:
    """
    Return the inclusive (lo, hi) coefficient index span of every
    psychoacoustic band, as listed in TableB219 for this frame type.
    """
    table, _ = get_table(frame_type)
    wlow, whigh, _bval, _qthr_db = band_limits(table)
    slices: list[tuple[int, int]] = []
    for lo, hi in zip(wlow, whigh):
        slices.append((int(lo), int(hi)))
    return slices
def _thresholds_from_smr(
    frame_F_ch: FrameChannelF,
    frame_type: FrameType,
    SMR: FloatArray,
) -> FloatArray:
    """
    Compute masking thresholds T(b) = P(b) / SMR(b), where P(b) is the
    spectral energy of band b.

    Shapes:
        - Long frames: returns (NB, 1)
        - ESH: returns (NB, 8), one column per short sub-window
    """
    bands = _band_slices_from_table(frame_type)
    NB = len(bands)
    spec = np.asarray(frame_F_ch, dtype=np.float64)
    smr = np.asarray(SMR, dtype=np.float64)

    def _column_thresholds(coeffs: np.ndarray, smr_col: np.ndarray) -> np.ndarray:
        # One column of T: band energy divided by SMR; 0 when SMR is ~0.
        out = np.zeros(NB, dtype=np.float64)
        for b, (lo, hi) in enumerate(bands):
            energy = float(np.sum(coeffs[lo : hi + 1] ** 2))
            ratio = float(smr_col[b])
            out[b] = 0.0 if ratio <= 1e-12 else (energy / ratio)
        return out

    if frame_type == "ESH":
        if spec.shape != (128, 8):
            raise ValueError("For ESH, frame_F_ch must have shape (128, 8).")
        if smr.shape != (NB, 8):
            raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")
        columns = [_column_thresholds(spec[:, j], smr[:, j]) for j in range(8)]
        return np.stack(columns, axis=1)

    # Long frame types: accept (1024,) or (1024, 1) spectra.
    if spec.shape == (1024,):
        coeffs_v = spec
    elif spec.shape == (1024, 1):
        coeffs_v = spec[:, 0]
    else:
        raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).")

    if smr.shape == (NB,):
        smr_v = smr
    elif smr.shape == (NB, 1):
        smr_v = smr[:, 0]
    else:
        raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).")

    return _column_thresholds(coeffs_v, smr_v).reshape(NB, 1)
def _normalize_global_gain(G: GlobalGain) -> float | FloatArray:
"""
Normalize GlobalGain to match AACChannelFrameF3["G"] type:
- long: return float
- ESH: return float64 ndarray of shape (1, 8)
"""
if np.isscalar(G):
return float(G)
G_arr = np.asarray(G)
if G_arr.size == 1:
return float(G_arr.reshape(-1)[0])
return np.asarray(G_arr, dtype=np.float64)
# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
    """
    Read a WAV file with soundfile and enforce the Level-1 assumptions.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.

    Returns
    -------
    samples : StereoSignal (np.ndarray)
        Stereo samples as float64, shape (N, 2).
    rate : int
        Sampling rate (Hz); always 48000 on success.

    Raises
    ------
    ValueError
        If the file is not stereo or not sampled at 48 kHz.
    """
    samples, rate = sf.read(str(Path(filename_in)), always_2d=True)
    samples = np.asarray(samples, dtype=np.float64)
    if samples.shape[1] != 2:
        raise ValueError("Input must be stereo (2 channels).")
    if int(rate) != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")
    return samples, int(rate)
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split the stereo FrameF produced by aac_filter_bank() into the two
    per-channel arrays used by the Level-1 AACSeq1 schema.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        - non-ESH: shape (1024, 2)
        - ESH: shape (128, 16) interleaved as [L0 R0 L1 R1 ... L7 R7]

    Returns
    -------
    (chl_f, chr_f) : tuple[FrameChannelF, FrameChannelF]
        Left/right coefficients; (128, 8) for ESH, (1024, 1) otherwise.
    """
    if frame_type == "ESH":
        if frame_f.shape != (128, 16):
            raise ValueError("For ESH, frame_f must have shape (128, 16).")
        # Even columns carry the left channel, odd columns the right;
        # astype() yields fresh float64 copies.
        chl_f = frame_f[:, 0::2].astype(np.float64)
        chr_f = frame_f[:, 1::2].astype(np.float64)
        return chl_f, chr_f

    # Non-ESH: keep the (1024, 1) column shape of the Level-1 schema.
    if frame_f.shape != (1024, 2):
        raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")
    return (
        frame_f[:, 0:1].astype(np.float64, copy=False),
        frame_f[:, 1:2].astype(np.float64, copy=False),
    )
# -----------------------------------------------------------------------------
# Level 1 encoder
# -----------------------------------------------------------------------------
def aac_coder_1(
    filename_in: Union[str, Path],
    verbose: bool = False
) -> AACSeq1:
    """
    Level-1 AAC encoder.

    This function preserves the behavior of the original level_1 implementation:
    - Read stereo 48 kHz WAV
    - Pad hop samples at start and hop samples at end
    - Frame with win=2048, hop=1024
    - Use SSC with next-frame lookahead
    - Apply filterbank analysis (MDCT)
    - Store per-channel coefficients using AACSeq1 schema

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.
        Assumption: stereo audio, sampling rate 48 kHz.
    verbose : bool
        Optional argument to print encoding status.

    Returns
    -------
    AACSeq1
        List of encoded frames (Level 1 schema).

    Raises
    ------
    ValueError
        If the input is too short for a single frame.
    """
    x, _ = aac_read_wav_stereo_48k(filename_in)

    # The assignment assumes 48 kHz.
    hop = 1024
    win = 2048

    # Pad at the beginning to support the first overlap region.
    # Tail padding is kept minimal; next-frame is padded on-the-fly when needed.
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])

    # Number of frames such that current frame fits; next frame will be padded if needed.
    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq1 = []
    prev_frame_type: FrameType = "OLS"

    # Progress tick every ~5% of frames; max(1, ...) prevents a
    # ZeroDivisionError in the modulo below when K < 20.
    tick = max(1, K // 20)
    if verbose:
        print("Encoding ", end="", flush=True)

    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            # This should not happen due to K definition, but keep it explicit.
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Lookahead frame for the SSC decision; zero-pad the tail when the
        # stream ends so next_t is always (2048, 2).
        next_t = x_pad[start + hop:start + hop + win, :]
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
        frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)

        aac_seq.append({
            "frame_type": frame_type,
            "win_type": WIN_TYPE,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        })
        prev_frame_type = frame_type

        if verbose and (i % tick) == 0:
            print(".", end="", flush=True)

    if verbose:
        print(" done")
    return aac_seq
def aac_coder_2(
    filename_in: Union[str, Path],
    verbose: bool = False
) -> AACSeq2:
    """
    Level-2 AAC encoder (Level 1 + TNS).

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    verbose : bool
        Optional argument to print encoding status.

    Returns
    -------
    AACSeq2
        Encoded AAC sequence (Level 2 payload schema).
        For each frame i:
        - "frame_type": FrameType
        - "win_type": WinType
        - "chl"/"chr":
            - "frame_F": FrameChannelF (after TNS)
            - "tns_coeffs": TnsCoeffs

    Raises
    ------
    ValueError
        If the input is too short for a single frame.
    """
    filename_in = Path(filename_in)
    x, _ = aac_read_wav_stereo_48k(filename_in)

    # The assignment assumes 48 kHz.
    hop = 1024
    win = 2048

    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])

    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq2 = []
    prev_frame_type: FrameType = "OLS"

    # Progress tick every ~5% of frames; max(1, ...) prevents a
    # ZeroDivisionError in the modulo below when K < 20.
    tick = max(1, K // 20)
    if verbose:
        print("Encoding ", end="", flush=True)

    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start : start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Lookahead frame for the SSC decision; zero-pad the tail when needed.
        next_t = x_pad[start + hop : start + hop + win, :]
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)

        # Level 1 analysis (packed stereo container)
        frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)

        # Level 2: apply TNS per channel
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)

        aac_seq.append(
            {
                "frame_type": frame_type,
                "win_type": WIN_TYPE,
                "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
                "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
            }
        )
        prev_frame_type = frame_type

        if verbose and (i % tick) == 0:
            print(".", end="", flush=True)

    if verbose:
        print(" done")
    return aac_seq
def aac_coder_3(
    filename_in: Union[str, Path],
    filename_aac_coded: Union[str, Path] | None = None,
    verbose: bool = False,
) -> AACSeq3:
    """
    Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    filename_aac_coded : Union[str, Path] | None
        Optional .mat filename to store aac_seq_3 (assignment convenience).
    verbose : bool
        Optional argument to print encoding status.

    Returns
    -------
    AACSeq3
        Encoded AAC sequence (Level 3 payload schema).

    Raises
    ------
    ValueError
        If the input is too short for a single frame.
    """
    filename_in = Path(filename_in)
    x, _ = aac_read_wav_stereo_48k(filename_in)

    hop = 1024
    win = 2048

    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])

    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    # Load Huffman LUTs once.
    huff_LUT_list = load_LUT()

    aac_seq: AACSeq3 = []
    prev_frame_type: FrameType = "OLS"

    # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
    prev1_L = np.zeros((2048,), dtype=np.float64)
    prev2_L = np.zeros((2048,), dtype=np.float64)
    prev1_R = np.zeros((2048,), dtype=np.float64)
    prev2_R = np.zeros((2048,), dtype=np.float64)

    # Progress tick every ~5% of frames; max(1, ...) prevents a
    # ZeroDivisionError in the modulo below when K < 20.
    tick = max(1, K // 20)
    if verbose:
        print("Encoding ", end="", flush=True)

    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start : start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Lookahead frame for the SSC decision; zero-pad the tail when needed.
        next_t = x_pad[start + hop : start + hop + win, :]
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)

        # Analysis filterbank (stereo packed)
        frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)

        # TNS per channel
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)

        # Psychoacoustic model per channel (time-domain)
        frame_L = np.asarray(frame_t[:, 0], dtype=np.float64)
        frame_R = np.asarray(frame_t[:, 1], dtype=np.float64)
        SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L)
        SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R)

        # Thresholds T(b) (stored, not entropy-coded)
        T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L)
        T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R)

        # Quantizer per channel
        S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L)
        S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R)

        # Normalize G types for AACSeq3 schema (float | float64 ndarray).
        G_Ln = _normalize_global_gain(G_L)
        G_Rn = _normalize_global_gain(G_R)

        # Huffman-code ONLY the DPCM differences for b > 0.
        # sfc[0] corresponds to alpha(0) = G and is stored separately in the frame.
        # NOTE(review): codebook 11 reserves maxAbsCodeVal == 16 for ESCAPE;
        # if DPCM differences ever exceed |15| they would need clipping (or
        # escape handling) before forcing codebook 11 — currently the
        # codebook is auto-selected and a mismatch is reported below.
        sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
        sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]

        sfc_L_stream, cb_sfc_L = aac_encode_huff(
            sfc_L_dpcm.reshape(-1, order="F"),
            huff_LUT_list,
        )
        sfc_R_stream, cb_sfc_R = aac_encode_huff(
            sfc_R_dpcm.reshape(-1, order="F"),
            huff_LUT_list,
        )
        if cb_sfc_L != 11 or cb_sfc_R != 11:
            # Diagnostic: the Level-3 decoder assumes codebook 11 for sfc.
            print(f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")

        mdct_L_stream, cb_L = aac_encode_huff(
            np.asarray(S_L, dtype=np.int64).reshape(-1),
            huff_LUT_list,
        )
        mdct_R_stream, cb_R = aac_encode_huff(
            np.asarray(S_R, dtype=np.int64).reshape(-1),
            huff_LUT_list,
        )

        # Typed dict construction helps static analyzers validate the schema.
        frame_out: AACSeq3Frame = {
            "frame_type": frame_type,
            "win_type": WIN_TYPE,
            "chl": {
                "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
                "T": np.asarray(T_L, dtype=np.float64),
                "G": G_Ln,
                "sfc": sfc_L_stream,
                "stream": mdct_L_stream,
                "codebook": int(cb_L),
            },
            "chr": {
                "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64),
                "T": np.asarray(T_R, dtype=np.float64),
                "G": G_Rn,
                "sfc": sfc_R_stream,
                "stream": mdct_R_stream,
                "codebook": int(cb_R),
            },
        }
        aac_seq.append(frame_out)

        # Update psycho history (shift register)
        prev2_L = prev1_L
        prev1_L = frame_L
        prev2_R = prev1_R
        prev1_R = frame_R
        prev_frame_type = frame_type

        if verbose and (i % tick) == 0:
            print(".", end="", flush=True)

    if verbose:
        print(" done")

    # Optional: store to .mat for the assignment wrapper
    if filename_aac_coded is not None:
        savemat(
            str(Path(filename_aac_coded)),
            {"aac_seq_3": np.array(aac_seq, dtype=object)},
            do_compression=True,
        )
    return aac_seq

View File

@ -0,0 +1,41 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Configuration
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module contains the global configurations
#
# ------------------------------------------------------------
from __future__ import annotations
# Imports
from typing import Final
from core.aac_types import WinType
# Filterbank
# ------------------------------------------------------------
# Window family used by the analysis/synthesis filterbank.
# Options: "SIN", "KBD"
WIN_TYPE: Final[WinType] = "SIN"
# TNS
# ------------------------------------------------------------
# Order of the TNS linear predictor.
PRED_ORDER: Final[int] = 4
# Quantization step for the TNS predictor coefficients.
QUANT_STEP: Final[float] = 0.1
QUANT_MAX: Final[float] = 0.7  # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7]
# -----------------------------------------------------------------------------
# Psycho
# -----------------------------------------------------------------------------
NMT_DB: Final[float] = 6.0  # Noise Masking Tone (dB)
TMN_DB: Final[float] = 18.0  # Tone Masking Noise (dB)

View File

@ -0,0 +1,445 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Inverse AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# - Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
# - Level 2 AAC decoder orchestration (inverse of aac_coder_1()).
#
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Union
import soundfile as sf
from core.aac_filterbank import aac_i_filter_bank
from core.aac_tns import aac_i_tns
from core.aac_quantizer import aac_i_quantizer
from core.aac_huffman import aac_decode_huff
from core.aac_utils import get_table, band_limits
from material.huff_utils import load_LUT
from core.aac_types import *
# -----------------------------------------------------------------------------
# Helper for NB
# -----------------------------------------------------------------------------
def _nbands(frame_type: FrameType) -> int:
    """Return the number of psychoacoustic bands for this frame type."""
    table, _ = get_table(frame_type)
    limits = band_limits(table)
    return int(len(limits[0]))  # wlow carries one entry per band
# -----------------------------------------------------------------------------
# Public helpers
# -----------------------------------------------------------------------------
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the
    stereo FrameF container expected by aac_i_filter_bank().

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    chl_f, chr_f : FrameChannelF
        Per-channel coefficients; (128, 8) for ESH, (1024, 1) otherwise.

    Returns
    -------
    FrameF
        - ESH: (128, 16) interleaved as [L0 R0 L1 R1 ... L7 R7]
        - else: (1024, 2)
    """
    if frame_type == "ESH":
        if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
            raise ValueError("ESH channel frame_F must have shape (128, 8).")
        packed = np.empty((128, 16), dtype=np.float64)
        # Interleave: even columns <- left, odd columns <- right.
        packed[:, 0::2] = chl_f
        packed[:, 1::2] = chr_f
        return packed

    # Non-ESH: expected (1024, 1) per channel in the Level-1 schema.
    if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
        raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")
    stereo = np.concatenate([chl_f, chr_f], axis=1)
    return stereo.astype(np.float64, copy=False)
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Remove the boundary padding that the Level-1 encoder adds:
    hop samples at start and hop samples at end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024). Must be >= 0; hop == 0 returns
        the stream unchanged.

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).

    Raises
    ------
    ValueError
        If hop is negative or y_pad is too short to unpad.
    """
    if hop < 0:
        raise ValueError("hop must be non-negative.")
    if y_pad.shape[0] < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")
    # Use an explicit end index instead of a negative one: y_pad[hop:-hop]
    # evaluates to y_pad[0:0] (empty!) when hop == 0.
    return y_pad[hop : y_pad.shape[0] - hop, :]
# -----------------------------------------------------------------------------
# Level 1 decoder
# -----------------------------------------------------------------------------
def aac_decoder_1(
    aac_seq_1: AACSeq1,
    filename_out: Union[str, Path],
    verbose: bool = False
) -> StereoSignal:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    This function preserves the behavior of the original level_1 implementation:
    - Reconstruct the full padded stream by overlap-adding K synthesized frames
    - Remove hop padding at the beginning and hop padding at the end
    - Write the reconstructed stereo WAV file (48 kHz)
    - Return reconstructed stereo samples as float64

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.
    verbose : bool
        Optional argument to print decoding status.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If aac_seq_1 is empty.
    """
    filename_out = Path(filename_out)
    hop = 1024
    win = 2048

    K = len(aac_seq_1)
    if K <= 0:
        # Guard added for consistency with the Level-2/3 decoders; it also
        # keeps the n_pad computation below meaningful.
        raise ValueError("aac_seq_1 must contain at least one frame.")

    # Output includes the encoder padding region, so we reconstruct the full padded stream.
    # For K frames: last frame starts at (K-1)*hop and spans win,
    # so total length = (K-1)*hop + win.
    n_pad = (K - 1) * hop + win
    y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)

    # Progress tick every ~5% of frames; max(1, ...) prevents a
    # ZeroDivisionError in the modulo below when K < 20.
    tick = max(1, K // 20)
    if verbose:
        print("Decoding ", end="", flush=True)

    for i, fr in enumerate(aac_seq_1):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]
        chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)

        frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f)
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)  # (2048, 2)

        start = i * hop
        y_pad[start:start + win, :] += frame_t_hat

        if verbose and (i % tick) == 0:
            print(".", end="", flush=True)

    y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
    if verbose:
        print(" done")

    # Level 1 assumption: 48 kHz output.
    sf.write(str(filename_out), y, 48000)
    return y
# -----------------------------------------------------------------------------
# Level 2 decoder
# -----------------------------------------------------------------------------
def aac_decoder_2(
    aac_seq_2: AACSeq2,
    filename_out: Union[str, Path],
    verbose: bool = False
) -> StereoSignal:
    """
    Level-2 AAC decoder (inverse of aac_coder_2).

    Behavior matches the Level 1 decoder pipeline, with an additional iTNS stage:
    - Per frame/channel: inverse TNS using stored coefficients
    - Re-pack to stereo frame_F
    - IMDCT + windowing
    - Overlap-add over frames
    - Remove Level-1 padding (hop samples start/end)
    - Write output WAV (48 kHz)

    Parameters
    ----------
    aac_seq_2 : AACSeq2
        Encoded sequence as produced by aac_coder_2().
    filename_out : Union[str, Path]
        Output WAV filename.
    verbose : bool
        Optional argument to print decoding status.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If aac_seq_2 is empty or a channel spectrum has an unexpected shape.
    """
    filename_out = Path(filename_out)
    hop = 1024
    win = 2048

    K = len(aac_seq_2)
    if K <= 0:
        raise ValueError("aac_seq_2 must contain at least one frame.")

    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)

    # Progress tick every ~5% of frames; max(1, ...) prevents a
    # ZeroDivisionError in the modulo below when K < 20.
    tick = max(1, K // 20)
    if verbose:
        print("Decoding ", end="", flush=True)

    for i, fr in enumerate(aac_seq_2):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]

        chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)
        chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
        chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)

        # Inverse TNS per channel
        chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
        chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)

        # Re-pack to the stereo container expected by aac_i_filter_bank
        if frame_type == "ESH":
            if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
                raise ValueError("ESH channel frame_F must have shape (128, 8).")
            frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
            for j in range(8):
                frame_f[:, 2 * j + 0] = chl_f[:, j]
                frame_f[:, 2 * j + 1] = chr_f[:, j]
        else:
            # Accept either (1024, 1) or (1024,) from the internal convention.
            if chl_f.shape == (1024,):
                chl_col = chl_f.reshape(1024, 1)
            elif chl_f.shape == (1024, 1):
                chl_col = chl_f
            else:
                raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")
            if chr_f.shape == (1024,):
                chr_col = chr_f.reshape(1024, 1)
            elif chr_f.shape == (1024, 1):
                chr_col = chr_f
            else:
                raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")
            frame_f = np.empty((1024, 2), dtype=np.float64)
            frame_f[:, 0] = chl_col[:, 0]
            frame_f[:, 1] = chr_col[:, 0]

        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)
        start = i * hop
        y_pad[start : start + win, :] += frame_t_hat

        if verbose and (i % tick) == 0:
            print(".", end="", flush=True)

    y = aac_remove_padding(y_pad, hop=hop)
    if verbose:
        print(" done")
    sf.write(str(filename_out), y, 48000)
    return y
def _decode_channel_3(
    ch,
    frame_type: FrameType,
    NB: int,
    sfc_len: int,
    huff_LUT_list,
) -> FloatArray:
    """
    Decode one channel payload of a Level-3 frame.

    Steps:
    - Huffman-decode the DPCM scalefactor differences (codebook 11) and
      rebuild the full sfc matrix; row 0 holds alpha(0) = G, which is
      stored separately in the frame payload.
    - Huffman-decode the MDCT symbols with the stored codebook
      (codebook 0 means "all-zero section").
    - Inverse quantize and inverse TNS.

    Returns the MDCT coefficients ready for the synthesis filterbank.
    """
    tns = np.asarray(ch["tns_coeffs"], dtype=np.float64)
    G = ch["G"]
    cb = int(ch["codebook"])

    # Only (NB-1) DPCM differences per (sub)window were entropy-coded.
    sfc_dec = aac_decode_huff(ch["sfc"], 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False)
    if frame_type == "ESH":
        sfc = np.zeros((NB, 8), dtype=np.int64)
        Gv = np.asarray(G, dtype=np.float64).reshape(1, 8)
        sfc[0, :] = Gv[0, :].astype(np.int64)
        sfc[1:, :] = sfc_dec.reshape(NB - 1, 8, order="F")
    else:
        sfc = np.zeros((NB, 1), dtype=np.int64)
        sfc[0, 0] = int(float(G))
        sfc[1:, :] = sfc_dec.reshape(NB - 1, 1, order="F")

    # MDCT symbols: codebook 0 means "all-zero section".
    if cb == 0:
        S_dec = np.zeros((1024,), dtype=np.int64)
    else:
        S_tmp = aac_decode_huff(ch["stream"], cb, huff_LUT_list).astype(np.int64, copy=False)
        # Tuple coding may produce extra trailing symbols; the caller knows
        # the true length (1024). Also guard short outputs by zero-padding.
        if S_tmp.size < 1024:
            S_dec = np.zeros((1024,), dtype=np.int64)
            S_dec[: S_tmp.size] = S_tmp
        else:
            S_dec = S_tmp[:1024]

    Xq = aac_i_quantizer(S_dec.reshape(1024, 1), sfc, G, frame_type)
    return aac_i_tns(Xq, frame_type, tns)


def aac_decoder_3(
    aac_seq_3: AACSeq3,
    filename_out: Union[str, Path],
    verbose: bool = False,
) -> StereoSignal:
    """
    Level-3 AAC decoder (inverse of aac_coder_3).

    Steps per frame (per channel, see _decode_channel_3):
    - Huffman decode scalefactors (sfc) using codebook 11
    - Huffman decode MDCT symbols (stream) using stored codebook
    - iQuantizer -> MDCT coefficients after TNS
    - iTNS using stored predictor coefficients
    - IMDCT filterbank -> time domain
    - Overlap-add, remove padding, write WAV

    Parameters
    ----------
    aac_seq_3 : AACSeq3
        Encoded sequence as produced by aac_coder_3.
    filename_out : Union[str, Path]
        Output WAV filename.
    verbose : bool
        Optional argument to print decoding status.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If aac_seq_3 is empty.
    """
    filename_out = Path(filename_out)
    hop = 1024
    win = 2048

    K = len(aac_seq_3)
    if K <= 0:
        raise ValueError("aac_seq_3 must contain at least one frame.")

    # Load Huffman LUTs once.
    huff_LUT_list = load_LUT()

    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)

    # Progress tick every ~5% of frames; max(1, ...) prevents a
    # ZeroDivisionError in the modulo below when K < 20.
    tick = max(1, K // 20)
    if verbose:
        print("Decoding ", end="", flush=True)

    for i, fr in enumerate(aac_seq_3):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]
        NB = _nbands(frame_type)
        # G is stored separately, so the Huffman stream contains only
        # (NB-1) DPCM differences per (sub)window.
        sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1)

        X_L = _decode_channel_3(fr["chl"], frame_type, NB, sfc_len, huff_LUT_list)
        X_R = _decode_channel_3(fr["chr"], frame_type, NB, sfc_len, huff_LUT_list)

        # Re-pack to stereo container and inverse filterbank
        frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R))
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)

        start = i * hop
        y_pad[start : start + win, :] += frame_t_hat

        if verbose and (i % tick) == 0:
            print(".", end="", flush=True)

    y = aac_remove_padding(y_pad, hop=hop)
    if verbose:
        print(" done")
    sf.write(str(filename_out), y, 48000)
    return y

View File

@ -0,0 +1,387 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Filterbank module
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking
#
# ------------------------------------------------------------
from __future__ import annotations
from core.aac_utils import mdct, imdct
from core.aac_types import *
from scipy.signal.windows import kaiser
# Private helpers for Filterbank
# ------------------------------------------------------------
def _sin_window(N: int) -> Window:
"""
Build a sinusoidal (SIN) window of length N.
The AAC sinusoid window is:
w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N
Parameters
----------
N : int
Window length in samples.
Returns
-------
Window
1-D array of shape (N, ) with dtype float64.
"""
n = np.arange(N, dtype=np.float64)
return np.sin((np.pi / N) * (n + 0.5))
def _kbd_window(N: int, alpha: float) -> Window:
"""
Build a Kaiser-Bessel-Derived (KBD) window of length N.
This follows the standard KBD construction used in AAC:
1) Build a Kaiser kernel of length (N/2 + 1).
2) Form the left half by cumulative summation, normalization, and sqrt.
3) Mirror the left half to form the right half (symmetric full-length window).
Notes
-----
- N must be even (AAC uses N=2048 for long and N=256 for short).
- The assignment specifies alpha=6 for long windows and alpha=4 for short windows.
- The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context.
Parameters
----------
N : int
Window length in samples (must be even).
alpha : float
KBD alpha parameter.
Returns
-------
Window
1-D array of shape (N,) with dtype float64.
"""
half = N // 2
# Kaiser kernel length: half + 1 samples (0 .. half)
# beta = pi * alpha per the usual correspondence with the ISO definition
kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
csum = np.cumsum(kernel)
denom = csum[-1]
w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1
w_right = w_left[::-1] # mirror for second half
return np.concatenate([w_left, w_right])
def _long_window(win_type: WinType) -> Window:
    """
    Return the 2048-sample long AAC window of the selected family.

    Parameters
    ----------
    win_type : WinType
        "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (2048,).
    """
    if win_type == "KBD":
        # Assignment-specific alpha for long windows.
        return _kbd_window(2048, alpha=6.0)
    if win_type == "SIN":
        return _sin_window(2048)
    raise ValueError(f"Invalid win_type: {win_type!r}")
def _short_window(win_type: WinType) -> Window:
    """
    Return the 256-sample short AAC window of the selected family.

    Parameters
    ----------
    win_type : WinType
        "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (256,).
    """
    if win_type == "KBD":
        # Assignment-specific alpha for short windows.
        return _kbd_window(256, alpha=4.0)
    if win_type == "SIN":
        return _sin_window(256)
    raise ValueError(f"Invalid win_type: {win_type!r}")
def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window:
    """
    Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS frames.

    A single window family is assumed globally (no mixed KBD/SIN halves),
    so both the long and short window come from the same family.

    Layouts:
        - "OLS": the full long window Wl (2048).
        - "LSS": [Wl left half (1024) | ones (448) | Ws right half (128) | zeros (448)]
        - "LPS": [zeros (448) | Ws left half (128) | ones (448) | Wl right half (1024)]

    Parameters
    ----------
    frame_type : FrameType
        One of "OLS", "LSS", "LPS".
    win_type : WinType
        "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (2048,).
    """
    long_w = _long_window(win_type)    # (2048,)
    short_w = _short_window(win_type)  # (256,)

    if frame_type == "OLS":
        return long_w
    if frame_type == "LSS":
        return np.concatenate([
            long_w[:1024],
            np.ones(448, dtype=np.float64),
            short_w[128:],
            np.zeros(448, dtype=np.float64),
        ])
    if frame_type == "LPS":
        return np.concatenate([
            np.zeros(448, dtype=np.float64),
            short_w[:128],
            np.ones(448, dtype=np.float64),
            long_w[1024:],
        ])
    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF:
    """
    ESH (eight-short) MDCT analysis for one channel.

    Parameters
    ----------
    x_ch : FrameChannelT
        Time-domain channel frame (expected shape: (2048,)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelF
        Array of shape (128, 8); column j holds the 128 MDCT coefficients
        of short window j.
    """
    win = _short_window(win_type)  # (256,)
    # Short windows live in the central region, starting at 448 + 128*j
    # (j = 0..7), i.e. 50% overlap between consecutive 256-sample blocks.
    columns = [
        mdct(x_ch[start:start + 256] * win)
        for start in range(448, 448 + 8 * 128, 128)
    ]
    return np.stack(columns, axis=1).astype(np.float64, copy=False)
def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
"""
Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).
Parameters
----------
frame_F : FrameF
Packed ESH spectrum (expected shape: (128, 16)).
Returns
-------
left : FrameChannelF
Left channel spectrum, shape (128, 8).
right : FrameChannelF
Right channel spectrum, shape (128, 8).
Notes
-----
Inverse mapping of the packing used in aac_filter_bank():
packed[:, 2*j] = left[:, j]
packed[:, 2*j+1] = right[:, j]
"""
if frame_F.shape != (128, 16):
raise ValueError("ESH frame_F must have shape (128, 16).")
left = np.empty((128, 8), dtype=np.float64)
right = np.empty((128, 8), dtype=np.float64)
for j in range(8):
left[:, j] = frame_F[:, 2 * j + 0]
right[:, j] = frame_F[:, 2 * j + 1]
return left, right
def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT:
    """
    ESH (eight-short) IMDCT synthesis for one channel.

    Parameters
    ----------
    X_esh : FrameChannelF
        MDCT coefficients for 8 short windows (expected shape: (128, 8)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelT
        Time-domain contribution of shape (2048,). The 8 short blocks are
        overlap-added internally; the result is ready for frame-level OLA.

    Raises
    ------
    ValueError
        If X_esh does not have shape (128, 8).
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")
    win = _short_window(win_type)  # (256,)
    y = np.zeros(2048, dtype=np.float64)
    # Each short IMDCT yields 256 samples placed at 448 + 128*j (50% overlap).
    for j, start in enumerate(range(448, 448 + 8 * 128, 128)):
        y[start:start + 256] += imdct(X_esh[:, j]) * win
    return y
# -----------------------------------------------------------------------------
# Public Function prototypes
# -----------------------------------------------------------------------------
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis) for a stereo frame.

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        Frame type under encoding ("OLS" | "LSS" | "ESH" | "LPS").
    win_type : WinType
        Window family ("KBD" or "SIN") for the current frame.

    Returns
    -------
    frame_F : FrameF
        MDCT coefficients:
        - "OLS"/"LSS"/"LPS": shape (1024, 2), one column per channel.
        - "ESH": shape (128, 16); subframe j occupies columns (2j, 2j+1)
          for (left, right).

    Raises
    ------
    ValueError
        On a wrong input shape or an unknown frame type.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")
    left: FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
    right: FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)
    if frame_type == "ESH":
        # Interleave the two (128, 8) channel spectra into (128, 16).
        out = np.empty((128, 16), dtype=np.float64)
        out[:, 0::2] = _filter_bank_esh_channel(left, win_type)
        out[:, 1::2] = _filter_bank_esh_channel(right, win_type)
        return out
    if frame_type in ("OLS", "LSS", "LPS"):
        win = _window_sequence(frame_type, win_type)  # (2048,)
        out = np.empty((1024, 2), dtype=np.float64)
        out[:, 0] = mdct(left * win)
        out[:, 1] = mdct(right * win)
        return out
    raise ValueError(f"Invalid frame_type: {frame_type!r}")
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis) for a stereo frame.

    Parameters
    ----------
    frame_F : FrameF
        MDCT coefficients as produced by aac_filter_bank().
    frame_type : FrameType
        Frame type ("OLS" | "LSS" | "ESH" | "LPS").
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Reconstructed time-domain frame, stereo, shape (2048, 2).

    Raises
    ------
    ValueError
        On a shape mismatch or an unknown frame type.
    """
    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")
        Xl, Xr = _unpack_esh(frame_F)
        return np.column_stack((
            _i_filter_bank_esh_channel(Xl, win_type),
            _i_filter_bank_esh_channel(Xr, win_type),
        ))
    if frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")
        win = _window_sequence(frame_type, win_type)
        return np.column_stack((
            imdct(frame_F[:, 0]) * win,
            imdct(frame_F[:, 1]) * win,
        ))
    raise ValueError(f"Invalid frame_type: {frame_type!r}")

View File

@ -0,0 +1,112 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Huffman wrappers (Level 3)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Thin wrappers around the provided Huffman utilities (material/huff_utils.py)
# so that the API matches the assignment text.
#
# Exposed API (assignment):
# huff_sec, huff_codebook = aac_encode_huff(coeff_sec, huff_LUT_list, force_codebook)
# dec_coeffs = aac_decode_huff(huff_sec, huff_codebook, huff_LUT_list)
#
# Notes:
# - Huffman coding operates on tuples. Therefore, decode(encode(x)) may return
# extra trailing symbols due to tuple padding. The AAC decoder knows the
# true section length from side information (band limits) and truncates.
# ------------------------------------------------------------
from __future__ import annotations
from typing import Any
import numpy as np
from material.huff_utils import encode_huff, decode_huff
def aac_encode_huff(
    coeff_sec: np.ndarray,
    huff_LUT_list: list[dict[str, Any]],
    force_codebook: int | None = None,
) -> tuple[str, int]:
    """
    Huffman-encode a section of coefficients (MDCT symbols or scalefactors).

    Parameters
    ----------
    coeff_sec : np.ndarray
        Section to encode; any shape, flattened to 1-D int64 symbols.
    huff_LUT_list : list[dict[str, Any]]
        Huffman LUTs as returned by material.load_LUT(); the index is the
        codebook id (typically 1..11, with 0 reserved).
    force_codebook : int | None
        When given, forces this codebook (the assignment encodes
        scalefactors with codebook 11). Omitted -> auto-selection.

    Returns
    -------
    tuple[str, int]
        (huff_sec, huff_codebook): the '0'/'1' bitstream string and the
        codebook id actually used.
    """
    symbols = np.asarray(coeff_sec, dtype=np.int64).reshape(-1)
    if force_codebook is not None:
        # encode_huff() returns ONLY the bitstream when the codebook is forced.
        codebook = int(force_codebook)
        bits = encode_huff(symbols, huff_LUT_list, force_codebook=codebook)
    else:
        # Auto-selection: the utility also reports the chosen codebook.
        bits, codebook = encode_huff(symbols, huff_LUT_list)
    return str(bits), int(codebook)
def aac_decode_huff(
    huff_sec: str | np.ndarray,
    huff_codebook: int,
    huff_LUT: list[dict[str, Any]],
) -> np.ndarray:
    """
    Huffman-decode a bitstream using the specified codebook.

    Parameters
    ----------
    huff_sec : str | np.ndarray
        Huffman bitstream, normally a '0'/'1' string; arrays are forwarded
        to the provided decoder untouched.
    huff_codebook : int
        Codebook id returned by aac_encode_huff. Codebook 0 marks an
        all-zero section.
    huff_LUT : list[dict[str, Any]]
        Huffman LUT list as returned by material.load_LUT().

    Returns
    -------
    np.ndarray
        Decoded coefficients as a 1-D np.int64 array. Tuple coding may add
        trailing padding symbols; the caller truncates to the known length.

    Raises
    ------
    ValueError
        If the codebook id is outside the LUT list.
    """
    cb = int(huff_codebook)
    if cb == 0:
        # All-zero section: its length is not recoverable from the bitstream,
        # so the caller must expand/truncate from side information.
        return np.zeros((0,), dtype=np.int64)
    if not (0 <= cb < len(huff_LUT)):
        raise ValueError(f"Invalid Huffman codebook index: {cb}")
    decoded = decode_huff(huff_sec, huff_LUT[cb])
    return np.asarray(decoded, dtype=np.int64).reshape(-1)

View File

@ -0,0 +1,441 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Psychoacoustic Model
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Psychoacoustic model for ONE channel, based on the assignment notes (Section 2.4).
#
# Public API:
# SMR = aac_psycho(frame_T, frame_type, frame_T_prev_1, frame_T_prev_2)
#
# Output:
# - For long frames ("OLS", "LSS", "LPS"): SMR has shape (69,)
# - For short frames ("ESH"): SMR has shape (42, 8) (one column per subframe)
#
# Notes:
# - Uses Bark band tables from material/TableB219.mat:
# * B219a for long windows (69 bands, N=2048 FFT, N/2=1024 bins)
# * B219b for short windows (42 bands, N=256 FFT, N/2=128 bins)
# - Applies a Hann window in time domain before FFT magnitude/phase extraction.
# - Implements:
# spreading function -> band spreading -> tonality index -> masking thresholds -> SMR.
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
from core.aac_utils import band_limits, get_table
from core.aac_configuration import NMT_DB, TMN_DB
from core.aac_types import *
# -----------------------------------------------------------------------------
# Spreading function
# -----------------------------------------------------------------------------
def _spreading_matrix(bval: BandValueArray) -> FloatArray:
"""
Compute the spreading function matrix between psychoacoustic bands.
The spreading function describes how energy in one critical band masks
nearby bands. The formula follows the assignment pseudo-code.
Parameters
----------
bval : BandValueArray
Bark value per band, shape (B,).
Returns
-------
FloatArray
Spreading matrix S of shape (B, B), where:
S[bb, b] quantifies the contribution of band bb masking band b.
"""
bval = np.asarray(bval, dtype=np.float64).reshape(-1)
B = int(bval.shape[0])
spread = np.zeros((B, B), dtype=np.float64)
for b in range(B):
for bb in range(B):
# tmpx depends on direction (asymmetric spreading)
if bb >= b:
tmpx = 3.0 * (bval[bb] - bval[b])
else:
tmpx = 1.5 * (bval[bb] - bval[b])
# tmpz uses the "min(..., 0)" nonlinearity exactly as in the notes
tmpz = 8.0 * min((tmpx - 0.5) ** 2 - 2.0 * (tmpx - 0.5), 0.0)
tmpy = 15.811389 + 7.5 * (tmpx + 0.474) - 17.5 * np.sqrt(1.0 + (tmpx + 0.474) ** 2)
# Clamp very small values (below -100 dB) to 0 contribution
if tmpy < -100.0:
spread[bb, b] = 0.0
else:
spread[bb, b] = 10.0 ** ((tmpz + tmpy) / 10.0)
return spread
# -----------------------------------------------------------------------------
# Windowing + FFT feature extraction
# -----------------------------------------------------------------------------
def _hann_window(N: int) -> FloatArray:
"""
Hann window as specified in the notes:
w[n] = 0.5 - 0.5*cos(2*pi*(n + 0.5)/N)
Parameters
----------
N : int
Window length.
Returns
-------
FloatArray
1-D array of shape (N,), dtype float64.
"""
n = np.arange(N, dtype=np.float64)
return 0.5 - 0.5 * np.cos((2.0 * np.pi / N) * (n + 0.5))
def _r_phi_from_time(x: FrameChannelT, N: int) -> tuple[FloatArray, FloatArray]:
"""
Compute FFT magnitude r(w) and phase phi(w) for bins w = 0 .. N/2-1.
Processing:
1) Apply Hann window in time domain.
2) Compute N-point FFT.
3) Keep only the positive-frequency bins [0 .. N/2-1].
Parameters
----------
x : FrameChannelT
Time-domain samples, shape (N,).
N : int
FFT size (2048 or 256).
Returns
-------
r : FloatArray
Magnitude spectrum for bins 0 .. N/2-1, shape (N/2,).
phi : FloatArray
Phase spectrum for bins 0 .. N/2-1, shape (N/2,).
"""
x = np.asarray(x, dtype=np.float64).reshape(-1)
if x.shape[0] != N:
raise ValueError(f"Expected time vector of length {N}, got {x.shape[0]}.")
w = _hann_window(N)
X = np.fft.fft(x * w, n=N)
Xp = X[: N // 2]
r = np.abs(Xp).astype(np.float64, copy=False)
phi = np.angle(Xp).astype(np.float64, copy=False)
return r, phi
def _predictability(
r: FloatArray,
phi: FloatArray,
r_m1: FloatArray,
phi_m1: FloatArray,
r_m2: FloatArray,
phi_m2: FloatArray,
) -> FloatArray:
"""
Compute predictability c(w) per spectral bin.
The notes define:
r_pred(w) = 2*r_{-1}(w) - r_{-2}(w)
phi_pred(w) = 2*phi_{-1}(w) - phi_{-2}(w)
c(w) = |X(w) - X_pred(w)| / (r(w) + |r_pred(w)|)
where X(w) is represented in polar form using r(w), phi(w).
Parameters
----------
r, phi : FloatArray
Current magnitude and phase, shape (N/2,).
r_m1, phi_m1 : FloatArray
Previous magnitude and phase, shape (N/2,).
r_m2, phi_m2 : FloatArray
Pre-previous magnitude and phase, shape (N/2,).
Returns
-------
FloatArray
Predictability c(w), shape (N/2,).
"""
r_pred = 2.0 * r_m1 - r_m2
phi_pred = 2.0 * phi_m1 - phi_m2
num = np.sqrt(
(r * np.cos(phi) - r_pred * np.cos(phi_pred)) ** 2
+ (r * np.sin(phi) - r_pred * np.sin(phi_pred)) ** 2
)
den = r + np.abs(r_pred) + 1e-12 # avoid division-by-zero without altering behavior
return (num / den).astype(np.float64, copy=False)
# -----------------------------------------------------------------------------
# Band-domain aggregation
# -----------------------------------------------------------------------------
def _band_energy_and_weighted_predictability(
r: FloatArray,
c: FloatArray,
wlow: BandIndexArray,
whigh: BandIndexArray,
) -> tuple[FloatArray, FloatArray]:
"""
Aggregate spectral bin quantities into psychoacoustic bands.
Definitions (notes):
e(b) = sum_{w=wlow(b)..whigh(b)} r(w)^2
c_num(b) = sum_{w=wlow(b)..whigh(b)} c(w) * r(w)^2
The band predictability c(b) is later computed after spreading as:
cb(b) = ct(b) / ecb(b)
Parameters
----------
r : FloatArray
Magnitude spectrum, shape (N/2,).
c : FloatArray
Predictability per bin, shape (N/2,).
wlow, whigh : BandIndexArray
Band limits (inclusive indices), shape (B,).
Returns
-------
e_b : FloatArray
Band energies e(b), shape (B,).
c_num_b : FloatArray
Weighted predictability numerators c_num(b), shape (B,).
"""
r2 = (r * r).astype(np.float64, copy=False)
B = int(wlow.shape[0])
e_b = np.zeros(B, dtype=np.float64)
c_num_b = np.zeros(B, dtype=np.float64)
for b in range(B):
a = int(wlow[b])
z = int(whigh[b])
seg_r2 = r2[a : z + 1]
e_b[b] = float(np.sum(seg_r2))
c_num_b[b] = float(np.sum(c[a : z + 1] * seg_r2))
return e_b, c_num_b
def _psycho_one_window(
    time_x: FrameChannelT,
    prev1_x: FrameChannelT,
    prev2_x: FrameChannelT,
    *,
    N: int,
    table: BarkTable,
) -> FloatArray:
    """
    SMR for one FFT analysis window (N=2048 long, N=256 short).

    Pipeline (notes): FFT magnitude/phase -> per-bin predictability ->
    band energies/predictability -> band spreading -> tonality index ->
    masking threshold (noise + threshold in quiet) -> SMR(b) = e(b)/np(b).

    Parameters
    ----------
    time_x : FrameChannelT
        Current time-domain samples, shape (N,).
    prev1_x : FrameChannelT
        Previous time-domain samples, shape (N,).
    prev2_x : FrameChannelT
        Pre-previous time-domain samples, shape (N,).
    N : int
        FFT size.
    table : BarkTable
        Psychoacoustic band table (B219a or B219b).

    Returns
    -------
    FloatArray
        SMR per band, shape (B,).
    """
    wlow, whigh, bval, qthr_db = band_limits(table)
    S = _spreading_matrix(bval)
    # Spectral features for the current window and its two predecessors.
    r0, p0 = _r_phi_from_time(time_x, N)
    r1, p1 = _r_phi_from_time(prev1_x, N)
    r2, p2 = _r_phi_from_time(prev2_x, N)
    c_bin = _predictability(r0, p0, r1, p1, r2, p2)
    e_b, c_num_b = _band_energy_and_weighted_predictability(r0, c_bin, wlow, whigh)
    # Spread band quantities: ecb(b) = sum_bb e(bb)*S(bb,b), same for ct.
    ecb = S.T @ e_b
    ct = S.T @ c_num_b
    # Band predictability after spreading.
    cb = ct / (ecb + 1e-12)
    # Energy normalized by the spreading column sums.
    en = ecb / (np.sum(S, axis=0) + 1e-12)
    # Tonality index, clamped to [0, 1].
    tb = np.clip(-0.299 - 0.43 * np.log(np.maximum(cb, 1e-12)), 0.0, 1.0)
    # Required SNR (dB): interpolate between tone-masking-noise and
    # noise-masking-tone, then convert to a power-domain factor.
    snr_db = tb * TMN_DB + (1.0 - tb) * NMT_DB
    nb = en * (10.0 ** (-snr_db / 10.0))
    # Threshold in quiet, converted from dB to the power domain.
    qthr_power = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0))
    # Final per-band masking threshold and SMR.
    npart = np.maximum(nb, qthr_power)
    return (e_b / (npart + 1e-12)).astype(np.float64, copy=False)
# -----------------------------------------------------------------------------
# ESH window slicing (match filterbank conventions)
# -----------------------------------------------------------------------------
def _esh_subframes_256(x_2048: FrameChannelT) -> list[FrameChannelT]:
"""
Extract the 8 overlapping 256-sample short windows used by AAC ESH.
The project convention (matching the filterbank) is:
start_j = 448 + 128*j, for j = 0..7
subframe_j = x[start_j : start_j + 256]
This selects the central 1152-sample region [448, 1600) and produces
8 windows with 50% overlap.
Parameters
----------
x_2048 : FrameChannelT
Time-domain channel frame, shape (2048,).
Returns
-------
list[FrameChannelT]
List of 8 subframes, each of shape (256,).
"""
x_2048 = np.asarray(x_2048, dtype=np.float64).reshape(-1)
if x_2048.shape[0] != 2048:
raise ValueError("ESH requires 2048-sample input frames.")
subs: list[FrameChannelT] = []
for j in range(8):
start = 448 + 128 * j
subs.append(x_2048[start : start + 256])
return subs
# -----------------------------------------------------------------------------
# Public API
# -----------------------------------------------------------------------------
def aac_psycho(
    frame_T: FrameChannelT,
    frame_type: FrameType,
    frame_T_prev_1: FrameChannelT,
    frame_T_prev_2: FrameChannelT,
) -> FloatArray:
    """
    Psychoacoustic model for ONE channel.

    Parameters
    ----------
    frame_T : FrameChannelT
        Current time-domain channel frame, shape (2048,). For "ESH" the
        8 short windows are derived internally.
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").
    frame_T_prev_1 : FrameChannelT
        Previous time-domain channel frame, shape (2048,).
    frame_T_prev_2 : FrameChannelT
        Pre-previous time-domain channel frame, shape (2048,).

    Returns
    -------
    FloatArray
        Signal-to-Mask Ratio per psychoacoustic band:
        - "ESH": shape (42, 8), one column per subframe.
        - otherwise: shape (69,).

    Raises
    ------
    ValueError
        If any of the three frames is not 2048 samples long.
    """
    cur = np.asarray(frame_T, dtype=np.float64).reshape(-1)
    prev1 = np.asarray(frame_T_prev_1, dtype=np.float64).reshape(-1)
    prev2 = np.asarray(frame_T_prev_2, dtype=np.float64).reshape(-1)
    if any(v.shape[0] != 2048 for v in (cur, prev1, prev2)):
        raise ValueError("aac_psycho expects 2048-sample frames for current/prev1/prev2.")
    table, N = get_table(frame_type)
    # Long frame types: a single SMR vector (69 bands).
    if frame_type != "ESH":
        return _psycho_one_window(cur, prev1, prev2, N=N, table=table)
    # ESH: 8 SMR columns (42 bands each). Predictability history per the
    # "within-frame history" convention of the notes:
    #   j == 0 -> previous frame's subframes (7, 6)
    #   j == 1 -> current subframe 0 and previous frame's subframe 7
    #   j >= 2 -> current subframes (j-1, j-2)
    cur_subs = _esh_subframes_256(cur)
    prev_subs = _esh_subframes_256(prev1)
    n_bands = int(table.shape[0])  # expected 42 for the short table
    smr = np.zeros((n_bands, 8), dtype=np.float64)
    for j in range(8):
        if j == 0:
            hist1, hist2 = prev_subs[7], prev_subs[6]
        elif j == 1:
            hist1, hist2 = cur_subs[0], prev_subs[7]
        else:
            hist1, hist2 = cur_subs[j - 1], cur_subs[j - 2]
        smr[:, j] = _psycho_one_window(cur_subs[j], hist1, hist2, N=256, table=table)
    return smr

View File

@ -0,0 +1,604 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Quantizer / iQuantizer (Level 3)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Implements AAC quantizer and inverse quantizer for one channel.
# Based on assignment section 2.6 (Eq. 12-15).
#
# Notes:
# - Bit reservoir is not implemented (assignment simplification).
# - Scalefactor bands are assumed equal to psychoacoustic bands
# (Table B.2.1.9a / B.2.1.9b from TableB219.mat).
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
from core.aac_utils import get_table, band_limits
from core.aac_types import *
# -----------------------------------------------------------------------------
# Constants (assignment)
# -----------------------------------------------------------------------------
# Rounding offset added before truncation in the quantizer power-law (Eq. 12).
MAGIC_NUMBER: float = 0.4054
# Maximum quantized-symbol magnitude (2^13 - 1); appears in the alpha
# initialization formula (Eq. 14).
MQ: int = 8191
# Small epsilon guarding divisions by (near-)zero SMR / threshold values.
EPS: float = 1e-12
# Maximum allowed |alpha(b) - alpha(b-1)| between adjacent scalefactor bands,
# enforced before DPCM coding of the scalefactors.
MAX_SFC_DIFF: int = 60
# Safeguard: prevents infinite loops in pathological cases
MAX_ALPHA_ITERS: int = 2000
# -----------------------------------------------------------------------------
# Helpers: ESH packing/unpacking (128x8 <-> 1024x1)
# -----------------------------------------------------------------------------
def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray:
"""
Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1).
Packing order:
Columns are concatenated in subframe order (0..7), column-major.
Parameters
----------
x_128x8 : FloatArray
ESH coefficients, shape (128, 8).
Returns
-------
FloatArray
Packed coefficients, shape (1024, 1).
"""
x_128x8 = np.asarray(x_128x8, dtype=np.float64)
if x_128x8.shape != (128, 8):
raise ValueError("ESH pack expects shape (128, 8).")
return x_128x8.reshape(1024, 1, order="F")
def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray:
"""
Unpack a packed ESH vector (1024 elements) back to shape (128, 8).
Parameters
----------
x_1024x1 : FloatArray
Packed ESH vector, shape (1024,) or (1024, 1) after flattening.
Returns
-------
FloatArray
Unpacked ESH coefficients, shape (128, 8).
"""
x_1024x1 = np.asarray(x_1024x1, dtype=np.float64).reshape(-1)
if x_1024x1.shape[0] != 1024:
raise ValueError("ESH unpack expects 1024 elements.")
return x_1024x1.reshape(128, 8, order="F")
# -----------------------------------------------------------------------------
# Core quantizer formulas (Eq. 12, Eq. 13)
# -----------------------------------------------------------------------------
def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols:
    """
    Quantize MDCT coefficients to integer symbols S(k), Eq. (12):

        S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER )

    Parameters
    ----------
    x : FloatArray
        MDCT coefficients of a contiguous set of spectral lines, shape (N,).
    alpha : float
        Scalefactor gain of the corresponding scalefactor band.

    Returns
    -------
    QuantizedSymbols
        Quantized symbols S(k) as int64, shape (N,).
    """
    vals = np.asarray(x, dtype=np.float64)
    # 2^(-alpha/4): larger alpha means a coarser quantization step.
    gain = 2.0 ** (-0.25 * float(alpha))
    compressed = np.power(np.abs(vals) * gain, 0.75, dtype=np.float64)
    # "int" in the assignment corresponds to truncation (floor of positives).
    mags = np.floor(compressed + MAGIC_NUMBER).astype(np.int64)
    return (np.sign(vals).astype(np.int64) * mags).astype(np.int64)
def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray:
"""
Inverse quantizer (dequantization of symbols).
Implements Eq. (13):
Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4)
Parameters
----------
S : QuantizedSymbols
Quantized symbols S(k), int64, shape (N,).
alpha : float
Scalefactor gain for the corresponding scalefactor band.
Returns
-------
FloatArray
Reconstructed MDCT coefficients Xhat(k), float64, shape (N,).
"""
S = np.asarray(S, dtype=np.int64)
scale = 2.0 ** (0.25 * float(alpha))
aS = np.abs(S).astype(np.float64)
y = np.power(aS, 4.0 / 3.0, dtype=np.float64)
return (np.sign(S).astype(np.float64) * y * scale).astype(np.float64)
# -----------------------------------------------------------------------------
# Alpha initialization (Eq. 14)
# -----------------------------------------------------------------------------
def _alpha_initial_from_frame_max(x_all: FloatArray) -> int:
    """
    Initial scalefactor gain alpha_hat for a frame, Eq. (14):

        alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ )

    The same initial value seeds every band before per-band refinement.

    Parameters
    ----------
    x_all : FloatArray
        All MDCT coefficients of a frame (one channel), flattened.

    Returns
    -------
    int
        Initial alpha value; 0 for empty or all-zero input.
    """
    flat = np.asarray(x_all, dtype=np.float64).reshape(-1)
    if flat.size == 0:
        return 0
    peak = float(np.max(np.abs(flat)))
    if peak <= 0.0:
        # Silent frame: no meaningful gain can be derived.
        return 0
    ratio = (peak ** 0.75) / float(MQ)
    if ratio <= 0.0:
        return 0
    return int(np.floor((16.0 / 3.0) * np.log2(ratio)))
# -----------------------------------------------------------------------------
# Band utilities
# -----------------------------------------------------------------------------
def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]:
    """
    Scalefactor band ranges [wlow, whigh] (inclusive) for a frame type.

    Derived from the psychoacoustic tables (TableB219) and mapping directly
    to MDCT indices: 0..1023 for long frames, 0..127 per ESH subframe.

    Parameters
    ----------
    frame_type : FrameType
        Frame type ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    list[tuple[int, int]]
        (lo, hi) inclusive index pairs, one per band.
    """
    table, _unused_nfft = get_table(frame_type)
    wlow, whigh, _bval, _qthr_db = band_limits(table)
    return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)]
def _band_energy(x: FloatArray, lo: int, hi: int) -> float:
"""
Compute energy of a spectral segment x[lo:hi+1].
Parameters
----------
x : FloatArray
MDCT coefficient vector.
lo, hi : int
Inclusive index range.
Returns
-------
float
Sum of squares (energy) within the band.
"""
sec = x[lo : hi + 1]
return float(np.sum(sec * sec))
def _threshold_T_from_SMR(
    X: FloatArray,
    SMR_col: FloatArray,
    bands: list[tuple[int, int]],
) -> FloatArray:
    """
    Psychoacoustic thresholds T(b) per scalefactor band.

    Uses:
        P(b) = sum_{k in band} X(k)^2
        T(b) = P(b) / SMR(b)

    Parameters
    ----------
    X : FloatArray
        MDCT coefficients for a frame (long) or one ESH subframe (short).
    SMR_col : FloatArray
        SMR values for this frame/subframe, shape (NB,).
    bands : list[tuple[int, int]]
        Inclusive band index ranges.

    Returns
    -------
    FloatArray
        Threshold vector T(b), shape (NB,).
    """
    T = np.zeros((len(bands),), dtype=np.float64)
    for b, (lo, hi) in enumerate(bands):
        smr_val = float(SMR_col[b])
        # A (near-)zero SMR gives no masking headroom -> zero threshold.
        T[b] = 0.0 if smr_val <= EPS else _band_energy(X, lo, hi) / smr_val
    return T
# -----------------------------------------------------------------------------
# Alpha selection per band + neighbor-difference constraint
# -----------------------------------------------------------------------------
def _best_alpha_for_band(
    X: FloatArray,
    lo: int,
    hi: int,
    T_b: float,
    alpha0: int,
    alpha_min: int,
    alpha_max: int,
) -> int:
    """
    Determine the band-wise scalefactor alpha(b) by a greedy upward search.

    Starting from alpha0 (clamped to [alpha_min, alpha_max]), alpha is
    increased while the quantization-error power stays within the band
    threshold:

        P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2
        keep the largest alpha with P_e(b) <= T(b)

    Parameters
    ----------
    X : FloatArray
        Full MDCT vector (one frame or one subframe), shape (N,).
    lo, hi : int
        Inclusive MDCT index range defining the band.
    T_b : float
        Psychoacoustic threshold for this band.
    alpha0 : int
        Initial alpha value for the band.
    alpha_min, alpha_max : int
        Safeguard bounds for alpha.

    Returns
    -------
    int
        Selected integer alpha(b).
    """
    if T_b <= 0.0:
        # No usable threshold: keep the frame-level initial alpha unchanged.
        return int(alpha0)
    Xsec = X[lo : hi + 1]
    alpha = max(alpha_min, min(int(alpha0), alpha_max))
    # Error power at the starting alpha; if the threshold is already
    # violated there is nothing to search for.
    Xhat = _dequantize_symbol(_quantize_symbol(Xsec, alpha), alpha)
    if float(np.sum((Xsec - Xhat) ** 2)) > T_b:
        return alpha
    # Greedily raise alpha while the error remains within the threshold.
    # (The previous version tracked the accepted error power in a local
    # that was never read again; that dead store has been removed.)
    # MAX_ALPHA_ITERS is a safeguard against pathological non-termination.
    for _ in range(MAX_ALPHA_ITERS):
        alpha_next = alpha + 1
        if alpha_next > alpha_max:
            break
        Xhat = _dequantize_symbol(_quantize_symbol(Xsec, alpha_next), alpha_next)
        if float(np.sum((Xsec - Xhat) ** 2)) > T_b:
            break
        alpha = alpha_next
    return alpha
def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray:
    """
    Enforce |alpha[b] - alpha[b-1]| <= max_diff by clamping neighbors.

    A forward sweep clamps each band against its left neighbor, then a
    backward sweep clamps against the right neighbor to limit drift.

    Parameters
    ----------
    alpha : np.ndarray
        Alpha vector, shape (NB,).
    max_diff : int
        Maximum allowed absolute difference between adjacent bands.

    Returns
    -------
    np.ndarray
        Clamped alpha vector, int64, shape (NB,).
    """
    out = np.asarray(alpha, dtype=np.int64).copy()
    nb = out.shape[0]
    # Forward sweep: clamp into [left - max_diff, left + max_diff].
    for b in range(1, nb):
        out[b] = min(max(out[b], out[b - 1] - max_diff), out[b - 1] + max_diff)
    # Backward sweep: clamp into [right - max_diff, right + max_diff].
    for b in range(nb - 2, -1, -1):
        out[b] = min(max(out[b], out[b + 1] - max_diff), out[b + 1] + max_diff)
    return out
# -----------------------------------------------------------------------------
# Public API
# -----------------------------------------------------------------------------
def aac_quantizer(
    frame_F: FrameChannelF,
    frame_type: FrameType,
    SMR: FloatArray,
) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]:
    """
    AAC quantizer for one channel (Level 3).
    Quantizes MDCT coefficients using band-wise scalefactors derived from SMR.
    Parameters
    ----------
    frame_F : FrameChannelF
        MDCT coefficients after TNS, one channel.
        Shapes:
        - Long frames: (1024,) or (1024, 1)
        - ESH: (128, 8)
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").
    SMR : FloatArray
        Signal-to-Mask Ratio per band.
        Shapes:
        - Long: (NB,) or (NB, 1)
        - ESH: (NB, 8)
    Returns
    -------
    S : QuantizedSymbols
        Quantized symbols S(k), shape (1024, 1) for all frame types.
    sfc : ScaleFactors
        DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1).
        Shapes:
        - Long: (NB, 1)
        - ESH: (NB, 8)
    G : GlobalGain
        Global gain G = alpha(0).
        - Long: scalar float
        - ESH: array shape (1, 8)
    """
    bands = _band_slices(frame_type)
    NB = len(bands)
    X = np.asarray(frame_F, dtype=np.float64)
    SMR = np.asarray(SMR, dtype=np.float64)
    if frame_type == "ESH":
        if X.shape != (128, 8):
            raise ValueError("For ESH, frame_F must have shape (128, 8).")
        if SMR.shape != (NB, 8):
            raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")
        S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
        sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)
        G_arr = np.zeros((1, 8), dtype=np.int64)
        # Each of the 8 short subframes is quantized independently.
        for j in range(8):
            Xj = X[:, j].reshape(128)
            SMRj = SMR[:, j].reshape(NB)
            # Per-band psychoacoustic thresholds T(b) = P(b) / SMR(b).
            T = _threshold_T_from_SMR(Xj, SMRj, bands)
            # Same frame-level initial alpha (Eq. 14) seeds every band.
            alpha0 = _alpha_initial_from_frame_max(Xj)
            alpha = np.full((NB,), alpha0, dtype=np.int64)
            for b, (lo, hi) in enumerate(bands):
                alpha[b] = _best_alpha_for_band(
                    X=Xj, lo=lo, hi=hi, T_b=float(T[b]),
                    alpha0=int(alpha[b]),
                    alpha_min=-4096,
                    alpha_max=4096,
                )
            # Keep neighboring scalefactors within the DPCM-codable range.
            alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)
            G_arr[0, j] = int(alpha[0])
            # DPCM coding: sfc(0) = alpha(0), sfc(b) = alpha(b) - alpha(b-1).
            sfc[0, j] = int(alpha[0])
            for b in range(1, NB):
                sfc[b, j] = int(alpha[b] - alpha[b - 1])
            # Quantize each band with its selected alpha (Eq. 12).
            Sj = np.zeros((128,), dtype=np.int64)
            for b, (lo, hi) in enumerate(bands):
                Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b]))
            # Place this subframe in the packed output (column-major subframe layout)
            # NOTE: reshape of the contiguous column view writes through to S_out.
            S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj
        return S_out, sfc, G_arr.astype(np.float64)
    # Long frames
    # Accept both (1024,) and (1024, 1) coefficient layouts.
    if X.shape == (1024,):
        Xv = X
    elif X.shape == (1024, 1):
        Xv = X[:, 0]
    else:
        raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).")
    if SMR.shape == (NB,):
        SMRv = SMR
    elif SMR.shape == (NB, 1):
        SMRv = SMR[:, 0]
    else:
        raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")
    # Thresholds, initial alpha, and per-band refinement (as in the ESH path).
    T = _threshold_T_from_SMR(Xv, SMRv, bands)
    alpha0 = _alpha_initial_from_frame_max(Xv)
    alpha = np.full((NB,), alpha0, dtype=np.int64)
    for b, (lo, hi) in enumerate(bands):
        alpha[b] = _best_alpha_for_band(
            X=Xv, lo=lo, hi=hi, T_b=float(T[b]),
            alpha0=int(alpha[b]),
            alpha_min=-4096,
            alpha_max=4096,
        )
    alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)
    # DPCM coding of the scalefactors; alpha(0) doubles as the global gain.
    sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64)
    sfc[0, 0] = int(alpha[0])
    for b in range(1, NB):
        sfc[b, 0] = int(alpha[b] - alpha[b - 1])
    G: float = float(alpha[0])
    S_vec = np.zeros((1024,), dtype=np.int64)
    for b, (lo, hi) in enumerate(bands):
        S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b]))
    return S_vec.reshape(1024, 1), sfc, G
def aac_i_quantizer(
    S: QuantizedSymbols,
    sfc: ScaleFactors,
    G: GlobalGain,
    frame_type: FrameType,
) -> FrameChannelF:
    """
    Inverse quantizer (iQuantizer) for one channel.

    Undoes the DPCM coding of the scalefactors and dequantizes the MDCT
    symbols band by band.

    Parameters
    ----------
    S : QuantizedSymbols
        Quantized symbols, shape (1024, 1) (or any array with 1024 elements).
    sfc : ScaleFactors
        DPCM-coded scalefactors.
        Shapes:
          - Long: (NB, 1)
          - ESH: (NB, 8)
    G : GlobalGain
        Global gain (not strictly required if sfc includes sfc(0)=alpha(0)).
        Present for API compatibility with the assignment.
    frame_type : FrameType
        AAC frame type.

    Returns
    -------
    FrameChannelF
        Reconstructed MDCT coefficients:
          - ESH: (128, 8)
          - Long: (1024, 1)
    """
    bands = _band_slices(frame_type)
    NB = len(bands)
    symbols = np.asarray(S, dtype=np.int64).reshape(-1)
    if symbols.shape[0] != 1024:
        raise ValueError("S must contain 1024 symbols.")
    sfc_arr = np.asarray(sfc, dtype=np.int64)

    if frame_type == "ESH":
        if sfc_arr.shape != (NB, 8):
            raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).")
        sub = _esh_unpack_from_1024(symbols)
        rec = np.zeros((128, 8), dtype=np.float64)
        for j in range(8):
            # Undo DPCM: alpha(b) = sfc(0) + sfc(1) + ... + sfc(b).
            alpha = np.cumsum(sfc_arr[:, j])
            for b, (lo, hi) in enumerate(bands):
                rec[lo : hi + 1, j] = _dequantize_symbol(
                    sub[lo : hi + 1, j].astype(np.int64), float(alpha[b])
                )
        return rec

    if sfc_arr.shape != (NB, 1):
        raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).")
    # Undo DPCM for the single long-frame scalefactor track.
    alpha = np.cumsum(sfc_arr[:, 0])
    rec = np.zeros((1024,), dtype=np.float64)
    for b, (lo, hi) in enumerate(bands):
        rec[lo : hi + 1] = _dequantize_symbol(symbols[lo : hi + 1], float(alpha[b]))
    return rec.reshape(1024, 1)

View File

@ -0,0 +1,217 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Sequence Segmentation Control module
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Sequence Segmentation Control module (SSC).
# Selects and returns the frame type based on input parameters.
# ------------------------------------------------------------
from __future__ import annotations
from typing import Dict, Tuple
from core.aac_types import FrameType, FrameT, FrameChannelT
import numpy as np
# -----------------------------------------------------------------------------
# Private helpers for SSC
# -----------------------------------------------------------------------------
# See Table 1 in mm-2025-hw-v0.1.pdf
# Merge of per-channel decisions (left, right) -> common stereo frame type.
# The mapping is symmetric in effect: any pairing that involves ESH collapses
# to ESH, and the conflicting LSS/LPS pairings also resolve to ESH.
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
"""
Detect whether the *next* frame (single channel) implies an attack, i.e. ESH
according to the assignment's criterion.
Parameters
----------
next_frame_channel : FrameChannelT
One channel of next_frame_T (expected shape: (2048,)).
Returns
-------
bool
True if an attack is detected (=> next frame predicted ESH), else False.
Notes
-----
The criterion is implemented as described in the spec:
1) Apply the high-pass filter:
H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
implemented in the time domain as:
y[n] = x[n] - x[n-1] + 0.5*y[n-1]
2) Split y into 16 segments of length 128 and compute segment energies s[l].
3) Compute the ratio:
ds[l] = s[l] / s[l-1]
4) An attack exists if there exists l in {1..7} such that:
s[l] > 1e-3 and ds[l] > 10
"""
# Local alias; expected to be a 1-D array of length 2048.
x = next_frame_channel
# High-pass filter reference implementation (scalar recurrence).
y = np.zeros_like(x)
prev_x = 0.0
prev_y = 0.0
for n in range(x.shape[0]):
xn = float(x[n])
yn = (xn - prev_x) + 0.5 * prev_y
y[n] = yn
prev_x = xn
prev_y = yn
# Segment energies over 16 blocks of 128 samples.
s = np.empty(16, dtype=np.float64)
for l in range(16):
a = l * 128
b = (l + 1) * 128
seg = y[a:b]
s[l] = float(np.sum(seg * seg))
# ds[l] for l>=1. For l=0 not defined, keep 0.
ds = np.zeros(16, dtype=np.float64)
eps = 1e-12 # Avoid division by zero without materially changing the logic.
for l in range(1, 16):
ds[l] = s[l] / max(s[l - 1], eps)
# Spec: check l in {1..7}.
for l in range(1, 8):
if (s[l] > 1e-3) and (ds[l] > 10.0):
return True
return False
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
"""
Decide the current frame type for a single channel based on the previous
frame type and whether the next frame is predicted to be ESH.
Rules (spec):
- If prev is "LSS" => current is "ESH"
- If prev is "LPS" => current is "OLS"
- If prev is "OLS" => current is "LSS" if attack else "OLS"
- If prev is "ESH" => current is "ESH" if attack else "LPS"
Parameters
----------
prev_frame_type : FrameType
Previous frame type (one of "OLS", "LSS", "ESH", "LPS").
attack : bool
True if the next frame is predicted ESH for this channel.
Returns
-------
FrameType
The per-channel decision for the current frame.
"""
if prev_frame_type == "LSS":
return "ESH"
if prev_frame_type == "LPS":
return "OLS"
if prev_frame_type == "OLS":
return "LSS" if attack else "OLS"
if prev_frame_type == "ESH":
return "ESH" if attack else "LPS"
raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Combine the two per-channel frame-type decisions into the single frame
    type used for both channels, via STEREO_MERGE_TABLE.

    Parameters
    ----------
    ft_l : FrameType
        Frame type decision for the left channel.
    ft_r : FrameType
        Frame type decision for the right channel.

    Returns
    -------
    FrameType
        The merged common frame type.

    Raises
    ------
    ValueError
        If the (left, right) pair is not a valid frame-type combination.
    """
    pair = (ft_l, ft_r)
    try:
        merged = STEREO_MERGE_TABLE[pair]
    except KeyError as exc:
        raise ValueError(f"Invalid stereo merge pair: {pair}") from exc
    return merged
# -----------------------------------------------------------------------------
# Public Function prototypes
# -----------------------------------------------------------------------------
def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).

    Pick the frame type for the current frame (i) from:
      - the current time-domain stereo frame (validated for shape only),
      - the next stereo frame (i+1), scanned per channel for attacks,
      - the previous frame's type.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i (expected shape: (2048, 2)).
    next_frame_T : FrameT
        Next time-domain frame (i+1), used to decide transitions to/from ESH
        (expected shape: (2048, 2)).
    prev_frame_type : FrameType
        Frame type chosen for the previous frame (i-1).

    Returns
    -------
    FrameType
        One of: "OLS", "LSS", "ESH", "LPS".

    Raises
    ------
    ValueError
        If either frame does not have shape (2048, 2).
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != (2048, 2):
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Per-channel attack detection on the upcoming frame, then per-channel
    # decision from the shared previous type, then table-based stereo merge.
    per_channel = [
        _decide_frame_type(prev_frame_type, _detect_attack(next_frame_T[:, ch]))
        for ch in (0, 1)
    ]
    return _stereo_merge(per_channel[0], per_channel[1])

View File

@ -0,0 +1,514 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Temporal Noise Shaping (TNS)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Temporal Noise Shaping (TNS) module (Level 2).
#
# Public API:
# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type)
# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs)
#
# Notes (per assignment):
# - TNS is applied per channel (not stereo).
# - For ESH, TNS is applied independently to each of the 8 short subframes.
# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short)
# provided in TableB219.mat.
# - Predictor order is fixed to p = 4.
# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1.
# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Tuple
from core.aac_utils import load_b219_tables
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
from core.aac_types import *
# -----------------------------------------------------------------------------
# Private helpers
# -----------------------------------------------------------------------------
def _band_ranges_for_kcount(k_count: int) -> BandRanges:
    """
    Return Bark band index ranges [start, end] (inclusive) for a given MDCT
    line count.

    Parameters
    ----------
    k_count : int
        Number of MDCT lines: 1024 (long frame, table B219a) or
        128 (short subframe, table B219b).

    Returns
    -------
    BandRanges (list[tuple[int, int]])
        Each tuple is (start_k, end_k) inclusive.

    Raises
    ------
    ValueError
        For an unsupported k_count, or if a table entry falls outside
        [0, k_count) or is inverted.
    """
    tables = load_b219_tables()
    key = {1024: "B219a", 128: "B219b"}.get(k_count)
    if key is None:
        raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).")
    tbl = tables[key]

    ranges: list[tuple[int, int]] = []
    for start, end in zip(tbl[:, 1].astype(int), tbl[:, 2].astype(int)):
        lo, hi = int(start), int(end)
        # Sanity-check each entry against the spectrum size.
        if lo < 0 or hi < lo or hi >= k_count:
            raise ValueError("Invalid band table ranges for given k_count.")
        ranges.append((lo, hi))
    return ranges
# -----------------------------------------------------------------------------
# Core DSP helpers
# -----------------------------------------------------------------------------
def _smooth_sw_inplace(sw: MdctCoeffs) -> None:
"""
Smooth Sw(k) to reduce discontinuities between adjacent Bark bands.
The assignment applies two passes:
- Backward: Sw(k) = (Sw(k) + Sw(k+1))/2
- Forward: Sw(k) = (Sw(k) + Sw(k-1))/2
Parameters
----------
sw : MdctCoeffs
1-D array of length K (float64). Modified in-place.
"""
k_count = int(sw.shape[0])
for k in range(k_count - 2, -1, -1):
sw[k] = 0.5 * (sw[k] + sw[k + 1])
for k in range(1, k_count):
sw[k] = 0.5 * (sw[k] + sw[k - 1])
def _compute_sw(x: MdctCoeffs) -> MdctCoeffs:
    """
    Build the smoothed spectral envelope Sw(k) from Bark-band energies.

    Within each band j the envelope is the constant sqrt(P(j)), where
    P(j) is the band energy; boundary smoothing is then applied.

    Parameters
    ----------
    x : MdctCoeffs
        1-D MDCT line array, length K.

    Returns
    -------
    MdctCoeffs
        Sw(k), 1-D array of length K, float64.
    """
    x = np.asarray(x, dtype=np.float64).reshape(-1)
    k_count = int(x.shape[0])
    sw = np.zeros(k_count, dtype=np.float64)
    for lo, hi in _band_ranges_for_kcount(k_count):
        band = x[lo : hi + 1]
        sw[lo : hi + 1] = float(np.sqrt(float(np.sum(band * band))))
    _smooth_sw_inplace(sw)
    return sw
def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs:
"""
Autocorrelation r(m) for m=0..p.
Parameters
----------
x : MdctCoeffs
1-D signal.
p : int
Maximum lag.
Returns
-------
MdctCoeffs
r, shape (p+1,), float64.
"""
x = np.asarray(x, dtype=np.float64).reshape(-1)
n = int(x.shape[0])
r = np.zeros(p + 1, dtype=np.float64)
for m in range(p + 1):
r[m] = float(np.dot(x[m:], x[: n - m]))
return r
def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs:
    """
    Order-p LPC coefficients via the Yule-Walker normal equations.

    Parameters
    ----------
    xw : MdctCoeffs
        1-D normalized sequence Xw(k).
    p : int
        Predictor order.

    Returns
    -------
    MdctCoeffs
        LPC coefficients a[0..p-1], shape (p,), float64.
    """
    r = _autocorr(xw, p)
    # Symmetric Toeplitz system: R[i, j] = r(|i - j|), built by fancy indexing.
    idx = np.arange(p)
    R = r[np.abs(idx[:, None] - idx[None, :])]
    # Tiny diagonal loading keeps the solve well-posed for degenerate input.
    R_reg = R + 1e-12 * np.eye(p, dtype=np.float64)
    return np.linalg.solve(R_reg, r[1 : p + 1].reshape(p))
def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs:
    """
    Uniform symmetric quantization of LPC coefficients.

    Values are snapped to the QUANT_STEP grid by rounding, then clamped to
    the symmetric range [-QUANT_MAX, QUANT_MAX].

    Parameters
    ----------
    a : MdctCoeffs
        LPC coefficient array, shape (p,).

    Returns
    -------
    MdctCoeffs
        Quantized coefficients, shape (p,), float64.
    """
    coeffs = np.asarray(a, dtype=np.float64).reshape(-1)
    on_grid = QUANT_STEP * np.round(coeffs / QUANT_STEP)
    clamped = np.clip(on_grid, -QUANT_MAX, QUANT_MAX)
    return clamped.astype(np.float64, copy=False)
def _is_inverse_stable(a_q: MdctCoeffs) -> bool:
"""
Check stability of the inverse TNS filter H_TNS^{-1}.
Forward filter:
H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
Inverse filter poles are roots of:
A(z) = 1 - a1 z^-1 - ... - ap z^-p
Multiply by z^p:
z^p - a1 z^{p-1} - ... - ap = 0
Stability condition:
all roots satisfy |z| < 1.
Parameters
----------
a_q : MdctCoeffs
Quantized predictor coefficients, shape (p,).
Returns
-------
bool
True if stable, else False.
"""
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
# Polynomial in z: z^p - a1 z^{p-1} - ... - ap
poly = np.empty(p + 1, dtype=np.float64)
poly[0] = 1.0
poly[1:] = -a_q
roots = np.roots(poly)
# Strictly inside unit circle for stability. Add tiny margin for numeric safety.
margin = 1e-12
return bool(np.all(np.abs(roots) < (1.0 - margin)))
def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs:
    """
    Return a stable version of the quantized predictor coefficients.

    Policy:
      - already stable: returned unchanged;
      - otherwise: shrink by successively smaller factors, re-quantizing
        after each shrink so the result stays on the 0.1 grid;
      - if nothing works: all zeros, which disables TNS for this vector.

    Parameters
    ----------
    a_q : MdctCoeffs
        Quantized predictor coefficients, shape (p,).

    Returns
    -------
    MdctCoeffs
        Stable quantized coefficients, shape (p,).
    """
    coeffs = np.asarray(a_q, dtype=np.float64).reshape(-1)
    if _is_inverse_stable(coeffs):
        return coeffs
    for factor in (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1):
        candidate = _quantize_coeffs(factor * coeffs)
        if _is_inverse_stable(candidate):
            return candidate
    # Last resort: disable TNS for this vector.
    return np.zeros_like(coeffs, dtype=np.float64)
def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
"""
Apply forward TNS FIR filter:
y[k] = x[k] - sum_{l=1..p} a_l * x[k-l]
Parameters
----------
x : MdctCoeffs
1-D MDCT lines, length K.
a_q : MdctCoeffs
Quantized LPC coefficients, shape (p,).
Returns
-------
MdctCoeffs
Filtered MDCT lines y, length K.
"""
x = np.asarray(x, dtype=np.float64).reshape(-1)
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
k_count = int(x.shape[0])
y = np.zeros(k_count, dtype=np.float64)
for k in range(k_count):
acc = x[k]
for l in range(1, p + 1):
if k - l >= 0:
acc -= a_q[l - 1] * x[k - l]
y[k] = acc
return y
def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
"""
Apply inverse TNS IIR filter:
x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l]
Parameters
----------
y : MdctCoeffs
1-D MDCT lines after TNS, length K.
a_q : MdctCoeffs
Quantized LPC coefficients, shape (p,).
Returns
-------
MdctCoeffs
Reconstructed MDCT lines x_hat, length K.
"""
y = np.asarray(y, dtype=np.float64).reshape(-1)
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
p = int(a_q.shape[0])
k_count = int(y.shape[0])
x_hat = np.zeros(k_count, dtype=np.float64)
for k in range(k_count):
acc = y[k]
for l in range(1, p + 1):
if k - l >= 0:
acc += a_q[l - 1] * x_hat[k - l]
x_hat[k] = acc
return x_hat
def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]:
    """
    Run the full TNS chain on a single MDCT vector (one long frame or one
    short subframe).

    Steps:
      1) compute the smoothed Bark-band envelope Sw(k);
      2) normalize Xw(k) = X(k) / Sw(k), leaving zeros where Sw vanishes;
      3) fit an order-PRED_ORDER predictor on Xw;
      4) quantize the coefficients (step QUANT_STEP) and force stability
         of the inverse filter;
      5) FIR-filter the ORIGINAL spectrum with the quantized coefficients.

    Parameters
    ----------
    x : MdctCoeffs
        1-D MDCT vector.

    Returns
    -------
    y : MdctCoeffs
        TNS-processed MDCT vector (same length).
    a_q : MdctCoeffs
        Quantized LPC coefficients, shape (PRED_ORDER,).
    """
    x = np.asarray(x, dtype=np.float64).reshape(-1)
    envelope = _compute_sw(x)
    # Safe normalization: untouched (zero) entries wherever the envelope is ~0.
    normalized = np.zeros_like(x, dtype=np.float64)
    np.divide(x, envelope, out=normalized, where=envelope > 1e-12)
    quantized = _stabilize_quantized_coeffs(
        _quantize_coeffs(_lpc_coeffs(normalized, PRED_ORDER))
    )
    return _apply_tns_fir(x, quantized), quantized
# -----------------------------------------------------------------------------
# Public Functions
# -----------------------------------------------------------------------------
def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]:
    """
    Temporal Noise Shaping (TNS) for ONE channel.

    For ESH frames each of the 8 short subframes is processed independently;
    long frames are processed as a single 1024-line vector. The output keeps
    the shape convention of the input ((1024,) in, (1024,) out).

    Parameters
    ----------
    frame_F_in : FrameChannelF
        Per-channel MDCT coefficients.
        Expected shapes: (128, 8) for ESH; (1024, 1) or (1024,) otherwise.
    frame_type : FrameType
        Frame type code ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    frame_F_out : FrameChannelF
        MDCT coefficients after TNS, same shape convention as the input.
    tns_coeffs : TnsCoeffs
        Quantized predictor coefficients: (PRED_ORDER, 8) for ESH,
        (PRED_ORDER, 1) otherwise.

    Raises
    ------
    ValueError
        On a shape that does not match the frame type.
    """
    coeffs_in = np.asarray(frame_F_in, dtype=np.float64)

    if frame_type == "ESH":
        if coeffs_in.shape != (128, 8):
            raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
        out = np.empty_like(coeffs_in, dtype=np.float64)
        tns = np.empty((PRED_ORDER, 8), dtype=np.float64)
        for sub in range(8):
            out[:, sub], tns[:, sub] = _tns_one_vector(coeffs_in[:, sub])
        return out, tns

    if coeffs_in.shape == (1024,):
        y_vec, a_q = _tns_one_vector(coeffs_in)
        return y_vec, a_q.reshape(PRED_ORDER, 1)
    if coeffs_in.shape == (1024, 1):
        y_vec, a_q = _tns_one_vector(coeffs_in[:, 0])
        return y_vec.reshape(1024, 1), a_q.reshape(PRED_ORDER, 1)
    raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF:
    """
    Inverse Temporal Noise Shaping (iTNS) for ONE channel.

    Runs the all-pole inverse filter with the quantized coefficients that
    the encoder produced; ESH subframes are inverted independently.

    Parameters
    ----------
    frame_F_in : FrameChannelF
        Per-channel MDCT coefficients after TNS.
        Expected shapes: (128, 8) for ESH; (1024, 1) or (1024,) otherwise.
    frame_type : FrameType
        Frame type code ("OLS", "LSS", "ESH", "LPS").
    tns_coeffs : TnsCoeffs
        Quantized predictor coefficients: (PRED_ORDER, 8) for ESH,
        (PRED_ORDER, 1) otherwise.

    Returns
    -------
    FrameChannelF
        MDCT coefficients after inverse TNS, same shape convention as the
        input frame_F_in.

    Raises
    ------
    ValueError
        On a shape that does not match the frame type.
    """
    coeffs_in = np.asarray(frame_F_in, dtype=np.float64)
    a = np.asarray(tns_coeffs, dtype=np.float64)

    if frame_type == "ESH":
        if coeffs_in.shape != (128, 8):
            raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
        if a.shape != (PRED_ORDER, 8):
            raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).")
        out = np.empty_like(coeffs_in, dtype=np.float64)
        for sub in range(8):
            out[:, sub] = _apply_itns_iir(coeffs_in[:, sub], a[:, sub])
        return out

    if a.shape != (PRED_ORDER, 1):
        raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).")
    if coeffs_in.shape == (1024,):
        return _apply_itns_iir(coeffs_in, a[:, 0])
    if coeffs_in.shape == (1024, 1):
        return _apply_itns_iir(coeffs_in[:, 0], a[:, 0]).reshape(1024, 1)
    raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')

View File

@ -0,0 +1,411 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Public Type Aliases
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements Public Type aliases
# ------------------------------------------------------------
from __future__ import annotations
from typing import List, Literal, TypeAlias, TypedDict
import numpy as np
from numpy.typing import NDArray
# -----------------------------------------------------------------------------
# Code enums (for readability; not intended to enforce shapes/lengths)
# -----------------------------------------------------------------------------
FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"]
"""
Frame type codes (AAC):
- "OLS": ONLY_LONG_SEQUENCE
- "LSS": LONG_START_SEQUENCE
- "ESH": EIGHT_SHORT_SEQUENCE
- "LPS": LONG_STOP_SEQUENCE
"""
WinType: TypeAlias = Literal["KBD", "SIN"]
"""
Window type codes (AAC):
- "KBD": Kaiser-Bessel-Derived
- "SIN": sinusoid
"""
ChannelKey: TypeAlias = Literal["chl", "chr"]
"""Channel dictionary keys used in Level payloads."""
# -----------------------------------------------------------------------------
# Array “semantic” aliases
#
# Goal: communicate meaning (time/frequency/window, stereo/channel) without
# forcing strict shapes in the type system.
# -----------------------------------------------------------------------------
FloatArray: TypeAlias = NDArray[np.float64]
"""
Generic float64 NumPy array.
Note:
- We standardize internal numeric computations to float64 for stability and
reproducibility. External I/O can still be float32, but we convert at the
boundaries.
"""
Window: TypeAlias = FloatArray
"""
Time-domain window (weighting sequence), 1-D.
Typical lengths in this assignment:
- Long: 2048
- Short: 256
- Window sequences for LSS/LPS are also 2048
Expected shape: (N,)
dtype: float64
"""
TimeSignal: TypeAlias = FloatArray
"""
Time-domain signal samples, typically 1-D.
Examples:
- Windowed MDCT input: shape (N,)
- IMDCT output: shape (N,)
dtype: float64
"""
StereoSignal: TypeAlias = FloatArray
"""
Time-domain stereo signal stream.
Expected (typical) shape: (N, 2)
- axis 0: time samples
- axis 1: channels [L, R]
dtype: float64
"""
MdctCoeffs: TypeAlias = FloatArray
"""
MDCT coefficient vector, typically 1-D.
Examples:
- Long: shape (1024,)
- Short: shape (128,)
dtype: float64
"""
MdctFrameChannel: TypeAlias = FloatArray
"""
Per-channel MDCT container used in Level-1/2 sequences.
Typical shapes:
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
dtype: float64
Notes
-----
Some parts of the assignment store long-frame coefficients as a column vector
(1024, 1) to match MATLAB conventions. Internally you may also use (1024,)
when convenient, but the semantic meaning is identical.
"""
TnsCoeffs: TypeAlias = FloatArray
"""
Quantized TNS predictor coefficients (one channel).
Typical shapes (Level 2):
- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes)
- Else: (4, 1) (order p=4 for the long frame)
dtype: float64
Notes
-----
The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1.
We store the quantized coefficient values as float64 (typically multiples of 0.1)
to keep the pipeline simple and readable.
"""
FrameT: TypeAlias = FloatArray
"""
Time-domain frame (stereo), as used by the filterbank input/output.
Expected (typical) shape for stereo: (2048, 2)
- axis 0: time samples
- axis 1: channels [L, R]
dtype: float64
"""
FrameChannelT: TypeAlias = FloatArray
"""
Time-domain single-channel frame.
Expected (typical) shape: (2048,)
dtype: float64
"""
FrameF: TypeAlias = FloatArray
"""
Frequency-domain frame (MDCT coefficients), stereo container.
Typical shapes (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: (1024, 2)
- If frame_type == "ESH": (128, 16)
Rationale for ESH (128, 16):
- 8 short subframes per channel => 8 * 2 = 16 columns total
- Each short subframe per stereo is (128, 2), flattened into columns
in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R]
dtype: float64
"""
FrameChannelF: TypeAlias = MdctFrameChannel
"""
Frequency-domain single-channel MDCT coefficients.
Typical shapes (Level 1/2):
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
- If frame_type == "ESH": (128, 8)
dtype: float64
"""
BandRanges: TypeAlias = list[tuple[int, int]]
"""
Bark-band index ranges [start, end] (inclusive) for MDCT lines.
Used by TNS to map MDCT indices k to Bark bands.
"""
BarkTable: TypeAlias = FloatArray
"""
Psychoacoustic Bark band table loaded from TableB219.mat.
Typical shapes:
- Long: (69, 6)
- Short: (42, 6)
"""
BandIndexArray: TypeAlias = NDArray[np.int_]
"""
Array of FFT bin indices per psychoacoustic band.
"""
BandValueArray: TypeAlias = FloatArray
"""
Per-band psychoacoustic values (e.g. Bark position, thresholds).
"""
# Quantizer-related semantic aliases
QuantizedSymbols: TypeAlias = NDArray[np.generic]
"""
Quantized MDCT symbols S(k).
Shapes:
- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols).
"""
ScaleFactors: TypeAlias = NDArray[np.generic]
"""
DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1).
Shapes:
- Long frames: (NB, 1)
- ESH frames: (NB, 8)
"""
GlobalGain: TypeAlias = float | NDArray[np.generic]
"""
Global gain G = alpha(0).
- Long frames: scalar float
- ESH frames: array shape (1, 8)
"""
# Huffman semantic aliases
HuffmanBitstream: TypeAlias = str
"""Huffman-coded bitstream stored as a string of '0'/'1'."""
HuffmanCodebook: TypeAlias = int
"""Huffman codebook id (e.g., 0..11)."""
# -----------------------------------------------------------------------------
# Level 1 AAC sequence payload types
# -----------------------------------------------------------------------------
class AACChannelFrameF(TypedDict):
    """
    Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).

    Keys
    ----
    frame_F:
        The MDCT coefficients for ONE channel.
        Typical shapes:
          - ESH: (128, 8) (8 short subframes)
          - else: (1024, 1) or (1024,)
    """
    frame_F: FrameChannelF


class AACSeq1Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_1 (Level 1).

    Holds the common frame/window type plus one per-channel payload for the
    left ("chl") and right ("chr") channels.
    """
    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF
    chr: AACChannelFrameF


AACSeq1: TypeAlias = List[AACSeq1Frame]
"""
AAC sequence for Level 1:
List of length K (K = number of frames).
Each element is a dict with keys:
  - "frame_type", "win_type", "chl", "chr"
"""
# -----------------------------------------------------------------------------
# Level 2 AAC sequence payload types (TNS)
# -----------------------------------------------------------------------------
class AACChannelFrameF2(TypedDict):
    """
    Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2).

    Keys
    ----
    frame_F:
        The TNS-processed MDCT coefficients for ONE channel.
        Typical shapes:
          - ESH: (128, 8)
          - else: (1024, 1) or (1024,)
    tns_coeffs:
        Quantized TNS predictor coefficients for ONE channel.
        Typical shapes:
          - ESH: (PRED_ORDER, 8)
          - else: (PRED_ORDER, 1)
    """
    frame_F: FrameChannelF
    tns_coeffs: TnsCoeffs


class AACSeq2Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_2 (Level 2).

    Same layout as Level 1 plus the per-channel TNS coefficients inside
    each channel payload.
    """
    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF2
    chr: AACChannelFrameF2


AACSeq2: TypeAlias = List[AACSeq2Frame]
"""
AAC sequence for Level 2:
List of length K (K = number of frames).
Each element is a dict with keys:
  - "frame_type", "win_type", "chl", "chr"
Level 2 adds:
  - per-channel "tns_coeffs"
and stores:
  - per-channel "frame_F" after applying TNS.
"""
# -----------------------------------------------------------------------------
# Level 3 AAC sequence payload types (Quantizer + Huffman)
# -----------------------------------------------------------------------------
class AACChannelFrameF3(TypedDict):
    """
    Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3).

    Keys
    ----
    tns_coeffs:
        Quantized TNS predictor coefficients for ONE channel.
        Shapes:
          - ESH: (PRED_ORDER, 8)
          - else: (PRED_ORDER, 1)
    T:
        Psychoacoustic thresholds per band.
        Shapes:
          - ESH: (NB, 8)
          - else: (NB, 1)
        Note: stored for completeness / debugging; not entropy-coded.
    G:
        Quantized global gains.
        Shapes:
          - ESH: (1, 8) (one per short subframe)
          - else: scalar (or a compatible np scalar)
    sfc:
        Huffman-coded scalefactor differences (DPCM sequence).
    stream:
        Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols).
    codebook:
        Huffman codebook id used for the MDCT symbol stream.
        (Scalefactors typically use the fixed codebook 11 and therefore do
        not need to store an id.)
    """
    tns_coeffs: TnsCoeffs
    T: FloatArray
    G: FloatArray | float
    sfc: HuffmanBitstream
    stream: HuffmanBitstream
    codebook: HuffmanCodebook


class AACSeq3Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_3 (Level 3).

    Same layout as the previous levels with the Level-3 per-channel payload.
    """
    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF3
    chr: AACChannelFrameF3


AACSeq3: TypeAlias = List[AACSeq3Frame]
"""
AAC sequence for Level 3:
List of length K (K = number of frames).
Each element is a dict with keys:
  - "frame_type", "win_type", "chl", "chr"
Level 3 adds (per channel):
  - "tns_coeffs"
  - "T" thresholds (not entropy-coded)
  - "G" global gain(s)
  - "sfc" Huffman-coded scalefactor differences
  - "stream" Huffman-coded MDCT quantized symbols
  - "codebook" Huffman codebook for MDCT symbols
"""

View File

@ -0,0 +1,270 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - AAC Utilities
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Shared utility functions used across AAC encoder/decoder levels.
#
# This module currently provides:
# - MDCT / IMDCT conversions
# - Signal-to-Noise Ratio (SNR) computation in dB
# - Loading and access helpers for psychoacoustic band tables
# (TableB219.mat, Tables B.2.1.9a / B.2.1.9b of the AAC specification)
# ------------------------------------------------------------
from __future__ import annotations
import numpy as np
from pathlib import Path
from scipy.io import loadmat
from core.aac_types import *
# -----------------------------------------------------------------------------
# Global cached data
# -----------------------------------------------------------------------------
# Cached contents of TableB219.mat to avoid repeated disk I/O.
# Keys:
# - "B219a": long-window psychoacoustic bands (69 bands, FFT size 2048)
# - "B219b": short-window psychoacoustic bands (42 bands, FFT size 256)
B219_CACHE: dict[str, BarkTable] | None = None
# -----------------------------------------------------------------------------
# MDCT / IMDCT
# -----------------------------------------------------------------------------
def mdct(s: TimeSignal) -> MdctCoeffs:
"""
MDCT (direct form) as specified in the assignment.
Parameters
----------
s : TimeSignal
Windowed time samples, 1-D array of length N (N = 2048 or 256).
Returns
-------
MdctCoeffs
MDCT coefficients, 1-D array of length N/2.
Definition
----------
X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
where n0 = (N/2 + 1)/2.
"""
s = np.asarray(s, dtype=np.float64).reshape(-1)
N = int(s.shape[0])
if N not in (2048, 256):
raise ValueError("MDCT input length must be 2048 or 256.")
n0 = (N / 2.0 + 1.0) / 2.0
n = np.arange(N, dtype=np.float64) + n0
k = np.arange(N // 2, dtype=np.float64) + 0.5
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2)
X = 2.0 * (s @ C) # (N/2,)
return X
def imdct(X: MdctCoeffs) -> TimeSignal:
"""
IMDCT (direct form) as specified in the assignment.
Parameters
----------
X : MdctCoeffs
MDCT coefficients, 1-D array of length K (K = 1024 or 128).
Returns
-------
TimeSignal
Reconstructed time samples, 1-D array of length N = 2K.
Definition
----------
s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
where n0 = (N/2 + 1)/2.
"""
X = np.asarray(X, dtype=np.float64).reshape(-1)
K = int(X.shape[0])
if K not in (1024, 128):
raise ValueError("IMDCT input length must be 1024 or 128.")
N = 2 * K
n0 = (N / 2.0 + 1.0) / 2.0
n = np.arange(N, dtype=np.float64) + n0
k = np.arange(K, dtype=np.float64) + 0.5
C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K)
s = (2.0 / N) * (C @ X) # (N,)
return s
# -----------------------------------------------------------------------------
# Signal quality metrics
# -----------------------------------------------------------------------------
def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
    """
    Overall Signal-to-Noise Ratio (SNR) in dB between two signals.

    Both inputs are promoted to 2-D (samples, channels) arrays and cropped
    to their common length and channel count; the ratio is then computed
    over every remaining sample.

    Parameters
    ----------
    x_ref : StereoSignal
        Reference (original) signal, typically shaped (N, 2).
    x_hat : StereoSignal
        Reconstructed or processed signal, typically shaped (M, 2).

    Returns
    -------
    float
        SNR in dB:
        - +inf when the noise power is zero (perfect reconstruction),
        - -inf when the reference signal power is zero.
    """
    ref = np.asarray(x_ref, dtype=np.float64)
    hat = np.asarray(x_hat, dtype=np.float64)

    # Promote 1-D inputs to a single-channel column.
    if ref.ndim == 1:
        ref = ref.reshape(-1, 1)
    if hat.ndim == 1:
        hat = hat.reshape(-1, 1)

    # Conservative alignment: common sample count and channel count.
    n_samp = min(ref.shape[0], hat.shape[0])
    n_chan = min(ref.shape[1], hat.shape[1])
    ref = ref[:n_samp, :n_chan]
    hat = hat[:n_samp, :n_chan]

    noise = ref - hat
    p_signal = float((ref * ref).sum())
    p_noise = float((noise * noise).sum())
    if p_noise <= 0.0:
        return float("inf")
    if p_signal <= 0.0:
        return float("-inf")
    return float(10.0 * np.log10(p_signal / p_noise))
# -----------------------------------------------------------------------------
# Psychoacoustic band tables (TableB219.mat)
# -----------------------------------------------------------------------------
def load_b219_tables() -> dict[str, BarkTable]:
    """
    Load (and cache) the psychoacoustic band tables from TableB219.mat.

    The project layout assumes a 'material' directory is available in the
    current working directory when running the tests or the
    level_1 / level_2 / level_3 entrypoints. The tables are read once and
    the cached dictionary is returned on every later call.

    Returns
    -------
    dict[str, BarkTable]
        - "B219a": long-window psychoacoustic table
          (69 bands, FFT size 2048 / 1024 spectral lines)
        - "B219b": short-window psychoacoustic table
          (42 bands, FFT size 256 / 128 spectral lines)
    """
    global B219_CACHE
    if B219_CACHE is None:
        mat_path = Path("material") / "TableB219.mat"
        if not mat_path.exists():
            raise FileNotFoundError(
                "Could not locate material/TableB219.mat in the current working directory."
            )
        raw = loadmat(str(mat_path))
        if any(key not in raw for key in ("B219a", "B219b")):
            raise ValueError(
                "TableB219.mat missing required variables 'B219a' and/or 'B219b'."
            )
        B219_CACHE = {
            key: np.asarray(raw[key], dtype=np.float64)
            for key in ("B219a", "B219b")
        }
    return B219_CACHE
def get_table(frame_type: FrameType) -> tuple[BarkTable, int]:
    """
    Pick the psychoacoustic band table and FFT size for an AAC frame type.

    Parameters
    ----------
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    table : BarkTable
        B219b for ESH (short) subframes, B219a for all long frame types.
    N : int
        FFT size matching the table: 256 for ESH, 2048 otherwise.
    """
    tables = load_b219_tables()
    key, fft_size = ("B219b", 256) if frame_type == "ESH" else ("B219a", 2048)
    return tables[key], fft_size
def band_limits(
    table: BarkTable,
) -> tuple[BandIndexArray, BandIndexArray, BandValueArray, BandValueArray]:
    """
    Split a TableB2.1.9 psychoacoustic table into its per-band columns.

    The column layout follows the provided TableB219.mat file and the
    AAC specification tables B.2.1.9a / B.2.1.9b.

    Parameters
    ----------
    table : BarkTable
        Psychoacoustic band table (B219a or B219b).

    Returns
    -------
    wlow : BandIndexArray
        Lower FFT bin index (inclusive) for each band.
    whigh : BandIndexArray
        Upper FFT bin index (inclusive) for each band.
    bval : BandValueArray
        Bark-scale band position values (used in the spreading function).
    qthr_db : BandValueArray
        Threshold in quiet for each band, in dB.
    """
    lower_bins = table[:, 1].astype(int)
    upper_bins = table[:, 2].astype(int)
    bark_vals = table[:, 4].astype(np.float64)
    quiet_thr = table[:, 5].astype(np.float64)
    return lower_bins, upper_bins, bark_vals, quiet_thr

237
source/level_3/level_3.py Normal file
View File

@ -0,0 +1,237 @@
# ------------------------------------------------------------
# AAC Coder/Decoder - Level 3 Wrappers + Demo
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# Level 3 wrapper module.
#
# This file provides:
# - Thin wrappers for Level 3 API functions (encode/decode) that delegate
# to the corresponding core implementations.
# - A demo function that runs end-to-end and computes:
# * SNR
# * bitrate (coded)
# * compression ratio
# - A small CLI entrypoint for convenience.
# ------------------------------------------------------------
from __future__ import annotations
from pathlib import Path
from typing import Optional, Tuple, Union
import os
import soundfile as sf
from core.aac_types import AACSeq3, StereoSignal
from core.aac_coder import aac_coder_3 as core_aac_coder_3
from core.aac_coder import aac_read_wav_stereo_48k
from core.aac_decoder import aac_decoder_3 as core_aac_decoder_3
from core.aac_utils import snr_db
# -----------------------------------------------------------------------------
# Helpers (Level 3 metrics)
# -----------------------------------------------------------------------------
def _wav_duration_seconds(wav_path: Path) -> float:
    """Return the duration of *wav_path* in seconds, from soundfile metadata."""
    meta = sf.info(str(wav_path))
    # Guard against corrupt / nonsensical headers before dividing.
    if meta.samplerate <= 0:
        raise ValueError("Invalid samplerate in WAV header.")
    if meta.frames < 0:
        raise ValueError("Invalid frame count in WAV header.")
    return float(meta.frames) / float(meta.samplerate)
def _bitrate_before_from_file(wav_path: Path) -> float:
    """
    Estimate the input bitrate (bits/s) from file size and duration.

    Note:
        This is a file-based estimate (the WAV header is included), which is
        acceptable for a simple compression ratio metric.
    """
    seconds = _wav_duration_seconds(wav_path)
    if seconds <= 0.0:
        raise ValueError("Non-positive WAV duration.")
    total_bits = 8.0 * float(os.path.getsize(wav_path))
    return total_bits / seconds
def _bitrate_after_from_aacseq(aac_seq_3: AACSeq3, duration_sec: float) -> float:
"""
Compute coded bitrate (bits/s) from Huffman streams stored in AACSeq3.
We count bits from:
- scalefactor Huffman bitstream ("sfc")
- MDCT symbols Huffman bitstream ("stream")
for both channels and all frames.
Note:
We intentionally ignore side-info overhead (frame_type, G, T, TNS coeffs,
codebook ids, etc.). This matches a common simplified metric in demos.
"""
if duration_sec <= 0.0:
raise ValueError("Non-positive duration for bitrate computation.")
total_bits = 0
for fr in aac_seq_3:
total_bits += len(fr["chl"]["sfc"])
total_bits += len(fr["chl"]["stream"])
total_bits += len(fr["chr"]["sfc"])
total_bits += len(fr["chr"]["stream"])
return float(total_bits) / float(duration_sec)
# -----------------------------------------------------------------------------
# Public Level 3 API (wrappers)
# -----------------------------------------------------------------------------
def aac_coder_3(
    filename_in: Union[str, Path],
    filename_aac_coded: Optional[Union[str, Path]] = None,
) -> AACSeq3:
    """
    Level-3 AAC encoder — thin wrapper over the core implementation.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename. Assumption: stereo audio, sampling rate 48 kHz.
    filename_aac_coded : Optional[Union[str, Path]]
        Optional filename to store the encoded AAC sequence (e.g., .mat).

    Returns
    -------
    AACSeq3
        List of encoded frames (Level 3 schema).
    """
    # Verbose by design: this wrapper targets interactive/demo use.
    return core_aac_coder_3(filename_in, filename_aac_coded, verbose=True)
def i_aac_coder_3(
    aac_seq_3: AACSeq3,
    filename_out: Union[str, Path],
) -> StereoSignal:
    """
    Level-3 AAC decoder — thin wrapper over the core implementation.

    Parameters
    ----------
    aac_seq_3 : AACSeq3
        Encoded sequence as produced by aac_coder_3().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded time-domain audio, stereo, shape (N, 2), dtype float64.
    """
    # Verbose by design: this wrapper targets interactive/demo use.
    return core_aac_decoder_3(aac_seq_3, filename_out, verbose=True)
# -----------------------------------------------------------------------------
# Demo (Level 3)
# -----------------------------------------------------------------------------
def demo_aac_3(
    filename_in: Union[str, Path],
    filename_out: Union[str, Path],
    filename_aac_coded: Optional[Union[str, Path]] = None,
) -> Tuple[float, float, float]:
    """
    End-to-end demonstration of the Level-3 codec.

    Runs:
      - aac_coder_3(filename_in, filename_aac_coded)
      - i_aac_coder_3(aac_seq_3, filename_out)
    and computes:
      - total SNR between original and decoded audio
      - coded bitrate (bits/s) based on the Huffman streams
      - compression ratio (bitrate_before / bitrate_after)

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    filename_out : Union[str, Path]
        Output WAV filename (stereo, 48 kHz).
    filename_aac_coded : Optional[Union[str, Path]]
        Optional filename to store the encoded AAC sequence (e.g., .mat).

    Returns
    -------
    Tuple[float, float, float]
        (SNR_dB, bitrate_after_bits_per_s, compression_ratio)
    """
    in_path = Path(filename_in)
    out_path = Path(filename_out)
    coded_path = Path(filename_aac_coded) if filename_aac_coded else None

    # Reference audio, validated the same way as the codec input path.
    x_ref, fs_ref = aac_read_wav_stereo_48k(in_path)
    if int(fs_ref) != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    # Encode, then decode back to a WAV file.
    encoded = aac_coder_3(in_path, coded_path)
    x_hat = i_aac_coder_3(encoded, out_path)

    # Sanity check: the decoded file must exist and carry the expected rate.
    _, fs_hat = sf.read(str(out_path), always_2d=True)
    if int(fs_hat) != 48000:
        raise ValueError("Decoded output sampling rate must be 48 kHz.")

    # Metrics
    snr_value = snr_db(x_ref, x_hat)
    seconds = _wav_duration_seconds(in_path)
    rate_in = _bitrate_before_from_file(in_path)
    rate_out = _bitrate_after_from_aacseq(encoded, seconds)
    ratio = float("inf") if rate_out <= 0.0 else rate_in / rate_out
    return float(snr_value), float(rate_out), float(ratio)
# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Example:
    #   cd level_3
    #   python -m level_3 input.wav output.wav
    # e.g.:
    #   python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav
    # or, storing the coded sequence as well:
    #   python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav aac_seq_3.mat
    import sys

    argv = sys.argv
    if len(argv) not in (3, 4):
        raise SystemExit("Usage: python -m level_3 <input.wav> <output.wav> [aac_seq_3.mat]")
    in_wav, out_wav = Path(argv[1]), Path(argv[2])
    aac_mat = Path(argv[3]) if len(argv) == 4 else None

    print(f"Encoding/Decoding {in_wav} to {out_wav}")
    if aac_mat is not None:
        print(f"Storing coded sequence to {aac_mat}")

    snr, bitrate, compression = demo_aac_3(in_wav, out_wav, aac_mat)
    print(f"SNR = {snr:.3f} dB")
    print(f"Bitrate (coded) = {bitrate:.2f} bits/s")
    print(f"Compression ratio = {compression:.4f}")

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,400 @@
import numpy as np
import scipy.io as sio
import os
# ------------------ LOAD LUT ------------------
def load_LUT(mat_filename=None):
    """
    Load the list of AAC Huffman codebooks (LUTs) from huffCodebooks.mat.

    Parameters
    ----------
    mat_filename : str or None
        Path to huffCodebooks.mat. Defaults to the file located next to
        this module.

    Returns
    -------
    huffLUT : list
        Slots 1..11 hold codebook dictionaries (slot 0 is unused), each with
        keys: 'LUT', 'invTable', 'codebook', 'nTupleSize', 'maxAbsCodeVal',
        'signedValues'.
    """
    if mat_filename is None:
        here = os.path.dirname(os.path.abspath(__file__))
        mat_filename = os.path.join(here, "huffCodebooks.mat")
    raw = sio.loadmat(mat_filename)['huffCodebooks'].squeeze()
    codebooks = [np.array(raw[i]) for i in range(11)]

    # Inverse VLC tables: rebuild every codeword as a zero-padded bit string
    # (column 3 = code value, column 2 = code length), then index them.
    inv_tables = []
    for cb in codebooks:
        values = cb[:, 2].astype(int)
        lengths = cb[:, 1].astype(int)
        bitstrings = [format(v, f'0{ln}b') for v, ln in zip(values, lengths)]
        inv_tables.append(vlc_table(bitstrings))

    # (nTupleSize, maxAbsCodeVal, signedValues) per AAC codebook 1..11.
    params = [
        (4, 1, True),
        (4, 1, True),
        (4, 2, False),
        (4, 2, False),
        (2, 4, True),
        (2, 4, True),
        (2, 7, False),
        (2, 7, False),
        (2, 12, False),
        (2, 12, False),
        (2, 16, False),
    ]
    huffLUT = [None] * 12  # slot 0 intentionally unused
    for idx, (tuple_size, max_abs, signed) in enumerate(params, start=1):
        huffLUT[idx] = {
            'LUT': codebooks[idx - 1],
            'invTable': inv_tables[idx - 1],
            'codebook': idx,
            'nTupleSize': tuple_size,
            'maxAbsCodeVal': max_abs,
            'signedValues': signed,
        }
    return huffLUT
def vlc_table(code_array):
    """
    Build a binary-trie decoding table from a list of Huffman codewords.

    Parameters
    ----------
    code_array : list of str
        Each entry is one codeword as a '0'/'1' string (e.g. '0101').

    Returns
    -------
    h : numpy.ndarray of shape (num_nodes, 3)
        Row layout: [next_node_if_0, next_node_if_1, symbol_index].
        Row 0 is the root; symbol_index is 1-based (0 marks an internal node).
    """
    trie = np.zeros((1, 3), dtype=int)
    for symbol, code in enumerate(code_array, start=1):
        node = 0
        for ch in code:
            branch = int(ch)
            child = trie[node, branch]
            if child == 0:
                # Unvisited branch: append a fresh node and link it in.
                trie = np.vstack([trie, [0, 0, 0]])
                child = trie.shape[0] - 1
                trie[node, branch] = child
            node = child
        trie[node, 2] = symbol
    return trie
# ------------------ ENCODE ------------------
def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None):
    """
    Huffman-encode a sequence of quantized coefficients.

    This function selects the appropriate Huffman codebook based on the
    maximum absolute value of the input coefficients (trying both codebooks
    of each candidate pair and keeping the shorter bitstream), encodes the
    coefficients into a binary Huffman bitstream, and returns both the
    bitstream and the selected codebook index.

    This is the Python equivalent of the MATLAB `encodeHuff.m` function used
    in audio/image coding (e.g., scale factor band encoding). The input
    coefficient sequence is grouped into fixed-size tuples as defined by
    the chosen Huffman LUT. Zero-padding may be applied internally.

    Parameters
    ----------
    coeff_sec : array_like of int
        1-D array of quantized integer coefficients to encode.
        Typically corresponds to a "section" or scale-factor band.
    huff_LUT_list : list
        List of Huffman lookup-table dictionaries as returned by `loadLUT()`.
        Index 1..11 correspond to valid Huffman codebooks. Index 0 is unused.
    force_codebook : int or None
        When set, encode unconditionally with this codebook.
        NOTE(review): in this case only the bitstream (str) is returned —
        not a (stream, codebook) tuple — callers must expect that shape.

    Returns
    -------
    huffSec : str
        Huffman-encoded bitstream represented as a string of '0' and '1'
        characters.
    huffCodebook : int
        Index (1..11) of the Huffman codebook used for encoding.
        A value of 0 indicates a special all-zero section.
    """
    if force_codebook is not None:
        return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec)

    def _shorter_of(cb_a, cb_b):
        # Encode with both codebooks of a pair and keep the shorter stream
        # (ties go to the first, matching the original <= comparison).
        sec_a = huff_LUT_code_1(huff_LUT_list[cb_a], coeff_sec)
        sec_b = huff_LUT_code_1(huff_LUT_list[cb_b], coeff_sec)
        return (sec_a, cb_a) if len(sec_a) <= len(sec_b) else (sec_b, cb_b)

    maxAbsVal = np.max(np.abs(coeff_sec))
    if maxAbsVal == 0:
        # All-zero section: codebook 0, nothing emitted.
        return huff_LUT_code_0(), 0
    if maxAbsVal == 1:
        return _shorter_of(1, 2)
    if maxAbsVal == 2:
        return _shorter_of(3, 4)
    if maxAbsVal in (3, 4):
        return _shorter_of(5, 6)
    if maxAbsVal in (5, 6, 7):
        return _shorter_of(7, 8)
    if maxAbsVal in (8, 9, 10, 11, 12):
        return _shorter_of(9, 10)
    if maxAbsVal in (13, 14, 15):
        return huff_LUT_code_1(huff_LUT_list[11], coeff_sec), 11
    # |value| > 15: codebook 11 with escape sequences.
    return huff_LUT_code_ESC(huff_LUT_list[11], coeff_sec), 11
def huff_LUT_code_1(huff_LUT, coeff_sec):
    """
    Encode a coefficient section with a single (non-escape) Huffman codebook.

    Coefficients are zero-padded to a whole number of n-tuples; each tuple
    is mapped to a LUT row index (mixed-radix combination of the tuple
    values) whose codeword/length columns yield the output bits. For
    unsigned codebooks one sign bit per tuple element is appended after
    the codeword.
    """
    table = huff_LUT['LUT']
    tuple_size = huff_LUT['nTupleSize']
    max_abs = huff_LUT['maxAbsCodeVal']
    signed = huff_LUT['signedValues']

    num_tuples = int(np.ceil(len(coeff_sec) / tuple_size))
    if signed:
        shifted = coeff_sec + max_abs  # shift into 0..2*max_abs
        radix = 2 * max_abs + 1
    else:
        shifted = coeff_sec
        radix = max_abs + 1

    padded = np.zeros(num_tuples * tuple_size, dtype=int)
    padded[:len(shifted)] = shifted
    weights = radix ** np.arange(tuple_size - 1, -1, -1)

    pieces = []
    for t in range(num_tuples):
        group = padded[t * tuple_size:(t + 1) * tuple_size]
        row = int(np.abs(group) @ weights)
        code_bits = format(int(table[row, 2]), f'0{int(table[row, 1])}b')
        if signed:
            pieces.append(code_bits)
        else:
            sign_bits = ''.join('1' if v < 0 else '0' for v in group)
            pieces.append(code_bits + sign_bits)
    return ''.join(pieces)
def huff_LUT_code_0():
    """Codebook 0 (all-zero section): nothing is emitted to the stream."""
    return ''
def huff_LUT_code_ESC(huff_LUT, coeff_sec):
    """
    Encode a coefficient section with codebook 11 using escape sequences.

    Magnitudes above 15 are clamped to +/-16 for the codeword lookup and
    followed by an escape sequence: N one-bits, a zero bit, then the N4
    least-significant bits of |v| - 2**N4, where N4 = floor(log2(|v|)) and
    N = N4 - 4. One sign bit per tuple element precedes the escape data.
    """
    LUT = huff_LUT['LUT']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
    base = maxAbsCodeVal + 1
    # Zero-pad the section to a whole number of n-tuples.
    coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
    coeffPad[:len(coeff_sec)] = coeff_sec
    huffSec = []
    powers = base ** np.arange(nTupleSize - 1, -1, -1)
    for i in range(numTuples):
        nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
        # Replace zeros with eps so log2 below stays defined.
        lnTuple = nTuple.astype(float)
        lnTuple[lnTuple == 0] = np.finfo(float).eps
        N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int))
        N = np.maximum(0, N4 - 4)
        # Escape marker: magnitudes above 15 are looked up as +/-16.
        esc = np.abs(nTuple) > 15
        nTupleESC = nTuple.copy()
        nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16
        # Mixed-radix row index into the codebook, then codeword bits.
        huffIndex = int(np.abs(nTupleESC) @ powers)
        hexVal = LUT[huffIndex, 2]
        huffLen = LUT[huffIndex, 1]
        bits = format(int(hexVal), f'0{int(huffLen)}b')
        # Escape payloads for the escaped elements, in tuple order.
        escSeq = ''
        for k in range(nTupleSize):
            if esc[k]:
                escSeq += '1' * N[k]
                escSeq += '0'
                escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b')
        # Emission order per tuple: codeword, sign bits, escape data.
        signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
        huffSec.append(bits + signBits + escSeq)
    return ''.join(huffSec)
# ------------------ DECODE ------------------
def decode_huff(huff_sec, huff_LUT):
    """
    Decode a Huffman-encoded stream.

    Parameters
    ----------
    huff_sec : array-like of int or str
        Huffman encoded stream as a sequence of 0 and 1 (string or list/array).
    huff_LUT : dict
        Huffman lookup table with keys:
        - 'invTable': inverse (trie) table, rows [next_if_0, next_if_1, symbol]
        - 'codebook': codebook number (1..11)
        - 'nTupleSize': tuple size
        - 'maxAbsCodeVal': maximum absolute code value
        - 'signedValues': True when signs are embedded in the codewords

    Returns
    -------
    decCoeffs : list
        Decoded quantized coefficients.

    Notes
    -----
    BUGFIX: sign bits are only present in the stream for UNSIGNED codebooks
    (see huff_LUT_code_1, which appends them only when signedValues is
    False). The previous version consumed nTupleSize sign bits per tuple
    unconditionally, misaligning the stream for signed codebooks (1, 2, 5, 6).
    """
    h = huff_LUT['invTable']
    huffCodebook = huff_LUT['codebook']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    signedValues = huff_LUT['signedValues']

    # Convert string to array of ints
    if isinstance(huff_sec, str):
        huff_sec = np.array([int(b) for b in huff_sec])

    eos = False
    decCoeffs = []
    streamIndex = 0
    while not eos:
        wordbit = 0
        r = 0  # start at the trie root
        # Walk the inverse table until a leaf (both children 0) is reached.
        while True:
            b = huff_sec[streamIndex + wordbit]
            wordbit += 1
            r = h[r, b]
            if h[r, 0] == 0 and h[r, 1] == 0:
                symbolIndex = h[r, 2] - 1  # zero-based symbol
                streamIndex += wordbit
                break

        # Unpack the n-tuple magnitudes from the mixed-radix symbol index.
        # Signed codebooks store values shifted by +maxAbsCodeVal.
        if signedValues:
            base = 2 * maxAbsCodeVal + 1
            offset = maxAbsCodeVal
        else:
            base = maxAbsCodeVal + 1
            offset = 0
        nTupleDec = []
        tmp = symbolIndex
        for p in reversed(range(nTupleSize)):
            val = tmp // (base ** p)
            nTupleDec.append(val - offset)
            tmp = tmp % (base ** p)
        nTupleDec = np.array(nTupleDec)

        # Sign bits exist in the stream only for unsigned codebooks;
        # signed codebooks embed the sign in the codeword itself.
        if signedValues:
            nTupleSign = np.ones(nTupleSize)
        else:
            nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize]
            nTupleSign = -(np.sign(nTupleSignBits - 0.5))  # bit 0 -> +1, bit 1 -> -1
            streamIndex += nTupleSize
            nTupleDec = nTupleDec * nTupleSign

        # Handle escape sequences (codebook 11 only): magnitude 16 is a marker.
        escIndex = np.where(np.abs(nTupleDec) == 16)[0]
        if huffCodebook == 11 and escIndex.size > 0:
            for idx in escIndex:
                # Count leading ones (N), skip the terminating zero bit.
                N = 0
                b = huff_sec[streamIndex]
                while b:
                    N += 1
                    b = huff_sec[streamIndex + N]
                streamIndex += N + 1
                # Read N4 = N + 4 magnitude bits: value = 2**N4 + payload.
                N4 = N + 4
                escape_word = huff_sec[streamIndex:streamIndex + N4]
                escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2)
                nTupleDec[idx] = escape_value
                streamIndex += N4
            # Re-apply signs to the replaced (positive) escape magnitudes.
            nTupleDec[escIndex] *= nTupleSign[escIndex]

        decCoeffs.extend(nTupleDec.tolist())
        if streamIndex >= len(huff_sec):
            eos = True
    return decCoeffs