Level 3: First positive SNR version
This commit is contained in:
parent
4ebee28e4e
commit
cd2b89bd73
@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) ->
|
|||||||
# Level 1 encoder
|
# Level 1 encoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
def aac_coder_1(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> AACSeq1:
|
||||||
"""
|
"""
|
||||||
Level-1 AAC encoder.
|
Level-1 AAC encoder.
|
||||||
|
|
||||||
@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
filename_in : Union[str, Path]
|
filename_in : Union[str, Path]
|
||||||
Input WAV filename.
|
Input WAV filename.
|
||||||
Assumption: stereo audio, sampling rate 48 kHz.
|
Assumption: stereo audio, sampling rate 48 kHz.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
aac_seq: AACSeq1 = []
|
aac_seq: AACSeq1 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
win_type: WinType = WIN_TYPE
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
|
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
||||||
|
|
||||||
aac_seq.append({
|
aac_seq.append({
|
||||||
"frame_type": frame_type,
|
"frame_type": frame_type,
|
||||||
"win_type": win_type,
|
"win_type": WIN_TYPE,
|
||||||
"chl": {"frame_F": chl_f},
|
"chl": {"frame_F": chl_f},
|
||||||
"chr": {"frame_F": chr_f},
|
"chr": {"frame_F": chr_f},
|
||||||
})
|
})
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
def aac_coder_2(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> AACSeq2:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC encoder (Level 1 + TNS).
|
Level-2 AAC encoder (Level 1 + TNS).
|
||||||
|
|
||||||
@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
----------
|
----------
|
||||||
filename_in : Union[str, Path]
|
filename_in : Union[str, Path]
|
||||||
Input WAV filename (stereo, 48 kHz).
|
Input WAV filename (stereo, 48 kHz).
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
aac_seq: AACSeq2 = []
|
aac_seq: AACSeq2 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
# Level 1 analysis (packed stereo container)
|
# Level 1 analysis (packed stereo container)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
|
|
||||||
# Unpack to per-channel (as you already do in Level 1)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
if frame_type == "ESH":
|
|
||||||
chl_f = np.empty((128, 8), dtype=np.float64)
|
|
||||||
chr_f = np.empty((128, 8), dtype=np.float64)
|
|
||||||
for j in range(8):
|
|
||||||
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
|
|
||||||
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
|
|
||||||
else:
|
|
||||||
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
|
|
||||||
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
|
|
||||||
|
|
||||||
# Level 2: apply TNS per channel
|
# Level 2: apply TNS per channel
|
||||||
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||||
@ -370,8 +378,12 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
def aac_coder_3(
|
def aac_coder_3(
|
||||||
filename_in: Union[str, Path],
|
filename_in: Union[str, Path],
|
||||||
filename_aac_coded: Union[str, Path] | None = None,
|
filename_aac_coded: Union[str, Path] | None = None,
|
||||||
|
verbose: bool = False,
|
||||||
) -> AACSeq3:
|
) -> AACSeq3:
|
||||||
"""
|
"""
|
||||||
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
||||||
@ -389,6 +402,8 @@ def aac_coder_3(
|
|||||||
Input WAV filename (stereo, 48 kHz).
|
Input WAV filename (stereo, 48 kHz).
|
||||||
filename_aac_coded : Union[str, Path] | None
|
filename_aac_coded : Union[str, Path] | None
|
||||||
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -416,15 +431,14 @@ def aac_coder_3(
|
|||||||
aac_seq: AACSeq3 = []
|
aac_seq: AACSeq3 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
# Pin win_type to the WinType literal for type checkers.
|
|
||||||
win_type: WinType = WIN_TYPE
|
|
||||||
|
|
||||||
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
||||||
prev1_L = np.zeros((2048,), dtype=np.float64)
|
prev1_L = np.zeros((2048,), dtype=np.float64)
|
||||||
prev2_L = np.zeros((2048,), dtype=np.float64)
|
prev2_L = np.zeros((2048,), dtype=np.float64)
|
||||||
prev1_R = np.zeros((2048,), dtype=np.float64)
|
prev1_R = np.zeros((2048,), dtype=np.float64)
|
||||||
prev2_R = np.zeros((2048,), dtype=np.float64)
|
prev2_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -440,7 +454,7 @@ def aac_coder_3(
|
|||||||
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
# Analysis filterbank (stereo packed)
|
# Analysis filterbank (stereo packed)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
|
|
||||||
# TNS per channel
|
# TNS per channel
|
||||||
@ -474,32 +488,35 @@ def aac_coder_3(
|
|||||||
# Codebook 11:
|
# Codebook 11:
|
||||||
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
||||||
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
||||||
sf_cb = 11
|
# sf_cb = 11
|
||||||
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
# sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
||||||
|
#
|
||||||
|
# sfc_L_dpcm = np.clip(
|
||||||
|
# sfc_L_dpcm,
|
||||||
|
# -sf_max_abs,
|
||||||
|
# sf_max_abs,
|
||||||
|
# ).astype(np.int64, copy=False)
|
||||||
|
#
|
||||||
|
# sfc_R_dpcm = np.clip(
|
||||||
|
# sfc_R_dpcm,
|
||||||
|
# -sf_max_abs,
|
||||||
|
# sf_max_abs,
|
||||||
|
# ).astype(np.int64, copy=False)
|
||||||
|
|
||||||
sfc_L_dpcm = np.clip(
|
sfc_L_stream, cb_sfc_L = aac_encode_huff(
|
||||||
sfc_L_dpcm,
|
|
||||||
-sf_max_abs,
|
|
||||||
sf_max_abs,
|
|
||||||
).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_R_dpcm = np.clip(
|
|
||||||
sfc_R_dpcm,
|
|
||||||
-sf_max_abs,
|
|
||||||
sf_max_abs,
|
|
||||||
).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_L_stream, _ = aac_encode_huff(
|
|
||||||
sfc_L_dpcm.reshape(-1, order="F"),
|
sfc_L_dpcm.reshape(-1, order="F"),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
force_codebook=sf_cb,
|
# force_codebook=11,
|
||||||
)
|
)
|
||||||
sfc_R_stream, _ = aac_encode_huff(
|
sfc_R_stream, cb_sfc_R = aac_encode_huff(
|
||||||
sfc_R_dpcm.reshape(-1, order="F"),
|
sfc_R_dpcm.reshape(-1, order="F"),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
force_codebook=sf_cb,
|
# force_codebook=11,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
||||||
|
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
|
||||||
|
|
||||||
mdct_L_stream, cb_L = aac_encode_huff(
|
mdct_L_stream, cb_L = aac_encode_huff(
|
||||||
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
@ -512,7 +529,7 @@ def aac_coder_3(
|
|||||||
# Typed dict construction helps static analyzers validate the schema.
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
frame_out: AACSeq3Frame = {
|
frame_out: AACSeq3Frame = {
|
||||||
"frame_type": frame_type,
|
"frame_type": frame_type,
|
||||||
"win_type": win_type,
|
"win_type": WIN_TYPE,
|
||||||
"chl": {
|
"chl": {
|
||||||
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
||||||
"T": np.asarray(T_L, dtype=np.float64),
|
"T": np.asarray(T_L, dtype=np.float64),
|
||||||
@ -539,6 +556,11 @@ def aac_coder_3(
|
|||||||
prev1_R = frame_R
|
prev1_R = frame_R
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
# Optional: store to .mat for the assignment wrapper
|
# Optional: store to .mat for the assignment wrapper
|
||||||
if filename_aac_coded is not None:
|
if filename_aac_coded is not None:
|
||||||
@ -548,6 +570,5 @@ def aac_coder_3(
|
|||||||
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
||||||
do_compression=True,
|
do_compression=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|||||||
@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
|
|||||||
# Level 1 decoder
|
# Level 1 decoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_1(
|
||||||
|
aac_seq_1: AACSeq1,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-1 AAC decoder (inverse of aac_coder_1()).
|
Level-1 AAC decoder (inverse of aac_coder_1()).
|
||||||
|
|
||||||
@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_1().
|
Encoded sequence as produced by aac_coder_1().
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename. Assumption: 48 kHz, stereo.
|
Output WAV filename. Assumption: 48 kHz, stereo.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
for i, fr in enumerate(aac_seq_1):
|
for i, fr in enumerate(aac_seq_1):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start:start + win, :] += frame_t_hat
|
y_pad[start:start + win, :] += frame_t_hat
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
|
y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
# Level 1 assumption: 48 kHz output.
|
# Level 1 assumption: 48 kHz output.
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
|
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
# Level 2 decoder
|
# Level 2 decoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_2(
|
||||||
|
aac_seq_2: AACSeq2,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC decoder (inverse of aac_coder_2).
|
Level-2 AAC decoder (inverse of aac_coder_2).
|
||||||
|
|
||||||
@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_2().
|
Encoded sequence as produced by aac_coder_2().
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename.
|
Output WAV filename.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
for i, fr in enumerate(aac_seq_2):
|
for i, fr in enumerate(aac_seq_2):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start : start + win, :] += frame_t_hat
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y = aac_remove_padding(y_pad, hop=hop)
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_3(
|
||||||
|
aac_seq_3: AACSeq3,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False,
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-3 AAC decoder (inverse of aac_coder_3).
|
Level-3 AAC decoder (inverse of aac_coder_3).
|
||||||
|
|
||||||
@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_3.
|
Encoded sequence as produced by aac_coder_3.
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename.
|
Output WAV filename.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
|
|
||||||
for i, fr in enumerate(aac_seq_3):
|
for i, fr in enumerate(aac_seq_3):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start : start + win, :] += frame_t_hat
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y = aac_remove_padding(y_pad, hop=hop)
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|||||||
@ -316,8 +316,8 @@ def _psycho_one_window(
|
|||||||
nb = en * bc
|
nb = en * bc
|
||||||
|
|
||||||
# Threshold in quiet (convert from dB to power domain):
|
# Threshold in quiet (convert from dB to power domain):
|
||||||
# qthr_power = (N/2) * 10^(qthr_db/10)
|
# qthr_power = eps * (N/2) * 10^(qthr_db/10)
|
||||||
qthr_power = (N / 2.0) * (10.0 ** (qthr_db / 10.0))
|
qthr_power = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0))
|
||||||
|
|
||||||
# Final masking threshold per band:
|
# Final masking threshold per band:
|
||||||
# np(b) = max(nb(b), qthr(b))
|
# np(b) = max(nb(b), qthr(b))
|
||||||
|
|||||||
@ -25,38 +25,64 @@ from core.aac_utils import snr_db
|
|||||||
from core.aac_types import *
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
# Helper "fixtures" for aac_coder_1 / i_aac_coder_1
|
# -----------------------------------------------------------------------------
|
||||||
|
# Fixtures (small wav logic)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def wav_in_path() -> Path:
|
||||||
|
"""
|
||||||
|
Provided input WAV used for end-to-end tests.
|
||||||
|
|
||||||
|
Expected project layout:
|
||||||
|
source/material/LicorDeCalandraca.wav
|
||||||
|
"""
|
||||||
|
return Path(__file__).resolve().parents[2] / "material" / "LicorDeCalandraca.wav"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def tmp_stereo_wav(tmp_path: Path) -> Path:
|
def mk_random_stereo_wav(tmp_path: Path, request: pytest.FixtureRequest) -> Path:
|
||||||
"""
|
"""
|
||||||
Create a temporary 48 kHz stereo WAV with random samples.
|
Create a temporary 48 kHz stereo WAV with random samples.
|
||||||
|
|
||||||
|
Length (in seconds) must be provided via indirect parametrization.
|
||||||
"""
|
"""
|
||||||
|
length = float(request.param)
|
||||||
rng = np.random.default_rng(123)
|
rng = np.random.default_rng(123)
|
||||||
fs = 48000
|
fs = 48000
|
||||||
|
|
||||||
# ~1 second of audio (kept small for test speed).
|
n = int(fs * length)
|
||||||
n = fs
|
|
||||||
x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64)
|
x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64)
|
||||||
|
|
||||||
wav_path = tmp_path / "in.wav"
|
wav_path = tmp_path / "in_random.wav"
|
||||||
sf.write(str(wav_path), x, fs)
|
sf.write(str(wav_path), x, fs)
|
||||||
return wav_path
|
return wav_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def mk_actual_stereo_wav(tmp_path: Path, wav_in_path: Path, request: pytest.FixtureRequest) -> Path:
|
||||||
|
"""
|
||||||
|
Create a temporary 48 kHz stereo WAV by chopping from the provided material WAV.
|
||||||
|
Length can be overridden via indirect parametrization.
|
||||||
|
"""
|
||||||
|
length = float(getattr(request, "param", 0.25)) # seconds (default: small)
|
||||||
|
|
||||||
|
x, fs = aac_read_wav_stereo_48k(wav_in_path)
|
||||||
|
n = int(fs * length)
|
||||||
|
x_short = x[:n, :]
|
||||||
|
|
||||||
|
wav_path = tmp_path / "in_actual.wav"
|
||||||
|
sf.write(str(wav_path), x_short, fs)
|
||||||
|
return wav_path
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Helper-function tests
|
# Helper-function tests
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None:
|
@pytest.mark.parametrize("mk_random_stereo_wav", [2.0], indirect=True)
|
||||||
"""
|
def test_aac_read_wav_stereo_48k_roundtrip(mk_random_stereo_wav: Path) -> None:
|
||||||
Contract test for aac_read_wav_stereo_48k():
|
x, fs = aac_read_wav_stereo_48k(mk_random_stereo_wav)
|
||||||
- Reads stereo WAV
|
|
||||||
- Returns float64 array with shape (N,2)
|
|
||||||
- Returns fs = 48000
|
|
||||||
"""
|
|
||||||
x, fs = aac_read_wav_stereo_48k(tmp_stereo_wav)
|
|
||||||
|
|
||||||
assert int(fs) == 48000
|
assert int(fs) == 48000
|
||||||
assert isinstance(x, np.ndarray)
|
assert isinstance(x, np.ndarray)
|
||||||
@ -67,10 +93,6 @@ def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def test_aac_remove_padding_removes_hop_from_both_ends() -> None:
|
def test_aac_remove_padding_removes_hop_from_both_ends() -> None:
|
||||||
"""
|
|
||||||
Contract test for aac_remove_padding():
|
|
||||||
- Removes 'hop' samples from start and end.
|
|
||||||
"""
|
|
||||||
hop = 1024
|
hop = 1024
|
||||||
n = 10000
|
n = 10000
|
||||||
|
|
||||||
@ -82,9 +104,6 @@ def test_aac_remove_padding_removes_hop_from_both_ends() -> None:
|
|||||||
|
|
||||||
|
|
||||||
def test_aac_remove_padding_errors_on_too_short_input() -> None:
|
def test_aac_remove_padding_errors_on_too_short_input() -> None:
|
||||||
"""
|
|
||||||
aac_remove_padding must raise if y_pad is shorter than 2*hop.
|
|
||||||
"""
|
|
||||||
hop = 1024
|
hop = 1024
|
||||||
y_pad: StereoSignal = np.zeros((2 * hop - 1, 2), dtype=np.float64)
|
y_pad: StereoSignal = np.zeros((2 * hop - 1, 2), dtype=np.float64)
|
||||||
|
|
||||||
@ -95,12 +114,10 @@ def test_aac_remove_padding_errors_on_too_short_input() -> None:
|
|||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Level 1 tests
|
# Level 1 tests
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
|
||||||
"""
|
@pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True)
|
||||||
Module-level contract test:
|
def test_aac_coder_seq_schema_and_shapes(mk_random_stereo_wav: Path) -> None:
|
||||||
Ensure aac_seq_1 follows the expected schema and per-frame shapes.
|
aac_seq: AACSeq1 = aac_coder_1(mk_random_stereo_wav)
|
||||||
"""
|
|
||||||
aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav)
|
|
||||||
|
|
||||||
assert isinstance(aac_seq, list)
|
assert isinstance(aac_seq, list)
|
||||||
assert len(aac_seq) > 0
|
assert len(aac_seq) > 0
|
||||||
@ -108,7 +125,6 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
|||||||
for fr in aac_seq:
|
for fr in aac_seq:
|
||||||
assert isinstance(fr, dict)
|
assert isinstance(fr, dict)
|
||||||
|
|
||||||
# Required keys
|
|
||||||
assert "frame_type" in fr
|
assert "frame_type" in fr
|
||||||
assert "win_type" in fr
|
assert "win_type" in fr
|
||||||
assert "chl" in fr
|
assert "chl" in fr
|
||||||
@ -136,41 +152,32 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
|||||||
assert chr_f.shape == (1024, 1)
|
assert chr_f.shape == (1024, 1)
|
||||||
|
|
||||||
|
|
||||||
def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
|
@pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True)
|
||||||
"""
|
def test_end_to_end_aac_coder_decoder_high_snr(mk_random_stereo_wav: Path, tmp_path: Path) -> None:
|
||||||
End-to-end test:
|
x_ref, fs = sf.read(str(mk_random_stereo_wav), always_2d=True)
|
||||||
Encode + decode and check SNR is very high (numerical-noise only).
|
|
||||||
|
|
||||||
The threshold is intentionally loose to avoid fragility across platforms/BLAS.
|
|
||||||
"""
|
|
||||||
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
|
|
||||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||||
assert int(fs) == 48000
|
assert int(fs) == 48000
|
||||||
|
|
||||||
out_wav = tmp_path / "out.wav"
|
out_wav = tmp_path / "out.wav"
|
||||||
|
|
||||||
aac_seq = aac_coder_1(tmp_stereo_wav)
|
aac_seq = aac_coder_1(mk_random_stereo_wav)
|
||||||
x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav)
|
x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav)
|
||||||
|
|
||||||
# Basic sanity: output file exists and is readable
|
|
||||||
assert out_wav.exists()
|
assert out_wav.exists()
|
||||||
x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True)
|
_, fs_hat = sf.read(str(out_wav), always_2d=True)
|
||||||
assert int(fs_hat) == 48000
|
assert int(fs_hat) == 48000
|
||||||
|
|
||||||
# SNR against returned array (file should match closely, but we do not require it here).
|
|
||||||
snr = snr_db(x_ref, x_hat)
|
snr = snr_db(x_ref, x_hat)
|
||||||
assert snr > 80.0
|
assert snr > 80.0
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Level 2 tests (new)
|
# Level 2 tests
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
@pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True)
|
||||||
"""
|
def test_aac_coder_2_seq_schema_and_shapes(mk_random_stereo_wav: Path) -> None:
|
||||||
Module-level contract test (Level 2):
|
aac_seq: AACSeq2 = aac_coder_2(mk_random_stereo_wav)
|
||||||
Ensure aac_seq_2 follows the expected schema and per-frame shapes, including tns_coeffs.
|
|
||||||
"""
|
|
||||||
aac_seq: AACSeq2 = aac_coder_2(tmp_stereo_wav)
|
|
||||||
|
|
||||||
assert isinstance(aac_seq, list)
|
assert isinstance(aac_seq, list)
|
||||||
assert len(aac_seq) > 0
|
assert len(aac_seq) > 0
|
||||||
@ -194,27 +201,20 @@ def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None:
|
|||||||
|
|
||||||
if frame_type == "ESH":
|
if frame_type == "ESH":
|
||||||
assert frame_f.shape == (128, 8)
|
assert frame_f.shape == (128, 8)
|
||||||
assert coeffs.shape[0] == 4
|
assert coeffs.shape == (4, 8)
|
||||||
assert coeffs.shape[1] == 8
|
|
||||||
else:
|
else:
|
||||||
assert frame_f.shape == (1024, 1)
|
assert frame_f.shape == (1024, 1)
|
||||||
assert coeffs.shape == (4, 1)
|
assert coeffs.shape == (4, 1)
|
||||||
|
|
||||||
|
|
||||||
def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None:
|
@pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True)
|
||||||
"""
|
def test_end_to_end_level_2_high_snr(mk_random_stereo_wav: Path, tmp_path: Path) -> None:
|
||||||
End-to-end test (Level 2):
|
x_ref, fs = sf.read(str(mk_random_stereo_wav), always_2d=True)
|
||||||
Encode + decode and check SNR remains very high.
|
|
||||||
|
|
||||||
Level 2 is still floating-point (TNS is reversible), so reconstruction
|
|
||||||
should remain numerical-noise only.
|
|
||||||
"""
|
|
||||||
x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True)
|
|
||||||
x_ref = np.asarray(x_ref, dtype=np.float64)
|
x_ref = np.asarray(x_ref, dtype=np.float64)
|
||||||
assert int(fs) == 48000
|
assert int(fs) == 48000
|
||||||
|
|
||||||
out_wav = tmp_path / "out_l2.wav"
|
out_wav = tmp_path / "out_l2.wav"
|
||||||
aac_seq = aac_coder_2(tmp_stereo_wav)
|
aac_seq = aac_coder_2(mk_random_stereo_wav)
|
||||||
x_hat: StereoSignal = aac_decoder_2(aac_seq, out_wav)
|
x_hat: StereoSignal = aac_decoder_2(aac_seq, out_wav)
|
||||||
|
|
||||||
assert out_wav.exists()
|
assert out_wav.exists()
|
||||||
@ -222,30 +222,14 @@ def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> No
|
|||||||
assert int(fs_hat) == 48000
|
assert int(fs_hat) == 48000
|
||||||
|
|
||||||
snr = snr_db(x_ref, x_hat)
|
snr = snr_db(x_ref, x_hat)
|
||||||
assert snr > 80
|
assert snr > 80.0
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Level 3 tests (Quantizer + Huffman)
|
# Level 3 tests (Quantizer + Huffman)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def wav_in_path() -> Path:
|
|
||||||
"""
|
|
||||||
Input WAV used for end-to-end tests.
|
|
||||||
|
|
||||||
This should point to the provided test audio under material/.
|
|
||||||
Adjust this path if your project layout differs.
|
|
||||||
"""
|
|
||||||
# Typical layout in this project:
|
|
||||||
# source/material/LicorDeCalandraca.wav
|
|
||||||
return Path(__file__).resolve().parents[2] / "material" / "LicorDeCalandraca.wav"
|
|
||||||
|
|
||||||
|
|
||||||
def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None:
|
def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None:
|
||||||
"""
|
|
||||||
Validate Level-3 per-frame schema (keys + basic types only).
|
|
||||||
"""
|
|
||||||
assert "frame_type" in frame
|
assert "frame_type" in frame
|
||||||
assert "win_type" in frame
|
assert "win_type" in frame
|
||||||
assert "chl" in frame
|
assert "chl" in frame
|
||||||
@ -264,37 +248,18 @@ def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None:
|
|||||||
assert isinstance(ch["stream"], str)
|
assert isinstance(ch["stream"], str)
|
||||||
assert isinstance(ch["codebook"], int)
|
assert isinstance(ch["codebook"], int)
|
||||||
|
|
||||||
# Arrays: only check they are numpy arrays with expected dtype categories.
|
|
||||||
assert isinstance(ch["tns_coeffs"], np.ndarray)
|
assert isinstance(ch["tns_coeffs"], np.ndarray)
|
||||||
assert isinstance(ch["T"], np.ndarray)
|
assert isinstance(ch["T"], np.ndarray)
|
||||||
|
|
||||||
# Global gain: long frames may be scalar float, ESH may be ndarray
|
|
||||||
assert np.isscalar(ch["G"]) or isinstance(ch["G"], np.ndarray)
|
assert np.isscalar(ch["G"]) or isinstance(ch["G"], np.ndarray)
|
||||||
|
|
||||||
|
|
||||||
def test_aac_coder_3_seq_schema_and_shapes(wav_in_path: Path, tmp_path: Path) -> None:
|
@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True)
|
||||||
|
def test_aac_coder_3_seq_schema_and_shapes(mk_actual_stereo_wav: Path) -> None:
|
||||||
"""
|
"""
|
||||||
Contract test:
|
Uses a short WAV excerpt produced by mk_actual_stereo_wav.
|
||||||
- aac_coder_3 returns AACSeq3
|
0.11s is enough for a few frames at 48 kHz.
|
||||||
- Per-frame keys exist and types are consistent
|
|
||||||
- Basic shape expectations hold for ESH vs non-ESH cases
|
|
||||||
|
|
||||||
Note:
|
|
||||||
This test uses a short excerpt (a few frames) to keep runtime bounded.
|
|
||||||
"""
|
"""
|
||||||
# Use only a few frames to avoid long runtimes in the quantizer loop.
|
aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav)
|
||||||
hop = 1024
|
|
||||||
win = 2048
|
|
||||||
n_frames = 4
|
|
||||||
n_samples = win + (n_frames - 1) * hop
|
|
||||||
|
|
||||||
x, fs = aac_read_wav_stereo_48k(wav_in_path)
|
|
||||||
x_short = x[:n_samples, :]
|
|
||||||
|
|
||||||
short_wav = tmp_path / "input_short.wav"
|
|
||||||
sf.write(str(short_wav), x_short, fs)
|
|
||||||
|
|
||||||
aac_seq_3: AACSeq3 = aac_coder_3(short_wav)
|
|
||||||
|
|
||||||
assert isinstance(aac_seq_3, list)
|
assert isinstance(aac_seq_3, list)
|
||||||
assert len(aac_seq_3) > 0
|
assert len(aac_seq_3) > 0
|
||||||
@ -329,46 +294,21 @@ def test_aac_coder_3_seq_schema_and_shapes(wav_in_path: Path, tmp_path: Path) ->
|
|||||||
else:
|
else:
|
||||||
assert np.isscalar(G)
|
assert np.isscalar(G)
|
||||||
|
|
||||||
assert isinstance(ch["sfc"], str)
|
|
||||||
assert isinstance(ch["stream"], str)
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True)
|
||||||
|
def test_end_to_end_level_3_high_snr(mk_actual_stereo_wav: Path, tmp_path: Path) -> None:
|
||||||
def test_end_to_end_level_3_high_snr(wav_in_path: Path, tmp_path: Path) -> None:
|
|
||||||
"""
|
"""
|
||||||
End-to-end test for Level 3 (Quantizer + Huffman):
|
End-to-end Level 3 using a small WAV excerpt produced by mk_actual_stereo_wav.
|
||||||
|
|
||||||
coder_3 -> decoder_3 should reconstruct a waveform with acceptable SNR.
|
|
||||||
|
|
||||||
Notes
|
|
||||||
-----
|
|
||||||
- Level 3 includes quantization, so SNR is expected to be lower than Level 1/2.
|
|
||||||
- We intentionally use a short excerpt (few frames) to keep runtime bounded,
|
|
||||||
since the reference quantizer implementation is computationally expensive.
|
|
||||||
"""
|
"""
|
||||||
# Use only a few frames to avoid long runtimes.
|
x_ref, fs = aac_read_wav_stereo_48k(mk_actual_stereo_wav)
|
||||||
hop = 1024
|
assert int(fs) == 48000
|
||||||
win = 2048
|
|
||||||
n_frames = 4
|
|
||||||
n_samples = win + (n_frames - 1) * hop
|
|
||||||
|
|
||||||
x_ref, fs = aac_read_wav_stereo_48k(wav_in_path)
|
|
||||||
x_short = x_ref[:n_samples, :]
|
|
||||||
|
|
||||||
short_wav = tmp_path / "input_short_l3.wav"
|
|
||||||
sf.write(str(short_wav), x_short, fs)
|
|
||||||
|
|
||||||
out_wav = tmp_path / "decoded_level3.wav"
|
out_wav = tmp_path / "decoded_level3.wav"
|
||||||
|
|
||||||
aac_seq_3: AACSeq3 = aac_coder_3(short_wav)
|
aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav)
|
||||||
y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav)
|
y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav)
|
||||||
|
|
||||||
# Align lengths defensively (padding removal may differ by a few samples)
|
n = min(x_ref.shape[0], y_hat.shape[0])
|
||||||
n = min(x_short.shape[0], y_hat.shape[0])
|
s = snr_db(x_ref[:n, :], y_hat[:n, :])
|
||||||
x2 = x_short[:n, :]
|
print(f"SNR={s}")
|
||||||
y2 = y_hat[:n, :]
|
assert s > 2.0
|
||||||
|
|
||||||
s = snr_db(x2, y2)
|
|
||||||
|
|
||||||
# Conservative threshold: Level 3 is lossy by design.
|
|
||||||
assert s > 10.0
|
|
||||||
|
|||||||
@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) ->
|
|||||||
# Level 1 encoder
|
# Level 1 encoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
def aac_coder_1(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> AACSeq1:
|
||||||
"""
|
"""
|
||||||
Level-1 AAC encoder.
|
Level-1 AAC encoder.
|
||||||
|
|
||||||
@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
filename_in : Union[str, Path]
|
filename_in : Union[str, Path]
|
||||||
Input WAV filename.
|
Input WAV filename.
|
||||||
Assumption: stereo audio, sampling rate 48 kHz.
|
Assumption: stereo audio, sampling rate 48 kHz.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
aac_seq: AACSeq1 = []
|
aac_seq: AACSeq1 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
win_type: WinType = WIN_TYPE
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
|
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
||||||
|
|
||||||
aac_seq.append({
|
aac_seq.append({
|
||||||
"frame_type": frame_type,
|
"frame_type": frame_type,
|
||||||
"win_type": win_type,
|
"win_type": WIN_TYPE,
|
||||||
"chl": {"frame_F": chl_f},
|
"chl": {"frame_F": chl_f},
|
||||||
"chr": {"frame_F": chr_f},
|
"chr": {"frame_F": chr_f},
|
||||||
})
|
})
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
def aac_coder_2(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> AACSeq2:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC encoder (Level 1 + TNS).
|
Level-2 AAC encoder (Level 1 + TNS).
|
||||||
|
|
||||||
@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
----------
|
----------
|
||||||
filename_in : Union[str, Path]
|
filename_in : Union[str, Path]
|
||||||
Input WAV filename (stereo, 48 kHz).
|
Input WAV filename (stereo, 48 kHz).
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
aac_seq: AACSeq2 = []
|
aac_seq: AACSeq2 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
# Level 1 analysis (packed stereo container)
|
# Level 1 analysis (packed stereo container)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
|
|
||||||
# Unpack to per-channel (as you already do in Level 1)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
if frame_type == "ESH":
|
|
||||||
chl_f = np.empty((128, 8), dtype=np.float64)
|
|
||||||
chr_f = np.empty((128, 8), dtype=np.float64)
|
|
||||||
for j in range(8):
|
|
||||||
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
|
|
||||||
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
|
|
||||||
else:
|
|
||||||
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
|
|
||||||
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
|
|
||||||
|
|
||||||
# Level 2: apply TNS per channel
|
# Level 2: apply TNS per channel
|
||||||
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||||
@ -370,8 +378,12 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
def aac_coder_3(
|
def aac_coder_3(
|
||||||
filename_in: Union[str, Path],
|
filename_in: Union[str, Path],
|
||||||
filename_aac_coded: Union[str, Path] | None = None,
|
filename_aac_coded: Union[str, Path] | None = None,
|
||||||
|
verbose: bool = False,
|
||||||
) -> AACSeq3:
|
) -> AACSeq3:
|
||||||
"""
|
"""
|
||||||
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
||||||
@ -389,6 +402,8 @@ def aac_coder_3(
|
|||||||
Input WAV filename (stereo, 48 kHz).
|
Input WAV filename (stereo, 48 kHz).
|
||||||
filename_aac_coded : Union[str, Path] | None
|
filename_aac_coded : Union[str, Path] | None
|
||||||
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -416,15 +431,14 @@ def aac_coder_3(
|
|||||||
aac_seq: AACSeq3 = []
|
aac_seq: AACSeq3 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
# Pin win_type to the WinType literal for type checkers.
|
|
||||||
win_type: WinType = WIN_TYPE
|
|
||||||
|
|
||||||
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
||||||
prev1_L = np.zeros((2048,), dtype=np.float64)
|
prev1_L = np.zeros((2048,), dtype=np.float64)
|
||||||
prev2_L = np.zeros((2048,), dtype=np.float64)
|
prev2_L = np.zeros((2048,), dtype=np.float64)
|
||||||
prev1_R = np.zeros((2048,), dtype=np.float64)
|
prev1_R = np.zeros((2048,), dtype=np.float64)
|
||||||
prev2_R = np.zeros((2048,), dtype=np.float64)
|
prev2_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -440,7 +454,7 @@ def aac_coder_3(
|
|||||||
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
# Analysis filterbank (stereo packed)
|
# Analysis filterbank (stereo packed)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
|
|
||||||
# TNS per channel
|
# TNS per channel
|
||||||
@ -474,32 +488,35 @@ def aac_coder_3(
|
|||||||
# Codebook 11:
|
# Codebook 11:
|
||||||
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
||||||
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
||||||
sf_cb = 11
|
# sf_cb = 11
|
||||||
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
# sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
||||||
|
#
|
||||||
|
# sfc_L_dpcm = np.clip(
|
||||||
|
# sfc_L_dpcm,
|
||||||
|
# -sf_max_abs,
|
||||||
|
# sf_max_abs,
|
||||||
|
# ).astype(np.int64, copy=False)
|
||||||
|
#
|
||||||
|
# sfc_R_dpcm = np.clip(
|
||||||
|
# sfc_R_dpcm,
|
||||||
|
# -sf_max_abs,
|
||||||
|
# sf_max_abs,
|
||||||
|
# ).astype(np.int64, copy=False)
|
||||||
|
|
||||||
sfc_L_dpcm = np.clip(
|
sfc_L_stream, cb_sfc_L = aac_encode_huff(
|
||||||
sfc_L_dpcm,
|
|
||||||
-sf_max_abs,
|
|
||||||
sf_max_abs,
|
|
||||||
).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_R_dpcm = np.clip(
|
|
||||||
sfc_R_dpcm,
|
|
||||||
-sf_max_abs,
|
|
||||||
sf_max_abs,
|
|
||||||
).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_L_stream, _ = aac_encode_huff(
|
|
||||||
sfc_L_dpcm.reshape(-1, order="F"),
|
sfc_L_dpcm.reshape(-1, order="F"),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
force_codebook=sf_cb,
|
# force_codebook=11,
|
||||||
)
|
)
|
||||||
sfc_R_stream, _ = aac_encode_huff(
|
sfc_R_stream, cb_sfc_R = aac_encode_huff(
|
||||||
sfc_R_dpcm.reshape(-1, order="F"),
|
sfc_R_dpcm.reshape(-1, order="F"),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
force_codebook=sf_cb,
|
# force_codebook=11,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
||||||
|
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
|
||||||
|
|
||||||
mdct_L_stream, cb_L = aac_encode_huff(
|
mdct_L_stream, cb_L = aac_encode_huff(
|
||||||
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
@ -512,7 +529,7 @@ def aac_coder_3(
|
|||||||
# Typed dict construction helps static analyzers validate the schema.
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
frame_out: AACSeq3Frame = {
|
frame_out: AACSeq3Frame = {
|
||||||
"frame_type": frame_type,
|
"frame_type": frame_type,
|
||||||
"win_type": win_type,
|
"win_type": WIN_TYPE,
|
||||||
"chl": {
|
"chl": {
|
||||||
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
||||||
"T": np.asarray(T_L, dtype=np.float64),
|
"T": np.asarray(T_L, dtype=np.float64),
|
||||||
@ -539,6 +556,11 @@ def aac_coder_3(
|
|||||||
prev1_R = frame_R
|
prev1_R = frame_R
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
# Optional: store to .mat for the assignment wrapper
|
# Optional: store to .mat for the assignment wrapper
|
||||||
if filename_aac_coded is not None:
|
if filename_aac_coded is not None:
|
||||||
@ -548,6 +570,5 @@ def aac_coder_3(
|
|||||||
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
||||||
do_compression=True,
|
do_compression=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|||||||
@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
|
|||||||
# Level 1 decoder
|
# Level 1 decoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_1(
|
||||||
|
aac_seq_1: AACSeq1,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-1 AAC decoder (inverse of aac_coder_1()).
|
Level-1 AAC decoder (inverse of aac_coder_1()).
|
||||||
|
|
||||||
@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_1().
|
Encoded sequence as produced by aac_coder_1().
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename. Assumption: 48 kHz, stereo.
|
Output WAV filename. Assumption: 48 kHz, stereo.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
for i, fr in enumerate(aac_seq_1):
|
for i, fr in enumerate(aac_seq_1):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start:start + win, :] += frame_t_hat
|
y_pad[start:start + win, :] += frame_t_hat
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
|
y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
# Level 1 assumption: 48 kHz output.
|
# Level 1 assumption: 48 kHz output.
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
|
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
# Level 2 decoder
|
# Level 2 decoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_2(
|
||||||
|
aac_seq_2: AACSeq2,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC decoder (inverse of aac_coder_2).
|
Level-2 AAC decoder (inverse of aac_coder_2).
|
||||||
|
|
||||||
@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_2().
|
Encoded sequence as produced by aac_coder_2().
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename.
|
Output WAV filename.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
for i, fr in enumerate(aac_seq_2):
|
for i, fr in enumerate(aac_seq_2):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start : start + win, :] += frame_t_hat
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y = aac_remove_padding(y_pad, hop=hop)
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_3(
|
||||||
|
aac_seq_3: AACSeq3,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False,
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-3 AAC decoder (inverse of aac_coder_3).
|
Level-3 AAC decoder (inverse of aac_coder_3).
|
||||||
|
|
||||||
@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_3.
|
Encoded sequence as produced by aac_coder_3.
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename.
|
Output WAV filename.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
|
|
||||||
for i, fr in enumerate(aac_seq_3):
|
for i, fr in enumerate(aac_seq_3):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start : start + win, :] += frame_t_hat
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y = aac_remove_padding(y_pad, hop=hop)
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|||||||
@ -20,6 +20,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from tabnanny import verbose
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
@ -52,7 +53,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
AACSeq1
|
AACSeq1
|
||||||
List of encoded frames (Level 1 schema).
|
List of encoded frames (Level 1 schema).
|
||||||
"""
|
"""
|
||||||
return core_aac_coder_1(filename_in)
|
return core_aac_coder_1(filename_in, verbose=True)
|
||||||
|
|
||||||
|
|
||||||
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
|
||||||
@ -73,7 +74,7 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
StereoSignal
|
StereoSignal
|
||||||
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||||
"""
|
"""
|
||||||
return core_aac_decoder_1(aac_seq_1, filename_out)
|
return core_aac_decoder_1(aac_seq_1, filename_out, verbose=True)
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|||||||
@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) ->
|
|||||||
# Level 1 encoder
|
# Level 1 encoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
def aac_coder_1(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> AACSeq1:
|
||||||
"""
|
"""
|
||||||
Level-1 AAC encoder.
|
Level-1 AAC encoder.
|
||||||
|
|
||||||
@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
filename_in : Union[str, Path]
|
filename_in : Union[str, Path]
|
||||||
Input WAV filename.
|
Input WAV filename.
|
||||||
Assumption: stereo audio, sampling rate 48 kHz.
|
Assumption: stereo audio, sampling rate 48 kHz.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
aac_seq: AACSeq1 = []
|
aac_seq: AACSeq1 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
win_type: WinType = WIN_TYPE
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
|
|||||||
next_t = np.vstack([next_t, tail])
|
next_t = np.vstack([next_t, tail])
|
||||||
|
|
||||||
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
frame_f = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
|
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)
|
||||||
|
|
||||||
aac_seq.append({
|
aac_seq.append({
|
||||||
"frame_type": frame_type,
|
"frame_type": frame_type,
|
||||||
"win_type": win_type,
|
"win_type": WIN_TYPE,
|
||||||
"chl": {"frame_F": chl_f},
|
"chl": {"frame_F": chl_f},
|
||||||
"chr": {"frame_F": chr_f},
|
"chr": {"frame_F": chr_f},
|
||||||
})
|
})
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|
||||||
def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
def aac_coder_2(
|
||||||
|
filename_in: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> AACSeq2:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC encoder (Level 1 + TNS).
|
Level-2 AAC encoder (Level 1 + TNS).
|
||||||
|
|
||||||
@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
----------
|
----------
|
||||||
filename_in : Union[str, Path]
|
filename_in : Union[str, Path]
|
||||||
Input WAV filename (stereo, 48 kHz).
|
Input WAV filename (stereo, 48 kHz).
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
aac_seq: AACSeq2 = []
|
aac_seq: AACSeq2 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
# Level 1 analysis (packed stereo container)
|
# Level 1 analysis (packed stereo container)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
|
|
||||||
# Unpack to per-channel (as you already do in Level 1)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
if frame_type == "ESH":
|
|
||||||
chl_f = np.empty((128, 8), dtype=np.float64)
|
|
||||||
chr_f = np.empty((128, 8), dtype=np.float64)
|
|
||||||
for j in range(8):
|
|
||||||
chl_f[:, j] = frame_f_stereo[:, 2 * j + 0]
|
|
||||||
chr_f[:, j] = frame_f_stereo[:, 2 * j + 1]
|
|
||||||
else:
|
|
||||||
chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False)
|
|
||||||
chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False)
|
|
||||||
|
|
||||||
# Level 2: apply TNS per channel
|
# Level 2: apply TNS per channel
|
||||||
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
|
||||||
@ -370,8 +378,12 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
"chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
def aac_coder_3(
|
def aac_coder_3(
|
||||||
filename_in: Union[str, Path],
|
filename_in: Union[str, Path],
|
||||||
filename_aac_coded: Union[str, Path] | None = None,
|
filename_aac_coded: Union[str, Path] | None = None,
|
||||||
|
verbose: bool = False,
|
||||||
) -> AACSeq3:
|
) -> AACSeq3:
|
||||||
"""
|
"""
|
||||||
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).
|
||||||
@ -389,6 +402,8 @@ def aac_coder_3(
|
|||||||
Input WAV filename (stereo, 48 kHz).
|
Input WAV filename (stereo, 48 kHz).
|
||||||
filename_aac_coded : Union[str, Path] | None
|
filename_aac_coded : Union[str, Path] | None
|
||||||
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
Optional .mat filename to store aac_seq_3 (assignment convenience).
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -416,15 +431,14 @@ def aac_coder_3(
|
|||||||
aac_seq: AACSeq3 = []
|
aac_seq: AACSeq3 = []
|
||||||
prev_frame_type: FrameType = "OLS"
|
prev_frame_type: FrameType = "OLS"
|
||||||
|
|
||||||
# Pin win_type to the WinType literal for type checkers.
|
|
||||||
win_type: WinType = WIN_TYPE
|
|
||||||
|
|
||||||
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
# Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
|
||||||
prev1_L = np.zeros((2048,), dtype=np.float64)
|
prev1_L = np.zeros((2048,), dtype=np.float64)
|
||||||
prev2_L = np.zeros((2048,), dtype=np.float64)
|
prev2_L = np.zeros((2048,), dtype=np.float64)
|
||||||
prev1_R = np.zeros((2048,), dtype=np.float64)
|
prev1_R = np.zeros((2048,), dtype=np.float64)
|
||||||
prev2_R = np.zeros((2048,), dtype=np.float64)
|
prev2_R = np.zeros((2048,), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Encoding ", end="", flush=True)
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
start = i * hop
|
start = i * hop
|
||||||
|
|
||||||
@ -440,7 +454,7 @@ def aac_coder_3(
|
|||||||
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
|
||||||
|
|
||||||
# Analysis filterbank (stereo packed)
|
# Analysis filterbank (stereo packed)
|
||||||
frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type)
|
frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
|
||||||
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)
|
||||||
|
|
||||||
# TNS per channel
|
# TNS per channel
|
||||||
@ -474,32 +488,35 @@ def aac_coder_3(
|
|||||||
# Codebook 11:
|
# Codebook 11:
|
||||||
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
||||||
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
||||||
sf_cb = 11
|
# sf_cb = 11
|
||||||
sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
# sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
||||||
|
#
|
||||||
|
# sfc_L_dpcm = np.clip(
|
||||||
|
# sfc_L_dpcm,
|
||||||
|
# -sf_max_abs,
|
||||||
|
# sf_max_abs,
|
||||||
|
# ).astype(np.int64, copy=False)
|
||||||
|
#
|
||||||
|
# sfc_R_dpcm = np.clip(
|
||||||
|
# sfc_R_dpcm,
|
||||||
|
# -sf_max_abs,
|
||||||
|
# sf_max_abs,
|
||||||
|
# ).astype(np.int64, copy=False)
|
||||||
|
|
||||||
sfc_L_dpcm = np.clip(
|
sfc_L_stream, cb_sfc_L = aac_encode_huff(
|
||||||
sfc_L_dpcm,
|
|
||||||
-sf_max_abs,
|
|
||||||
sf_max_abs,
|
|
||||||
).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_R_dpcm = np.clip(
|
|
||||||
sfc_R_dpcm,
|
|
||||||
-sf_max_abs,
|
|
||||||
sf_max_abs,
|
|
||||||
).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_L_stream, _ = aac_encode_huff(
|
|
||||||
sfc_L_dpcm.reshape(-1, order="F"),
|
sfc_L_dpcm.reshape(-1, order="F"),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
force_codebook=sf_cb,
|
# force_codebook=11,
|
||||||
)
|
)
|
||||||
sfc_R_stream, _ = aac_encode_huff(
|
sfc_R_stream, cb_sfc_R = aac_encode_huff(
|
||||||
sfc_R_dpcm.reshape(-1, order="F"),
|
sfc_R_dpcm.reshape(-1, order="F"),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
force_codebook=sf_cb,
|
# force_codebook=11,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
||||||
|
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
|
||||||
|
|
||||||
mdct_L_stream, cb_L = aac_encode_huff(
|
mdct_L_stream, cb_L = aac_encode_huff(
|
||||||
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
||||||
huff_LUT_list,
|
huff_LUT_list,
|
||||||
@ -512,7 +529,7 @@ def aac_coder_3(
|
|||||||
# Typed dict construction helps static analyzers validate the schema.
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
frame_out: AACSeq3Frame = {
|
frame_out: AACSeq3Frame = {
|
||||||
"frame_type": frame_type,
|
"frame_type": frame_type,
|
||||||
"win_type": win_type,
|
"win_type": WIN_TYPE,
|
||||||
"chl": {
|
"chl": {
|
||||||
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
"tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
|
||||||
"T": np.asarray(T_L, dtype=np.float64),
|
"T": np.asarray(T_L, dtype=np.float64),
|
||||||
@ -539,6 +556,11 @@ def aac_coder_3(
|
|||||||
prev1_R = frame_R
|
prev1_R = frame_R
|
||||||
|
|
||||||
prev_frame_type = frame_type
|
prev_frame_type = frame_type
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
# Optional: store to .mat for the assignment wrapper
|
# Optional: store to .mat for the assignment wrapper
|
||||||
if filename_aac_coded is not None:
|
if filename_aac_coded is not None:
|
||||||
@ -548,6 +570,5 @@ def aac_coder_3(
|
|||||||
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
{"aac_seq_3": np.array(aac_seq, dtype=object)},
|
||||||
do_compression=True,
|
do_compression=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
return aac_seq
|
return aac_seq
|
||||||
|
|
||||||
|
|||||||
@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
|
|||||||
# Level 1 decoder
|
# Level 1 decoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_1(
|
||||||
|
aac_seq_1: AACSeq1,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-1 AAC decoder (inverse of aac_coder_1()).
|
Level-1 AAC decoder (inverse of aac_coder_1()).
|
||||||
|
|
||||||
@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_1().
|
Encoded sequence as produced by aac_coder_1().
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename. Assumption: 48 kHz, stereo.
|
Output WAV filename. Assumption: 48 kHz, stereo.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
for i, fr in enumerate(aac_seq_1):
|
for i, fr in enumerate(aac_seq_1):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start:start + win, :] += frame_t_hat
|
y_pad[start:start + win, :] += frame_t_hat
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
|
y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
# Level 1 assumption: 48 kHz output.
|
# Level 1 assumption: 48 kHz output.
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
|
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS
|
|||||||
# Level 2 decoder
|
# Level 2 decoder
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_2(
|
||||||
|
aac_seq_2: AACSeq2,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-2 AAC decoder (inverse of aac_coder_2).
|
Level-2 AAC decoder (inverse of aac_coder_2).
|
||||||
|
|
||||||
@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_2().
|
Encoded sequence as produced by aac_coder_2().
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename.
|
Output WAV filename.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
for i, fr in enumerate(aac_seq_2):
|
for i, fr in enumerate(aac_seq_2):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
|
|
||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start : start + win, :] += frame_t_hat
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y = aac_remove_padding(y_pad, hop=hop)
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal:
|
def aac_decoder_3(
|
||||||
|
aac_seq_3: AACSeq3,
|
||||||
|
filename_out: Union[str, Path],
|
||||||
|
verbose: bool = False,
|
||||||
|
) -> StereoSignal:
|
||||||
"""
|
"""
|
||||||
Level-3 AAC decoder (inverse of aac_coder_3).
|
Level-3 AAC decoder (inverse of aac_coder_3).
|
||||||
|
|
||||||
@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
Encoded sequence as produced by aac_coder_3.
|
Encoded sequence as produced by aac_coder_3.
|
||||||
filename_out : Union[str, Path]
|
filename_out : Union[str, Path]
|
||||||
Output WAV filename.
|
Output WAV filename.
|
||||||
|
verbose : bool
|
||||||
|
Optional argument to print encoding status
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
n_pad = (K - 1) * hop + win
|
n_pad = (K - 1) * hop + win
|
||||||
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
y_pad = np.zeros((n_pad, 2), dtype=np.float64)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decoding ", end="", flush=True)
|
||||||
|
|
||||||
for i, fr in enumerate(aac_seq_3):
|
for i, fr in enumerate(aac_seq_3):
|
||||||
frame_type: FrameType = fr["frame_type"]
|
frame_type: FrameType = fr["frame_type"]
|
||||||
win_type: WinType = fr["win_type"]
|
win_type: WinType = fr["win_type"]
|
||||||
@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS
|
|||||||
start = i * hop
|
start = i * hop
|
||||||
y_pad[start : start + win, :] += frame_t_hat
|
y_pad[start : start + win, :] += frame_t_hat
|
||||||
|
|
||||||
|
if verbose and (i % (K//20)) == 0:
|
||||||
|
print(".", end="", flush=True)
|
||||||
|
|
||||||
y = aac_remove_padding(y_pad, hop=hop)
|
y = aac_remove_padding(y_pad, hop=hop)
|
||||||
|
if verbose:
|
||||||
|
print(" done")
|
||||||
|
|
||||||
sf.write(str(filename_out), y, 48000)
|
sf.write(str(filename_out), y, 48000)
|
||||||
return y
|
return y
|
||||||
|
|||||||
@ -51,7 +51,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2:
|
|||||||
AACSeq2
|
AACSeq2
|
||||||
List of encoded frames (Level 2 schema).
|
List of encoded frames (Level 2 schema).
|
||||||
"""
|
"""
|
||||||
return core_aac_coder_2(filename_in)
|
return core_aac_coder_2(filename_in, verbose=True)
|
||||||
|
|
||||||
|
|
||||||
def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
|
||||||
@ -72,7 +72,7 @@ def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS
|
|||||||
StereoSignal
|
StereoSignal
|
||||||
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
|
||||||
"""
|
"""
|
||||||
return core_aac_decoder_2(aac_seq_2, filename_out)
|
return core_aac_decoder_2(aac_seq_2, filename_out, verbose=True)
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|||||||
574
source/level_3/core/aac_coder.py
Normal file
574
source/level_3/core/aac_coder.py
Normal file
@ -0,0 +1,574 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - AAC Coder (Core)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Level 1 AAC encoder orchestration.
|
||||||
|
# Keeps the same functional behavior as the original level_1 implementation:
|
||||||
|
# - Reads WAV via soundfile
|
||||||
|
# - Validates stereo and 48 kHz
|
||||||
|
# - Frames into 2048 samples with hop=1024 and zero padding at both ends
|
||||||
|
# - SSC decision uses next-frame attack detection
|
||||||
|
# - Filterbank analysis (MDCT)
|
||||||
|
# - Stores per-channel spectra in AACSeq1 schema:
|
||||||
|
# * ESH: (128, 8)
|
||||||
|
# * else: (1024, 1)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import soundfile as sf
|
||||||
|
from scipy.io import savemat
|
||||||
|
|
||||||
|
from core.aac_configuration import WIN_TYPE
|
||||||
|
from core.aac_filterbank import aac_filter_bank
|
||||||
|
from core.aac_ssc import aac_ssc
|
||||||
|
from core.aac_tns import aac_tns
|
||||||
|
from core.aac_psycho import aac_psycho
|
||||||
|
from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py
|
||||||
|
from core.aac_huffman import aac_encode_huff
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
|
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helpers for thresholds (T(b))
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]:
    """
    Return the inclusive (lo, hi) scalefactor-band slices derived from TableB219.

    Parameters
    ----------
    frame_type : FrameType
        Frame type selecting the long or short band table.

    Returns
    -------
    list[tuple[int, int]]
        One (lo, hi) pair of inclusive spectral-line indices per band.
    """
    table, _ = get_table(frame_type)
    lows, highs, _bval, _qthr_db = band_limits(table)

    slices: list[tuple[int, int]] = []
    for lo, hi in zip(lows, highs):
        slices.append((int(lo), int(hi)))
    return slices
|
||||||
|
|
||||||
|
|
||||||
|
def _thresholds_from_smr(
    frame_F_ch: FrameChannelF,
    frame_type: FrameType,
    SMR: FloatArray,
) -> FloatArray:
    """
    Compute psychoacoustic thresholds T(b) = P(b) / SMR(b), where P(b) is the
    spectral energy of band b.

    Shapes:
      - Long frames: returns (NB, 1)
      - ESH: returns (NB, 8)

    Raises
    ------
    ValueError
        If frame_F_ch or SMR does not have the expected shape for frame_type.
    """
    bands = _band_slices_from_table(frame_type)
    NB = len(bands)

    spec = np.asarray(frame_F_ch, dtype=np.float64)
    smr_arr = np.asarray(SMR, dtype=np.float64)

    def _ratio(energy: float, smr: float) -> float:
        # A vanishing SMR would blow up the division; map it to a zero threshold.
        return 0.0 if smr <= 1e-12 else energy / smr

    if frame_type == "ESH":
        if spec.shape != (128, 8):
            raise ValueError("For ESH, frame_F_ch must have shape (128, 8).")
        if smr_arr.shape != (NB, 8):
            raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")

        T = np.zeros((NB, 8), dtype=np.float64)
        for b, (lo, hi) in enumerate(bands):
            for j in range(8):
                energy = float(np.sum(spec[lo : hi + 1, j] ** 2))
                T[b, j] = _ratio(energy, float(smr_arr[b, j]))
        return T

    # Long frame types: accept both flat and column-vector layouts.
    if spec.shape == (1024,):
        spec_v = spec
    elif spec.shape == (1024, 1):
        spec_v = spec[:, 0]
    else:
        raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).")

    if smr_arr.shape == (NB,):
        smr_v = smr_arr
    elif smr_arr.shape == (NB, 1):
        smr_v = smr_arr[:, 0]
    else:
        raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).")

    T = np.zeros((NB, 1), dtype=np.float64)
    for b, (lo, hi) in enumerate(bands):
        energy = float(np.sum(spec_v[lo : hi + 1] ** 2))
        T[b, 0] = _ratio(energy, float(smr_v[b]))
    return T
|
||||||
|
|
||||||
|
def _normalize_global_gain(G: GlobalGain) -> float | FloatArray:
|
||||||
|
"""
|
||||||
|
Normalize GlobalGain to match AACChannelFrameF3["G"] type:
|
||||||
|
- long: return float
|
||||||
|
- ESH: return float64 ndarray of shape (1, 8)
|
||||||
|
"""
|
||||||
|
if np.isscalar(G):
|
||||||
|
return float(G)
|
||||||
|
|
||||||
|
G_arr = np.asarray(G)
|
||||||
|
if G_arr.size == 1:
|
||||||
|
return float(G_arr.reshape(-1)[0])
|
||||||
|
|
||||||
|
return np.asarray(G_arr, dtype=np.float64)
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public helpers (useful for level_x demo wrappers)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]:
    """
    Read a WAV file with soundfile and validate the Level-1 assumptions.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.

    Returns
    -------
    x : StereoSignal (np.ndarray)
        Stereo samples as float64, shape (N, 2).
    fs : int
        Sampling rate (Hz). Must be 48000.

    Raises
    ------
    ValueError
        If the input is not stereo or the sampling rate is not 48 kHz.
    """
    path = Path(filename_in)

    samples, rate = sf.read(str(path), always_2d=True)
    samples = np.asarray(samples, dtype=np.float64)

    if samples.shape[1] != 2:
        raise ValueError("Input must be stereo (2 channels).")
    if int(rate) != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    return samples, int(rate)
|
||||||
|
|
||||||
|
|
||||||
|
def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
    """
    Split the stereo FrameF produced by aac_filter_bank() into per-channel
    arrays following the Level-1 AACSeq1 schema.

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    frame_f : FrameF
        Output of aac_filter_bank():
          - non-ESH: shape (1024, 2)
          - ESH: shape (128, 16) interleaved as [L0 R0 L1 R1 ... L7 R7]

    Returns
    -------
    chl_f : FrameChannelF
        Left-channel coefficients: (128, 8) for ESH, else (1024, 1).
    chr_f : FrameChannelF
        Right-channel coefficients: (128, 8) for ESH, else (1024, 1).

    Raises
    ------
    ValueError
        If frame_f does not have the expected shape for frame_type.
    """
    if frame_type == "ESH":
        if frame_f.shape != (128, 16):
            raise ValueError("For ESH, frame_f must have shape (128, 16).")

        # De-interleave the [L0 R0 L1 R1 ...] packing: even columns are left,
        # odd columns are right.
        chl_f = np.array(frame_f[:, 0::2], dtype=np.float64)
        chr_f = np.array(frame_f[:, 1::2], dtype=np.float64)
        return chl_f, chr_f

    # Non-ESH: keep the (1024, 1) column layout required by the Level-1 schema.
    if frame_f.shape != (1024, 2):
        raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).")

    chl_f = frame_f[:, 0:1].astype(np.float64, copy=False)
    chr_f = frame_f[:, 1:2].astype(np.float64, copy=False)
    return chl_f, chr_f
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 encoder
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_coder_1(
    filename_in: Union[str, Path],
    verbose: bool = False
) -> AACSeq1:
    """
    Level-1 AAC encoder.

    This function preserves the behavior of the original level_1 implementation:
      - Read stereo 48 kHz WAV
      - Pad hop samples at start and hop samples at end
      - Frame with win=2048, hop=1024
      - Use SSC with next-frame lookahead
      - Apply filterbank analysis
      - Store per-channel coefficients using AACSeq1 schema

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename.
        Assumption: stereo audio, sampling rate 48 kHz.
    verbose : bool
        Optional argument to print encoding status.

    Returns
    -------
    AACSeq1
        List of encoded frames (Level 1 schema).
    """
    x, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment assumes 48 kHz

    hop = 1024
    win = 2048

    # Pad at the beginning to support the first overlap region.
    # Tail padding is kept minimal; next-frame is padded on-the-fly when needed.
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])

    # Number of frames such that current frame fits; next frame will be padded if needed.
    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq1 = []
    prev_frame_type: FrameType = "OLS"

    # BUGFIX: the progress condition used `i % (K // 20)`, which raises
    # ZeroDivisionError for short inputs (K < 20). Clamp the step to >= 1.
    progress_step = max(1, K // 20)

    if verbose:
        print("Encoding ", end="", flush=True)
    for i in range(K):
        start = i * hop

        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            # This should not happen due to K definition, but keep it explicit.
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        next_t = x_pad[start + hop:start + hop + win, :]

        # Ensure next_t is always (2048, 2) by zero-padding at the tail.
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)
        frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE)

        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f)

        aac_seq.append({
            "frame_type": frame_type,
            "win_type": WIN_TYPE,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        })

        prev_frame_type = frame_type
        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)

    if verbose:
        print(" done")

    return aac_seq
|
||||||
|
|
||||||
|
|
||||||
|
def aac_coder_2(
    filename_in: Union[str, Path],
    verbose: bool = False
) -> AACSeq2:
    """
    Level-2 AAC encoder (Level 1 + TNS).

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    verbose : bool
        Optional argument to print encoding status.

    Returns
    -------
    AACSeq2
        Encoded AAC sequence (Level 2 payload schema).
        For each frame i:
          - "frame_type": FrameType
          - "win_type": WinType
          - "chl"/"chr":
            - "frame_F": FrameChannelF (after TNS)
            - "tns_coeffs": TnsCoeffs
    """
    filename_in = Path(filename_in)

    x, _ = aac_read_wav_stereo_48k(filename_in)
    # The assignment assumes 48 kHz

    hop = 1024
    win = 2048

    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])

    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq2 = []
    prev_frame_type: FrameType = "OLS"

    # BUGFIX: the progress condition used `i % (K // 20)`, which raises
    # ZeroDivisionError for short inputs (K < 20). Clamp the step to >= 1.
    progress_step = max(1, K // 20)

    if verbose:
        print("Encoding ", end="", flush=True)
    for i in range(K):
        start = i * hop

        frame_t: FrameT = x_pad[start : start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        next_t = x_pad[start + hop : start + hop + win, :]
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)

        # Level 1 analysis (packed stereo container)
        frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)

        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)

        # Level 2: apply TNS per channel
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)

        aac_seq.append(
            {
                "frame_type": frame_type,
                "win_type": WIN_TYPE,
                "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs},
                "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs},
            }
        )
        prev_frame_type = frame_type
        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)

    if verbose:
        print(" done")

    return aac_seq
|
||||||
|
|
||||||
|
|
||||||
|
def aac_coder_3(
    filename_in: Union[str, Path],
    filename_aac_coded: Union[str, Path] | None = None,
    verbose: bool = False,
) -> AACSeq3:
    """
    Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman).

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    filename_aac_coded : Union[str, Path] | None
        Optional .mat filename to store aac_seq_3 (assignment convenience).
    verbose : bool
        Optional argument to print encoding status.

    Returns
    -------
    AACSeq3
        Encoded AAC sequence (Level 3 payload schema).

    Raises
    ------
    ValueError
        If the input is too short to produce at least one frame.
    """
    filename_in = Path(filename_in)

    x, _ = aac_read_wav_stereo_48k(filename_in)

    hop = 1024
    win = 2048

    # Pad one hop of silence on both sides so the first and last samples are
    # fully covered by the 50%-overlap analysis windows.
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])

    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    # Load Huffman LUTs once; reused for every frame.
    huff_LUT_list = load_LUT()

    aac_seq: AACSeq3 = []
    prev_frame_type: FrameType = "OLS"

    # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames.
    prev1_L = np.zeros((2048,), dtype=np.float64)
    prev2_L = np.zeros((2048,), dtype=np.float64)
    prev1_R = np.zeros((2048,), dtype=np.float64)
    prev2_R = np.zeros((2048,), dtype=np.float64)

    # Progress dots: at most ~20 of them, but never a modulo-by-zero for K < 20.
    progress_step = max(1, K // 20)
    if verbose:
        print("Encoding ", end="", flush=True)

    for i in range(K):
        start = i * hop

        frame_t: FrameT = x_pad[start : start + win, :]
        if frame_t.shape != (win, 2):
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        # Look-ahead frame for the sequence segmentation control (SSC);
        # zero-pad the tail at the end of the stream.
        next_t = x_pad[start + hop : start + hop + win, :]
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = aac_ssc(frame_t, next_t, prev_frame_type)

        # Analysis filterbank (stereo packed)
        frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE)
        chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo)

        # TNS per channel
        chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type)
        chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type)

        # Psychoacoustic model per channel (time-domain input)
        frame_L = np.asarray(frame_t[:, 0], dtype=np.float64)
        frame_R = np.asarray(frame_t[:, 1], dtype=np.float64)

        SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L)
        SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R)

        # Thresholds T(b) (stored, not entropy-coded)
        T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L)
        T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R)

        # Quantizer per channel
        S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L)
        S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R)

        # Normalize G types for the AACSeq3 schema (float | float64 ndarray).
        G_Ln = _normalize_global_gain(G_L)
        G_Rn = _normalize_global_gain(G_R)

        # Huffman-code ONLY the DPCM differences for b > 0:
        # sfc[0] corresponds to alpha(0) = G and is stored separately in the frame.
        sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
        sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]

        sfc_L_stream, cb_sfc_L = aac_encode_huff(
            sfc_L_dpcm.reshape(-1, order="F"),
            huff_LUT_list,
        )
        sfc_R_stream, cb_sfc_R = aac_encode_huff(
            sfc_R_dpcm.reshape(-1, order="F"),
            huff_LUT_list,
        )

        # The decoder assumes codebook 11 for scalefactor streams; warn loudly
        # if the encoder picked anything else (decoding would then break).
        if cb_sfc_L != 11 or cb_sfc_R != 11:
            print(f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")

        mdct_L_stream, cb_L = aac_encode_huff(
            np.asarray(S_L, dtype=np.int64).reshape(-1),
            huff_LUT_list,
        )
        mdct_R_stream, cb_R = aac_encode_huff(
            np.asarray(S_R, dtype=np.int64).reshape(-1),
            huff_LUT_list,
        )

        # Typed dict construction helps static analyzers validate the schema.
        frame_out: AACSeq3Frame = {
            "frame_type": frame_type,
            "win_type": WIN_TYPE,
            "chl": {
                "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64),
                "T": np.asarray(T_L, dtype=np.float64),
                "G": G_Ln,
                "sfc": sfc_L_stream,
                "stream": mdct_L_stream,
                "codebook": int(cb_L),
            },
            "chr": {
                "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64),
                "T": np.asarray(T_R, dtype=np.float64),
                "G": G_Rn,
                "sfc": sfc_R_stream,
                "stream": mdct_R_stream,
                "codebook": int(cb_R),
            },
        }
        aac_seq.append(frame_out)

        # Update psycho history (shift register)
        prev2_L = prev1_L
        prev1_L = frame_L
        prev2_R = prev1_R
        prev1_R = frame_R

        prev_frame_type = frame_type
        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)

    if verbose:
        print(" done")

    # Optional: store to .mat for the assignment wrapper
    if filename_aac_coded is not None:
        savemat(
            str(Path(filename_aac_coded)),
            {"aac_seq_3": np.array(aac_seq, dtype=object)},
            do_compression=True,
        )
    return aac_seq
|
||||||
|
|
||||||
41
source/level_3/core/aac_configuration.py
Normal file
41
source/level_3/core/aac_configuration.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Configuration
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# This module contains the global configurations
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Imports
|
||||||
|
from typing import Final
|
||||||
|
|
||||||
|
from core.aac_types import WinType
|
||||||
|
|
||||||
|
# Filterbank
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Window type
|
||||||
|
# Options: "SIN", "KBD"
|
||||||
|
WIN_TYPE: WinType = "SIN"
|
||||||
|
|
||||||
|
|
||||||
|
# TNS
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
PRED_ORDER = 4
|
||||||
|
QUANT_STEP = 0.1
|
||||||
|
QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7]
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Psycho
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
NMT_DB: Final[float] = 6.0 # Noise Masking Tone (dB)
|
||||||
|
TMN_DB: Final[float] = 18.0 # Tone Masking Noise (dB)
|
||||||
445
source/level_3/core/aac_decoder.py
Normal file
445
source/level_3/core/aac_decoder.py
Normal file
@ -0,0 +1,445 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Inverse AAC Coder (Core)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# - Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
|
||||||
|
# - Level 2 AAC decoder orchestration (inverse of aac_coder_1()).
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from core.aac_filterbank import aac_i_filter_bank
|
||||||
|
from core.aac_tns import aac_i_tns
|
||||||
|
from core.aac_quantizer import aac_i_quantizer
|
||||||
|
from core.aac_huffman import aac_decode_huff
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from material.huff_utils import load_LUT
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helper for NB
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _nbands(frame_type: FrameType) -> int:
    """Return the number of psychoacoustic bands for the given frame type."""
    band_table, _ = get_table(frame_type)
    low_limits = band_limits(band_table)[0]
    return int(len(low_limits))
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public helpers
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo
    FrameF container expected by aac_i_filter_bank().

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    chl_f : FrameChannelF
        Left channel coefficients: (128, 8) for ESH, else (1024, 1).
    chr_f : FrameChannelF
        Right channel coefficients: (128, 8) for ESH, else (1024, 1).

    Returns
    -------
    FrameF
        Stereo coefficients:
        - ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
        - else: (1024, 2)

    Raises
    ------
    ValueError
        If a channel array does not match the expected shape for frame_type.
    """
    if frame_type == "ESH":
        if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
            raise ValueError("ESH channel frame_F must have shape (128, 8).")

        packed = np.empty((128, 16), dtype=np.float64)
        # Interleave sub-block columns: even columns <- left, odd columns <- right.
        packed[:, 0::2] = chl_f
        packed[:, 1::2] = chr_f
        return packed

    # Non-ESH: expected (1024, 1) per channel in the Level-1 schema.
    if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
        raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")

    packed = np.empty((1024, 2), dtype=np.float64)
    packed[:, 0:1] = chl_f
    packed[:, 1:2] = chr_f
    return packed
|
||||||
|
|
||||||
|
|
||||||
|
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Strip the boundary padding added by the Level-1 encoder
    (hop samples at the start and hop samples at the end).

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024).

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).

    Raises
    ------
    ValueError
        If y_pad is too short to unpad.
    """
    total = y_pad.shape[0]
    if total < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")
    return y_pad[hop : total - hop, :]
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 decoder
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_decoder_1(
    aac_seq_1: AACSeq1,
    filename_out: Union[str, Path],
    verbose: bool = False
) -> StereoSignal:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    This function preserves the behavior of the original level_1 implementation:
    - Reconstruct the full padded stream by overlap-adding K synthesized frames
    - Remove hop padding at the beginning and hop padding at the end
    - Write the reconstructed stereo WAV file (48 kHz)
    - Return reconstructed stereo samples as float64

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.
    verbose : bool
        Optional argument to print decoding status.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If aac_seq_1 is empty.
    """
    filename_out = Path(filename_out)

    hop = 1024
    win = 2048
    K = len(aac_seq_1)

    # Guard against an empty sequence (consistent with aac_decoder_2/3);
    # otherwise n_pad below would be negative.
    if K <= 0:
        raise ValueError("aac_seq_1 must contain at least one frame.")

    # Output includes the encoder padding region, so we reconstruct the full
    # padded stream. For K frames: last frame starts at (K-1)*hop and spans win,
    # so total length = (K-1)*hop + win.
    n_pad = (K - 1) * hop + win
    y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)

    # Progress dots: at most ~20 of them, never modulo-by-zero for K < 20.
    progress_step = max(1, K // 20)
    if verbose:
        print("Decoding ", end="", flush=True)
    for i, fr in enumerate(aac_seq_1):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]

        chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)

        frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f)
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)  # (2048, 2)

        # Overlap-add into the padded output stream.
        start = i * hop
        y_pad[start:start + win, :] += frame_t_hat
        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)

    y: StereoSignal = aac_remove_padding(y_pad, hop=hop)
    if verbose:
        print(" done")

    # Level 1 assumption: 48 kHz output.
    sf.write(str(filename_out), y, 48000)
    return y
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 2 decoder
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_decoder_2(
    aac_seq_2: AACSeq2,
    filename_out: Union[str, Path],
    verbose: bool = False
) -> StereoSignal:
    """
    Level-2 AAC decoder (inverse of aac_coder_2).

    Behavior matches the Level 1 decoder pipeline, with an additional iTNS stage:
    - Per frame/channel: inverse TNS using stored coefficients
    - Re-pack to stereo frame_F
    - IMDCT + windowing
    - Overlap-add over frames
    - Remove Level-1 padding (hop samples start/end)
    - Write output WAV (48 kHz)

    Parameters
    ----------
    aac_seq_2 : AACSeq2
        Encoded sequence as produced by aac_coder_2().
    filename_out : Union[str, Path]
        Output WAV filename.
    verbose : bool
        Optional argument to print decoding status.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If aac_seq_2 is empty or a frame has unexpected coefficient shapes.
    """
    filename_out = Path(filename_out)

    hop = 1024
    win = 2048
    K = len(aac_seq_2)

    if K <= 0:
        raise ValueError("aac_seq_2 must contain at least one frame.")

    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)

    # Progress dots: at most ~20 of them, never modulo-by-zero for K < 20.
    progress_step = max(1, K // 20)
    if verbose:
        print("Decoding ", end="", flush=True)
    for i, fr in enumerate(aac_seq_2):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]

        chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)

        chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
        chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)

        # Inverse TNS per channel
        chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
        chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)

        # Re-pack to the stereo container expected by aac_i_filter_bank
        if frame_type == "ESH":
            if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
                raise ValueError("ESH channel frame_F must have shape (128, 8).")

            frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
            for j in range(8):
                frame_f[:, 2 * j + 0] = chl_f[:, j]
                frame_f[:, 2 * j + 1] = chr_f[:, j]
        else:
            # Accept either (1024, 1) or (1024,) from the internal convention.
            if chl_f.shape == (1024,):
                chl_col = chl_f.reshape(1024, 1)
            elif chl_f.shape == (1024, 1):
                chl_col = chl_f
            else:
                raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")

            if chr_f.shape == (1024,):
                chr_col = chr_f.reshape(1024, 1)
            elif chr_f.shape == (1024, 1):
                chr_col = chr_f
            else:
                raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")

            frame_f = np.empty((1024, 2), dtype=np.float64)
            frame_f[:, 0] = chl_col[:, 0]
            frame_f[:, 1] = chr_col[:, 0]

        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)

        # Overlap-add into the padded output stream.
        start = i * hop
        y_pad[start : start + win, :] += frame_t_hat
        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)

    y = aac_remove_padding(y_pad, hop=hop)
    if verbose:
        print(" done")

    sf.write(str(filename_out), y, 48000)
    return y
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_channel_3(ch, frame_type: FrameType, NB: int, sfc_len: int, huff_LUT_list):
    """
    Decode one AACSeq3 channel payload back to de-TNS'd MDCT coefficients.

    Steps: Huffman-decode scalefactors (codebook 11) and MDCT symbols (stored
    codebook), rebuild the full sfc matrix with G in row 0, then iQuantizer
    and iTNS.
    """
    tns_coeffs = np.asarray(ch["tns_coeffs"], dtype=np.float64)
    G = ch["G"]
    cb = int(ch["codebook"])

    # Scalefactor DPCM differences are always coded with codebook 11;
    # only (NB-1) values per window are in the stream (G is stored apart).
    sfc_dec = aac_decode_huff(ch["sfc"], 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False)
    if frame_type == "ESH":
        sfc = np.zeros((NB, 8), dtype=np.int64)
        Gv = np.asarray(G, dtype=np.float64).reshape(1, 8)
        sfc[0, :] = Gv[0, :].astype(np.int64)
        sfc[1:, :] = sfc_dec.reshape(NB - 1, 8, order="F")
    else:
        sfc = np.zeros((NB, 1), dtype=np.int64)
        sfc[0, 0] = int(float(G))
        sfc[1:, :] = sfc_dec.reshape(NB - 1, 1, order="F")

    # MDCT symbols: codebook 0 means "all-zero section".
    if cb == 0:
        S_dec = np.zeros((1024,), dtype=np.int64)
    else:
        S_tmp = aac_decode_huff(ch["stream"], cb, huff_LUT_list).astype(np.int64, copy=False)
        # Tuple coding may produce extra trailing symbols; the true length is
        # 1024. Also guard against short outputs by zero-padding.
        if S_tmp.size < 1024:
            S_dec = np.zeros((1024,), dtype=np.int64)
            S_dec[: S_tmp.size] = S_tmp
        else:
            S_dec = S_tmp[:1024]

    Xq = aac_i_quantizer(S_dec.reshape(1024, 1), sfc, G, frame_type)
    return aac_i_tns(Xq, frame_type, tns_coeffs)


def aac_decoder_3(
    aac_seq_3: AACSeq3,
    filename_out: Union[str, Path],
    verbose: bool = False,
) -> StereoSignal:
    """
    Level-3 AAC decoder (inverse of aac_coder_3).

    Steps per frame:
    - Huffman decode scalefactors (sfc) using codebook 11
    - Huffman decode MDCT symbols (stream) using stored codebook
    - iQuantizer -> MDCT coefficients after TNS
    - iTNS using stored predictor coefficients
    - IMDCT filterbank -> time domain
    - Overlap-add, remove padding, write WAV

    Parameters
    ----------
    aac_seq_3 : AACSeq3
        Encoded sequence as produced by aac_coder_3.
    filename_out : Union[str, Path]
        Output WAV filename.
    verbose : bool
        Optional argument to print decoding status.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.

    Raises
    ------
    ValueError
        If aac_seq_3 is empty.
    """
    filename_out = Path(filename_out)

    hop = 1024
    win = 2048
    K = len(aac_seq_3)

    if K <= 0:
        raise ValueError("aac_seq_3 must contain at least one frame.")

    # Load Huffman LUTs once.
    huff_LUT_list = load_LUT()

    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)

    # Progress dots: at most ~20 of them, never modulo-by-zero for K < 20.
    progress_step = max(1, K // 20)
    if verbose:
        print("Decoding ", end="", flush=True)

    for i, fr in enumerate(aac_seq_3):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]

        NB = _nbands(frame_type)
        # G is stored separately, so the Huffman stream contains only (NB-1)
        # DPCM differences per window (8 windows for ESH).
        sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1)

        # Left/right channels share the exact same decode pipeline.
        X_L = _decode_channel_3(fr["chl"], frame_type, NB, sfc_len, huff_LUT_list)
        X_R = _decode_channel_3(fr["chr"], frame_type, NB, sfc_len, huff_LUT_list)

        # Re-pack to stereo container and inverse filterbank
        frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R))
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)

        start = i * hop
        y_pad[start : start + win, :] += frame_t_hat

        if verbose and (i % progress_step) == 0:
            print(".", end="", flush=True)

    y = aac_remove_padding(y_pad, hop=hop)
    if verbose:
        print(" done")

    sf.write(str(filename_out), y, 48000)
    return y
|
||||||
|
|
||||||
387
source/level_3/core/aac_filterbank.py
Normal file
387
source/level_3/core/aac_filterbank.py
Normal file
@ -0,0 +1,387 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Filterbank module
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking
|
||||||
|
#
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from core.aac_utils import mdct, imdct
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
from scipy.signal.windows import kaiser
|
||||||
|
|
||||||
|
# Private helpers for Filterbank
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
def _sin_window(N: int) -> Window:
|
||||||
|
"""
|
||||||
|
Build a sinusoidal (SIN) window of length N.
|
||||||
|
|
||||||
|
The AAC sinusoid window is:
|
||||||
|
w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
N : int
|
||||||
|
Window length in samples.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Window
|
||||||
|
1-D array of shape (N, ) with dtype float64.
|
||||||
|
"""
|
||||||
|
n = np.arange(N, dtype=np.float64)
|
||||||
|
return np.sin((np.pi / N) * (n + 0.5))
|
||||||
|
|
||||||
|
|
||||||
|
def _kbd_window(N: int, alpha: float) -> Window:
|
||||||
|
"""
|
||||||
|
Build a Kaiser-Bessel-Derived (KBD) window of length N.
|
||||||
|
|
||||||
|
This follows the standard KBD construction used in AAC:
|
||||||
|
1) Build a Kaiser kernel of length (N/2 + 1).
|
||||||
|
2) Form the left half by cumulative summation, normalization, and sqrt.
|
||||||
|
3) Mirror the left half to form the right half (symmetric full-length window).
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
- N must be even (AAC uses N=2048 for long and N=256 for short).
|
||||||
|
- The assignment specifies alpha=6 for long windows and alpha=4 for short windows.
|
||||||
|
- The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
N : int
|
||||||
|
Window length in samples (must be even).
|
||||||
|
alpha : float
|
||||||
|
KBD alpha parameter.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Window
|
||||||
|
1-D array of shape (N,) with dtype float64.
|
||||||
|
"""
|
||||||
|
half = N // 2
|
||||||
|
|
||||||
|
# Kaiser kernel length: half + 1 samples (0 .. half)
|
||||||
|
# beta = pi * alpha per the usual correspondence with the ISO definition
|
||||||
|
kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
|
||||||
|
|
||||||
|
csum = np.cumsum(kernel)
|
||||||
|
denom = csum[-1]
|
||||||
|
|
||||||
|
w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1
|
||||||
|
w_right = w_left[::-1] # mirror for second half
|
||||||
|
|
||||||
|
return np.concatenate([w_left, w_right])
|
||||||
|
|
||||||
|
|
||||||
|
def _long_window(win_type: WinType) -> Window:
    """
    Return the long AAC window (length 2048) for the selected window family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (2048,).

    Raises
    ------
    ValueError
        If win_type is not a supported family.
    """
    # Assignment-specific alpha value (6.0) for the long KBD window.
    builders = {
        "SIN": lambda: _sin_window(2048),
        "KBD": lambda: _kbd_window(2048, alpha=6.0),
    }
    try:
        return builders[win_type]()
    except KeyError:
        raise ValueError(f"Invalid win_type: {win_type!r}") from None
|
||||||
|
|
||||||
|
|
||||||
|
def _short_window(win_type: WinType) -> Window:
    """
    Return the short AAC window (length 256) for the selected window family.

    Parameters
    ----------
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D float64 array of shape (256,).

    Raises
    ------
    ValueError
        If win_type is not a supported family.
    """
    # Assignment-specific alpha value (4.0) for the short KBD window.
    builders = {
        "SIN": lambda: _sin_window(256),
        "KBD": lambda: _kbd_window(256, alpha=4.0),
    }
    try:
        return builders[win_type]()
    except KeyError:
        raise ValueError(f"Invalid win_type: {win_type!r}") from None
|
||||||
|
|
||||||
|
|
||||||
|
def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window:
    """
    Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS frames.

    A single window family is assumed globally (no mixed KBD/SIN halves),
    so both the long and the short windows come from the same family.

    Layout per frame type:
      - "OLS": the full long window Wl (2048 samples).
      - "LSS": [Wl left half (1024), ones (448), Ws right half (128), zeros (448)].
      - "LPS": [zeros (448), Ws left half (128), ones (448), Wl right half (1024)].

    Parameters
    ----------
    frame_type : FrameType
        One of "OLS", "LSS", "LPS".
    win_type : WinType
        Either "SIN" or "KBD".

    Returns
    -------
    Window
        1-D array of shape (2048,) with dtype float64.

    Raises
    ------
    ValueError
        If frame_type is not a long-window frame type.
    """
    long_w = _long_window(win_type)    # (2048,)
    short_w = _short_window(win_type)  # (256,)

    if frame_type == "OLS":
        return long_w

    if frame_type == "LSS":
        # Long fade-in, plateau, short fade-out, silent tail.
        return np.concatenate([
            long_w[:1024],
            np.ones(448, dtype=np.float64),
            short_w[128:],
            np.zeros(448, dtype=np.float64),
        ])

    if frame_type == "LPS":
        # Silent head, short fade-in, plateau, long fade-out.
        return np.concatenate([
            np.zeros(448, dtype=np.float64),
            short_w[:128],
            np.ones(448, dtype=np.float64),
            long_w[1024:],
        ])

    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF:
    """
    MDCT analysis of one channel for an ESH (eight-short) frame.

    Parameters
    ----------
    x_ch : FrameChannelT
        Time-domain channel frame (expected shape: (2048,)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelF
        Array of shape (128, 8); column j holds the 128 MDCT coefficients
        of the j-th short window.
    """
    win = _short_window(win_type)                  # (256,)
    spectra = np.empty((128, 8), dtype=np.float64)

    # The 8 short sub-windows cover the central region of the frame with
    # 50% overlap: offsets 448 + 128*j, j = 0..7.
    for j, offset in enumerate(range(448, 448 + 8 * 128, 128)):
        spectra[:, j] = mdct(x_ch[offset:offset + 256] * win)

    return spectra
|
||||||
|
|
||||||
|
|
||||||
|
def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]:
|
||||||
|
"""
|
||||||
|
Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
frame_F : FrameF
|
||||||
|
Packed ESH spectrum (expected shape: (128, 16)).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
left : FrameChannelF
|
||||||
|
Left channel spectrum, shape (128, 8).
|
||||||
|
right : FrameChannelF
|
||||||
|
Right channel spectrum, shape (128, 8).
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
Inverse mapping of the packing used in aac_filter_bank():
|
||||||
|
packed[:, 2*j] = left[:, j]
|
||||||
|
packed[:, 2*j+1] = right[:, j]
|
||||||
|
"""
|
||||||
|
if frame_F.shape != (128, 16):
|
||||||
|
raise ValueError("ESH frame_F must have shape (128, 16).")
|
||||||
|
|
||||||
|
left = np.empty((128, 8), dtype=np.float64)
|
||||||
|
right = np.empty((128, 8), dtype=np.float64)
|
||||||
|
for j in range(8):
|
||||||
|
left[:, j] = frame_F[:, 2 * j + 0]
|
||||||
|
right[:, j] = frame_F[:, 2 * j + 1]
|
||||||
|
return left, right
|
||||||
|
|
||||||
|
|
||||||
|
def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT:
    """
    IMDCT synthesis of one channel for an ESH frame.

    Parameters
    ----------
    X_esh : FrameChannelF
        MDCT coefficients of 8 short windows (expected shape: (128, 8)).
    win_type : WinType
        Window family ("KBD" or "SIN").

    Returns
    -------
    FrameChannelT
        Time-domain channel contribution of shape (2048,). The 8 short
        blocks are already overlap-added internally; the caller performs
        the frame-level OLA.

    Raises
    ------
    ValueError
        If X_esh does not have shape (128, 8).
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")

    win = _short_window(win_type)       # (256,)
    acc = np.zeros(2048, dtype=np.float64)

    # Each short IMDCT yields 256 samples placed at offset 448 + 128*j,
    # giving 50% overlap between consecutive short blocks.
    for j in range(8):
        offset = 448 + 128 * j
        acc[offset:offset + 256] += imdct(X_esh[:, j]) * win

    return acc
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public Function prototypes
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis) for one stereo frame.

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN") used for the current frame.

    Returns
    -------
    frame_F : FrameF
        - "OLS"/"LSS"/"LPS": shape (1024, 2), one MDCT column per channel.
        - "ESH": shape (128, 16); subframe j occupies columns 2*j (left)
          and 2*j + 1 (right).

    Raises
    ------
    ValueError
        On a wrong input shape or an unknown frame_type.
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    left: FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False)
    right: FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type == "ESH":
        spec_l = _filter_bank_esh_channel(left, win_type)   # (128, 8)
        spec_r = _filter_bank_esh_channel(right, win_type)  # (128, 8)

        # Interleave channels: left spectra into even columns, right into odd.
        packed = np.empty((128, 16), dtype=np.float64)
        packed[:, 0::2] = spec_l
        packed[:, 1::2] = spec_r
        return packed

    if frame_type in ("OLS", "LSS", "LPS"):
        window = _window_sequence(frame_type, win_type)  # (2048,)
        out = np.empty((1024, 2), dtype=np.float64)
        out[:, 0] = mdct(left * window)
        out[:, 1] = mdct(right * window)
        return out

    raise ValueError(f"Invalid frame_type: {frame_type!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis) for one stereo frame.

    Parameters
    ----------
    frame_F : FrameF
        Frequency-domain MDCT coefficients as produced by the filterbank.
    frame_type : FrameType
        Frame type ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Reconstructed time-domain frame, stereo, shape (2048, 2).

    Raises
    ------
    ValueError
        On a frame_F shape that does not match frame_type, or on an
        unknown frame_type.
    """
    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")
        spec_l, spec_r = _unpack_esh(frame_F)
        left = _i_filter_bank_esh_channel(spec_l, win_type)
        right = _i_filter_bank_esh_channel(spec_r, win_type)
    elif frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")
        window = _window_sequence(frame_type, win_type)
        left = imdct(frame_F[:, 0]) * window
        right = imdct(frame_F[:, 1]) * window
    else:
        raise ValueError(f"Invalid frame_type: {frame_type!r}")

    # Stack the two reconstructed channels side by side: (2048, 2).
    return np.column_stack((left, right))
|
||||||
112
source/level_3/core/aac_huffman.py
Normal file
112
source/level_3/core/aac_huffman.py
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Huffman wrappers (Level 3)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Thin wrappers around the provided Huffman utilities (material/huff_utils.py)
|
||||||
|
# so that the API matches the assignment text.
|
||||||
|
#
|
||||||
|
# Exposed API (assignment):
|
||||||
|
# huff_sec, huff_codebook = aac_encode_huff(coeff_sec, huff_LUT_list, force_codebook)
|
||||||
|
# dec_coeffs = aac_decode_huff(huff_sec, huff_codebook, huff_LUT_list)
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# - Huffman coding operates on tuples. Therefore, decode(encode(x)) may return
|
||||||
|
# extra trailing symbols due to tuple padding. The AAC decoder knows the
|
||||||
|
# true section length from side information (band limits) and truncates.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from material.huff_utils import encode_huff, decode_huff
|
||||||
|
|
||||||
|
|
||||||
|
def aac_encode_huff(
    coeff_sec: np.ndarray,
    huff_LUT_list: list[dict[str, Any]],
    force_codebook: int | None = None,
) -> tuple[str, int]:
    """
    Huffman-encode a section of coefficients (MDCT symbols or scalefactors).

    Parameters
    ----------
    coeff_sec : np.ndarray
        Coefficient section to be encoded. Any shape is accepted; the input
        is flattened and treated as a 1-D sequence of int64 symbols.
    huff_LUT_list : list[dict[str, Any]]
        Huffman Look-Up Tables as returned by material.load_LUT().
        Index corresponds to codebook id (typically 1..11, 0 reserved).
    force_codebook : int | None
        When given, forces this Huffman codebook (the assignment encodes
        scalefactors with codebook 11). Omit for auto-selection, as is
        usual for MDCT coefficients.

    Returns
    -------
    tuple[str, int]
        (huff_sec, huff_codebook):
        - huff_sec: bitstream as a string of '0'/'1'
        - huff_codebook: codebook id used by the encoder
    """
    symbols = np.asarray(coeff_sec, dtype=np.int64).reshape(-1)

    if force_codebook is not None:
        # With a forced codebook the provided utility returns ONLY the bitstream.
        codebook = int(force_codebook)
        bits = encode_huff(symbols, huff_LUT_list, force_codebook=codebook)
        return str(bits), codebook

    # Auto-selection: the utility returns (bitstream, chosen codebook).
    bits, codebook = encode_huff(symbols, huff_LUT_list)
    return str(bits), int(codebook)
|
||||||
|
|
||||||
|
|
||||||
|
def aac_decode_huff(
    huff_sec: str | np.ndarray,
    huff_codebook: int,
    huff_LUT: list[dict[str, Any]],
) -> np.ndarray:
    """
    Huffman-decode a bitstream using the specified codebook.

    Parameters
    ----------
    huff_sec : str | np.ndarray
        Huffman bitstream, typically a string of '0'/'1'. Arrays are passed
        through to the provided decoder unchanged.
    huff_codebook : int
        Codebook id returned by aac_encode_huff. Codebook 0 represents an
        all-zero section.
    huff_LUT : list[dict[str, Any]]
        Huffman LUT list as returned by material.load_LUT().

    Returns
    -------
    np.ndarray
        Decoded coefficients as a 1-D np.int64 array. Due to tuple coding,
        trailing padding symbols may be present; the caller must truncate
        to the known section length.

    Raises
    ------
    ValueError
        If the codebook index is outside the LUT list.
    """
    codebook = int(huff_codebook)

    # Codebook 0 marks an all-zero section; its true length is carried as
    # side information, so an empty vector is returned and the caller expands.
    if codebook == 0:
        return np.zeros((0,), dtype=np.int64)

    if not 0 < codebook < len(huff_LUT):
        raise ValueError(f"Invalid Huffman codebook index: {codebook}")

    decoded = decode_huff(huff_sec, huff_LUT[codebook])
    return np.asarray(decoded, dtype=np.int64).reshape(-1)
|
||||||
441
source/level_3/core/aac_psycho.py
Normal file
441
source/level_3/core/aac_psycho.py
Normal file
@ -0,0 +1,441 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Psychoacoustic Model
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Psychoacoustic model for ONE channel, based on the assignment notes (Section 2.4).
|
||||||
|
#
|
||||||
|
# Public API:
|
||||||
|
# SMR = aac_psycho(frame_T, frame_type, frame_T_prev_1, frame_T_prev_2)
|
||||||
|
#
|
||||||
|
# Output:
|
||||||
|
# - For long frames ("OLS", "LSS", "LPS"): SMR has shape (69,)
|
||||||
|
# - For short frames ("ESH"): SMR has shape (42, 8) (one column per subframe)
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# - Uses Bark band tables from material/TableB219.mat:
|
||||||
|
# * B219a for long windows (69 bands, N=2048 FFT, N/2=1024 bins)
|
||||||
|
# * B219b for short windows (42 bands, N=256 FFT, N/2=128 bins)
|
||||||
|
# - Applies a Hann window in time domain before FFT magnitude/phase extraction.
|
||||||
|
# - Implements:
|
||||||
|
# spreading function -> band spreading -> tonality index -> masking thresholds -> SMR.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from core.aac_utils import band_limits, get_table
|
||||||
|
from core.aac_configuration import NMT_DB, TMN_DB
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Spreading function
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _spreading_matrix(bval: BandValueArray) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Compute the spreading function matrix between psychoacoustic bands.
|
||||||
|
|
||||||
|
The spreading function describes how energy in one critical band masks
|
||||||
|
nearby bands. The formula follows the assignment pseudo-code.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
bval : BandValueArray
|
||||||
|
Bark value per band, shape (B,).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Spreading matrix S of shape (B, B), where:
|
||||||
|
S[bb, b] quantifies the contribution of band bb masking band b.
|
||||||
|
"""
|
||||||
|
bval = np.asarray(bval, dtype=np.float64).reshape(-1)
|
||||||
|
B = int(bval.shape[0])
|
||||||
|
|
||||||
|
spread = np.zeros((B, B), dtype=np.float64)
|
||||||
|
|
||||||
|
for b in range(B):
|
||||||
|
for bb in range(B):
|
||||||
|
# tmpx depends on direction (asymmetric spreading)
|
||||||
|
if bb >= b:
|
||||||
|
tmpx = 3.0 * (bval[bb] - bval[b])
|
||||||
|
else:
|
||||||
|
tmpx = 1.5 * (bval[bb] - bval[b])
|
||||||
|
|
||||||
|
# tmpz uses the "min(..., 0)" nonlinearity exactly as in the notes
|
||||||
|
tmpz = 8.0 * min((tmpx - 0.5) ** 2 - 2.0 * (tmpx - 0.5), 0.0)
|
||||||
|
tmpy = 15.811389 + 7.5 * (tmpx + 0.474) - 17.5 * np.sqrt(1.0 + (tmpx + 0.474) ** 2)
|
||||||
|
|
||||||
|
# Clamp very small values (below -100 dB) to 0 contribution
|
||||||
|
if tmpy < -100.0:
|
||||||
|
spread[bb, b] = 0.0
|
||||||
|
else:
|
||||||
|
spread[bb, b] = 10.0 ** ((tmpz + tmpy) / 10.0)
|
||||||
|
|
||||||
|
return spread
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Windowing + FFT feature extraction
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _hann_window(N: int) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Hann window as specified in the notes:
|
||||||
|
w[n] = 0.5 - 0.5*cos(2*pi*(n + 0.5)/N)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
N : int
|
||||||
|
Window length.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
1-D array of shape (N,), dtype float64.
|
||||||
|
"""
|
||||||
|
n = np.arange(N, dtype=np.float64)
|
||||||
|
return 0.5 - 0.5 * np.cos((2.0 * np.pi / N) * (n + 0.5))
|
||||||
|
|
||||||
|
|
||||||
|
def _r_phi_from_time(x: FrameChannelT, N: int) -> tuple[FloatArray, FloatArray]:
    """
    Compute FFT magnitude r(w) and phase phi(w) for bins w = 0 .. N/2-1.

    Processing:
      1) Apply a Hann window in the time domain.
      2) Compute the N-point FFT.
      3) Keep only the positive-frequency bins [0 .. N/2-1].

    Parameters
    ----------
    x : FrameChannelT
        Time-domain samples, shape (N,).
    N : int
        FFT size (2048 or 256).

    Returns
    -------
    r : FloatArray
        Magnitude spectrum for bins 0 .. N/2-1, shape (N/2,).
    phi : FloatArray
        Phase spectrum for bins 0 .. N/2-1, shape (N/2,).

    Raises
    ------
    ValueError
        If x does not have exactly N samples.
    """
    samples = np.asarray(x, dtype=np.float64).reshape(-1)
    if samples.shape[0] != N:
        raise ValueError(f"Expected time vector of length {N}, got {samples.shape[0]}.")

    # Windowed FFT; only the positive-frequency half is needed downstream.
    spectrum = np.fft.fft(samples * _hann_window(N), n=N)[: N // 2]

    magnitude = np.abs(spectrum).astype(np.float64, copy=False)
    phase = np.angle(spectrum).astype(np.float64, copy=False)
    return magnitude, phase
|
||||||
|
|
||||||
|
|
||||||
|
def _predictability(
|
||||||
|
r: FloatArray,
|
||||||
|
phi: FloatArray,
|
||||||
|
r_m1: FloatArray,
|
||||||
|
phi_m1: FloatArray,
|
||||||
|
r_m2: FloatArray,
|
||||||
|
phi_m2: FloatArray,
|
||||||
|
) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Compute predictability c(w) per spectral bin.
|
||||||
|
|
||||||
|
The notes define:
|
||||||
|
r_pred(w) = 2*r_{-1}(w) - r_{-2}(w)
|
||||||
|
phi_pred(w) = 2*phi_{-1}(w) - phi_{-2}(w)
|
||||||
|
|
||||||
|
c(w) = |X(w) - X_pred(w)| / (r(w) + |r_pred(w)|)
|
||||||
|
|
||||||
|
where X(w) is represented in polar form using r(w), phi(w).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
r, phi : FloatArray
|
||||||
|
Current magnitude and phase, shape (N/2,).
|
||||||
|
r_m1, phi_m1 : FloatArray
|
||||||
|
Previous magnitude and phase, shape (N/2,).
|
||||||
|
r_m2, phi_m2 : FloatArray
|
||||||
|
Pre-previous magnitude and phase, shape (N/2,).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Predictability c(w), shape (N/2,).
|
||||||
|
"""
|
||||||
|
r_pred = 2.0 * r_m1 - r_m2
|
||||||
|
phi_pred = 2.0 * phi_m1 - phi_m2
|
||||||
|
|
||||||
|
num = np.sqrt(
|
||||||
|
(r * np.cos(phi) - r_pred * np.cos(phi_pred)) ** 2
|
||||||
|
+ (r * np.sin(phi) - r_pred * np.sin(phi_pred)) ** 2
|
||||||
|
)
|
||||||
|
den = r + np.abs(r_pred) + 1e-12 # avoid division-by-zero without altering behavior
|
||||||
|
return (num / den).astype(np.float64, copy=False)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Band-domain aggregation
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _band_energy_and_weighted_predictability(
|
||||||
|
r: FloatArray,
|
||||||
|
c: FloatArray,
|
||||||
|
wlow: BandIndexArray,
|
||||||
|
whigh: BandIndexArray,
|
||||||
|
) -> tuple[FloatArray, FloatArray]:
|
||||||
|
"""
|
||||||
|
Aggregate spectral bin quantities into psychoacoustic bands.
|
||||||
|
|
||||||
|
Definitions (notes):
|
||||||
|
e(b) = sum_{w=wlow(b)..whigh(b)} r(w)^2
|
||||||
|
c_num(b) = sum_{w=wlow(b)..whigh(b)} c(w) * r(w)^2
|
||||||
|
|
||||||
|
The band predictability c(b) is later computed after spreading as:
|
||||||
|
cb(b) = ct(b) / ecb(b)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
r : FloatArray
|
||||||
|
Magnitude spectrum, shape (N/2,).
|
||||||
|
c : FloatArray
|
||||||
|
Predictability per bin, shape (N/2,).
|
||||||
|
wlow, whigh : BandIndexArray
|
||||||
|
Band limits (inclusive indices), shape (B,).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
e_b : FloatArray
|
||||||
|
Band energies e(b), shape (B,).
|
||||||
|
c_num_b : FloatArray
|
||||||
|
Weighted predictability numerators c_num(b), shape (B,).
|
||||||
|
"""
|
||||||
|
r2 = (r * r).astype(np.float64, copy=False)
|
||||||
|
|
||||||
|
B = int(wlow.shape[0])
|
||||||
|
e_b = np.zeros(B, dtype=np.float64)
|
||||||
|
c_num_b = np.zeros(B, dtype=np.float64)
|
||||||
|
|
||||||
|
for b in range(B):
|
||||||
|
a = int(wlow[b])
|
||||||
|
z = int(whigh[b])
|
||||||
|
|
||||||
|
seg_r2 = r2[a : z + 1]
|
||||||
|
e_b[b] = float(np.sum(seg_r2))
|
||||||
|
c_num_b[b] = float(np.sum(c[a : z + 1] * seg_r2))
|
||||||
|
|
||||||
|
return e_b, c_num_b
|
||||||
|
|
||||||
|
|
||||||
|
def _psycho_one_window(
    time_x: FrameChannelT,
    prev1_x: FrameChannelT,
    prev2_x: FrameChannelT,
    *,
    N: int,
    table: BarkTable,
) -> FloatArray:
    """
    Compute SMR for one FFT analysis window (N=2048 for long, N=256 for short).

    Implements the pipeline from the assignment notes (Section 2.4):
      1. FFT magnitude/phase of the Hann-windowed current and two
         previous windows.
      2. Predictability c(w) per bin from linear extrapolation of the
         two previous spectra.
      3. Band energies e(b) and weighted predictability c_num(b).
      4. Band spreading via the spreading matrix S.
      5. Tonality index tb(b) in [0, 1].
      6. Masking threshold: noise threshold combined with the threshold
         in quiet.
      7. SMR(b) = e(b) / np(b).

    Parameters
    ----------
    time_x : FrameChannelT
        Current time-domain samples, shape (N,).
    prev1_x : FrameChannelT
        Previous time-domain samples, shape (N,).
    prev2_x : FrameChannelT
        Pre-previous time-domain samples, shape (N,).
    N : int
        FFT size (keyword-only).
    table : BarkTable
        Psychoacoustic band table (B219a for long, B219b for short).

    Returns
    -------
    FloatArray
        SMR per band, shape (B,) — B = 69 for long windows, 42 for short.
    """
    wlow, whigh, bval, qthr_db = band_limits(table)
    spread = _spreading_matrix(bval)

    # FFT features for current and history windows
    r, phi = _r_phi_from_time(time_x, N)
    r_m1, phi_m1 = _r_phi_from_time(prev1_x, N)
    r_m2, phi_m2 = _r_phi_from_time(prev2_x, N)

    # Predictability per bin
    c_w = _predictability(r, phi, r_m1, phi_m1, r_m2, phi_m2)

    # Aggregate into psycho bands
    e_b, c_num_b = _band_energy_and_weighted_predictability(r, c_w, wlow, whigh)

    # Spread energies and predictability across bands:
    #   ecb(b) = sum_bb e(bb) * S(bb, b)
    #   ct(b)  = sum_bb c_num(bb) * S(bb, b)
    # (spread.T @ v sums over the masker index bb.)
    ecb = spread.T @ e_b
    ct = spread.T @ c_num_b

    # Band predictability after spreading: cb(b) = ct(b) / ecb(b)
    # The 1e-12 epsilon only guards against division by zero for empty bands.
    cb = ct / (ecb + 1e-12)

    # Normalized energy term:
    #   en(b) = ecb(b) / sum_bb S(bb, b)
    spread_colsum = np.sum(spread, axis=0)
    en = ecb / (spread_colsum + 1e-12)

    # Tonality index (clamped to [0, 1]); the log argument is floored so
    # fully-unpredictable (cb ~ 0) bands map to the tonal extreme before clipping.
    tb = -0.299 - 0.43 * np.log(np.maximum(cb, 1e-12))
    tb = np.clip(tb, 0.0, 1.0)

    # Required SNR per band (dB): interpolate between TMN and NMT
    snr_b = tb * TMN_DB + (1.0 - tb) * NMT_DB
    bc = 10.0 ** (-snr_b / 10.0)

    # Noise masking threshold estimate (power domain)
    nb = en * bc

    # Threshold in quiet (convert from dB to power domain):
    #   qthr_power = eps * (N/2) * 10^(qthr_db/10)
    qthr_power = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0))

    # Final masking threshold per band:
    #   np(b) = max(nb(b), qthr(b))
    npart = np.maximum(nb, qthr_power)

    # Signal-to-mask ratio:
    #   SMR(b) = e(b) / np(b)
    smr = e_b / (npart + 1e-12)
    return smr.astype(np.float64, copy=False)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# ESH window slicing (match filterbank conventions)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _esh_subframes_256(x_2048: FrameChannelT) -> list[FrameChannelT]:
|
||||||
|
"""
|
||||||
|
Extract the 8 overlapping 256-sample short windows used by AAC ESH.
|
||||||
|
|
||||||
|
The project convention (matching the filterbank) is:
|
||||||
|
start_j = 448 + 128*j, for j = 0..7
|
||||||
|
subframe_j = x[start_j : start_j + 256]
|
||||||
|
|
||||||
|
This selects the central 1152-sample region [448, 1600) and produces
|
||||||
|
8 windows with 50% overlap.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x_2048 : FrameChannelT
|
||||||
|
Time-domain channel frame, shape (2048,).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list[FrameChannelT]
|
||||||
|
List of 8 subframes, each of shape (256,).
|
||||||
|
"""
|
||||||
|
x_2048 = np.asarray(x_2048, dtype=np.float64).reshape(-1)
|
||||||
|
if x_2048.shape[0] != 2048:
|
||||||
|
raise ValueError("ESH requires 2048-sample input frames.")
|
||||||
|
|
||||||
|
subs: list[FrameChannelT] = []
|
||||||
|
for j in range(8):
|
||||||
|
start = 448 + 128 * j
|
||||||
|
subs.append(x_2048[start : start + 256])
|
||||||
|
return subs
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_psycho(
    frame_T: FrameChannelT,
    frame_type: FrameType,
    frame_T_prev_1: FrameChannelT,
    frame_T_prev_2: FrameChannelT,
) -> FloatArray:
    """
    Psychoacoustic model for ONE channel.

    Parameters
    ----------
    frame_T : FrameChannelT
        Current time-domain channel frame, shape (2048,).
        For "ESH", the 8 short windows are derived internally.
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").
    frame_T_prev_1 : FrameChannelT
        Previous time-domain channel frame, shape (2048,).
    frame_T_prev_2 : FrameChannelT
        Pre-previous time-domain channel frame, shape (2048,).

    Returns
    -------
    FloatArray
        Signal-to-Mask Ratio (SMR), per psychoacoustic band:
        - "ESH": shape (42, 8), one column per short subframe.
        - Otherwise: shape (69,).

    Raises
    ------
    ValueError
        If any of the three frames is not 2048 samples long.
    """
    cur = np.asarray(frame_T, dtype=np.float64).reshape(-1)
    prev1 = np.asarray(frame_T_prev_1, dtype=np.float64).reshape(-1)
    prev2 = np.asarray(frame_T_prev_2, dtype=np.float64).reshape(-1)

    if any(v.shape[0] != 2048 for v in (cur, prev1, prev2)):
        raise ValueError("aac_psycho expects 2048-sample frames for current/prev1/prev2.")

    table, N = get_table(frame_type)

    # Long frame types: a single SMR vector (69 bands).
    if frame_type != "ESH":
        return _psycho_one_window(cur, prev1, prev2, N=N, table=table)

    # ESH: 8 SMR vectors (42 bands each), one per short subframe.
    #
    # The predictability history for each subframe is simply the two
    # preceding short windows in time order, borrowing the tail of the
    # previous frame for j = 0 (prev 7, 6) and j = 1 (cur 0, prev 7).
    # Building one chronological chain makes that explicit:
    #   chain = [prev6, prev7, cur0, cur1, ..., cur7]
    cur_subs = _esh_subframes_256(cur)
    prev_subs = _esh_subframes_256(prev1)
    chain = prev_subs[6:] + cur_subs

    n_bands = int(table.shape[0])  # expected 42
    smr_out = np.zeros((n_bands, 8), dtype=np.float64)

    for j in range(8):
        # chain[j+2] is current subframe j; chain[j+1]/chain[j] its history.
        smr_out[:, j] = _psycho_one_window(
            chain[j + 2], chain[j + 1], chain[j], N=256, table=table
        )

    return smr_out
|
||||||
604
source/level_3/core/aac_quantizer.py
Normal file
604
source/level_3/core/aac_quantizer.py
Normal file
@ -0,0 +1,604 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Quantizer / iQuantizer (Level 3)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Implements AAC quantizer and inverse quantizer for one channel.
|
||||||
|
# Based on assignment section 2.6 (Eq. 12-15).
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# - Bit reservoir is not implemented (assignment simplification).
|
||||||
|
# - Scalefactor bands are assumed equal to psychoacoustic bands
|
||||||
|
# (Table B.2.1.9a / B.2.1.9b from TableB219.mat).
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from core.aac_utils import get_table, band_limits
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Constants (assignment)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
MAGIC_NUMBER: float = 0.4054
|
||||||
|
MQ: int = 8191
|
||||||
|
|
||||||
|
EPS: float = 1e-12
|
||||||
|
MAX_SFC_DIFF: int = 60
|
||||||
|
|
||||||
|
# Safeguard: prevents infinite loops in pathological cases
|
||||||
|
MAX_ALPHA_ITERS: int = 2000
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helpers: ESH packing/unpacking (128x8 <-> 1024x1)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1).
|
||||||
|
|
||||||
|
Packing order:
|
||||||
|
Columns are concatenated in subframe order (0..7), column-major.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x_128x8 : FloatArray
|
||||||
|
ESH coefficients, shape (128, 8).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Packed coefficients, shape (1024, 1).
|
||||||
|
"""
|
||||||
|
x_128x8 = np.asarray(x_128x8, dtype=np.float64)
|
||||||
|
if x_128x8.shape != (128, 8):
|
||||||
|
raise ValueError("ESH pack expects shape (128, 8).")
|
||||||
|
return x_128x8.reshape(1024, 1, order="F")
|
||||||
|
|
||||||
|
|
||||||
|
def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Unpack a packed ESH vector (1024 elements) back to shape (128, 8).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x_1024x1 : FloatArray
|
||||||
|
Packed ESH vector, shape (1024,) or (1024, 1) after flattening.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Unpacked ESH coefficients, shape (128, 8).
|
||||||
|
"""
|
||||||
|
x_1024x1 = np.asarray(x_1024x1, dtype=np.float64).reshape(-1)
|
||||||
|
if x_1024x1.shape[0] != 1024:
|
||||||
|
raise ValueError("ESH unpack expects 1024 elements.")
|
||||||
|
return x_1024x1.reshape(128, 8, order="F")
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Core quantizer formulas (Eq. 12, Eq. 13)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols:
    """
    Quantize MDCT coefficients to integer symbols S(k) (Eq. 12):

        S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER )

    Parameters
    ----------
    x : FloatArray
        MDCT coefficients of a contiguous set of spectral lines, shape (N,).
    alpha : float
        Scalefactor gain of the corresponding scalefactor band.

    Returns
    -------
    QuantizedSymbols
        Quantized symbols S(k), int64, shape (N,).
    """
    coeffs = np.asarray(x, dtype=np.float64)

    # Scale magnitudes by 2^(-alpha/4), then apply the 3/4-power companding.
    magnitude = np.abs(coeffs) * (2.0 ** (-0.25 * float(alpha)))
    companded = np.power(magnitude, 0.75, dtype=np.float64)

    # "int" in the assignment text is truncation; the argument is always
    # non-negative here, so floor() is equivalent.
    levels = np.floor(companded + MAGIC_NUMBER)
    return (np.sign(coeffs) * levels).astype(np.int64)
|
||||||
|
|
||||||
|
|
||||||
|
def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray:
|
||||||
|
"""
|
||||||
|
Inverse quantizer (dequantization of symbols).
|
||||||
|
|
||||||
|
Implements Eq. (13):
|
||||||
|
Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4)
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
S : QuantizedSymbols
|
||||||
|
Quantized symbols S(k), int64, shape (N,).
|
||||||
|
alpha : float
|
||||||
|
Scalefactor gain for the corresponding scalefactor band.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FloatArray
|
||||||
|
Reconstructed MDCT coefficients Xhat(k), float64, shape (N,).
|
||||||
|
"""
|
||||||
|
S = np.asarray(S, dtype=np.int64)
|
||||||
|
|
||||||
|
scale = 2.0 ** (0.25 * float(alpha))
|
||||||
|
aS = np.abs(S).astype(np.float64)
|
||||||
|
y = np.power(aS, 4.0 / 3.0, dtype=np.float64)
|
||||||
|
|
||||||
|
return (np.sign(S).astype(np.float64) * y * scale).astype(np.float64)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Alpha initialization (Eq. 14)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _alpha_initial_from_frame_max(x_all: FloatArray) -> int:
    """
    Initial scalefactor gain alpha_hat for a whole frame (Eq. 14):

        alpha_hat = (16/3) * log2( max_k(|X(k)|)^(3/4) / MQ )

    The same value seeds every band before the per-band refinement.
    Empty or all-zero input yields 0.

    Parameters
    ----------
    x_all : FloatArray
        All MDCT coefficients of a frame (one channel); any shape, flattened.

    Returns
    -------
    int
        Initial alpha value (floor of alpha_hat).
    """
    flat = np.asarray(x_all, dtype=np.float64).reshape(-1)
    if flat.size == 0:
        return 0

    peak = float(np.max(np.abs(flat)))
    if peak <= 0.0:
        return 0

    ratio = (peak ** 0.75) / float(MQ)
    if ratio <= 0.0:
        return 0

    return int(np.floor((16.0 / 3.0) * np.log2(ratio)))
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Band utilities
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]:
    """
    Scalefactor band ranges [wlow, whigh] (inclusive) for a frame type.

    Derived from the psychoacoustic tables (TableB219); the indices map
    directly onto MDCT lines (long: 0..1023, ESH subframe: 0..127).

    Parameters
    ----------
    frame_type : FrameType
        Frame type ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    list[tuple[int, int]]
        (lo, hi) inclusive index pairs, one per scalefactor band.
    """
    table, _nfft = get_table(frame_type)
    wlow, whigh, _bval, _qthr_db = band_limits(table)
    return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)]
|
||||||
|
|
||||||
|
|
||||||
|
def _band_energy(x: FloatArray, lo: int, hi: int) -> float:
|
||||||
|
"""
|
||||||
|
Compute energy of a spectral segment x[lo:hi+1].
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : FloatArray
|
||||||
|
MDCT coefficient vector.
|
||||||
|
lo, hi : int
|
||||||
|
Inclusive index range.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
float
|
||||||
|
Sum of squares (energy) within the band.
|
||||||
|
"""
|
||||||
|
sec = x[lo : hi + 1]
|
||||||
|
return float(np.sum(sec * sec))
|
||||||
|
|
||||||
|
|
||||||
|
def _threshold_T_from_SMR(
    X: FloatArray,
    SMR_col: FloatArray,
    bands: list[tuple[int, int]],
) -> FloatArray:
    """
    Psychoacoustic thresholds T(b) per scalefactor band:

        P(b) = sum_{k in band} X(k)^2
        T(b) = P(b) / SMR(b)

    Bands with SMR(b) <= EPS get T(b) = 0 (no usable masking information).

    Parameters
    ----------
    X : FloatArray
        MDCT coefficients of a frame (long) or one ESH subframe (short).
    SMR_col : FloatArray
        SMR values for this frame/subframe, shape (NB,).
    bands : list[tuple[int, int]]
        Inclusive (lo, hi) band index ranges.

    Returns
    -------
    FloatArray
        Threshold vector T(b), shape (NB,).
    """
    T = np.zeros(len(bands), dtype=np.float64)

    for b, (lo, hi) in enumerate(bands):
        smr = float(SMR_col[b])
        if smr > EPS:
            T[b] = _band_energy(X, lo, hi) / smr

    return T
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Alpha selection per band + neighbor-difference constraint
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def _best_alpha_for_band(
    X: FloatArray,
    lo: int,
    hi: int,
    T_b: float,
    alpha0: int,
    alpha_min: int,
    alpha_max: int,
) -> int:
    """
    Choose the band scalefactor alpha(b) by greedily increasing alpha
    while the quantization error power stays within the threshold:

        P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2
        keep the largest alpha with P_e(b) <= T(b)

    Parameters
    ----------
    X : FloatArray
        Full MDCT vector (one frame or one subframe), shape (N,).
    lo, hi : int
        Inclusive MDCT index range defining the band.
    T_b : float
        Psychoacoustic threshold for this band; non-positive values skip
        the search and return alpha0 unchanged.
    alpha0 : int
        Initial alpha for the band.
    alpha_min, alpha_max : int
        Safeguard bounds for alpha.

    Returns
    -------
    int
        Selected integer alpha(b).
    """
    if T_b <= 0.0:
        return int(alpha0)

    band = X[lo : hi + 1]

    def _error_power(a: int) -> float:
        # Round-trip the band through quantize/dequantize at gain `a`.
        rec = _dequantize_symbol(_quantize_symbol(band, a), a)
        diff = band - rec
        return float(np.sum(diff * diff))

    alpha = min(max(int(alpha0), alpha_min), alpha_max)

    # If even the starting alpha violates the threshold, keep it as-is.
    if _error_power(alpha) > T_b:
        return alpha

    # MAX_ALPHA_ITERS caps the search to avoid pathological infinite loops.
    for _ in range(MAX_ALPHA_ITERS):
        candidate = alpha + 1
        if candidate > alpha_max or _error_power(candidate) > T_b:
            break
        alpha = candidate

    return alpha
|
||||||
|
|
||||||
|
|
||||||
|
def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray:
    """
    Clamp alpha so that |alpha[b] - alpha[b-1]| <= max_diff for all b.

    A forward pass (anchor on the left neighbor) followed by a backward pass
    (anchor on the right neighbor) keeps the clamping from drifting.

    Parameters
    ----------
    alpha : np.ndarray
        Alpha vector, shape (NB,).
    max_diff : int
        Maximum allowed absolute difference between adjacent bands.

    Returns
    -------
    np.ndarray
        Clamped alpha vector, int64, shape (NB,).
    """
    out = np.asarray(alpha, dtype=np.int64).copy()
    nb = out.shape[0]

    def _clamp(b: int, ref: int) -> None:
        # Restrict out[b] into [ref - max_diff, ref + max_diff].
        low, high = ref - max_diff, ref + max_diff
        if out[b] < low:
            out[b] = low
        elif out[b] > high:
            out[b] = high

    for b in range(1, nb):
        _clamp(b, int(out[b - 1]))

    for b in range(nb - 2, -1, -1):
        _clamp(b, int(out[b + 1]))

    return out
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def aac_quantizer(
    frame_F: FrameChannelF,
    frame_type: FrameType,
    SMR: FloatArray,
) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]:
    """
    AAC quantizer for one channel (Level 3).

    Quantizes MDCT coefficients using band-wise scalefactors derived from the
    SMR: per band, a threshold T(b) = P(b)/SMR(b) is computed and alpha(b) is
    grown from the frame-wide initial value (Eq. 14) as long as the
    quantization error power stays below T(b).

    Parameters
    ----------
    frame_F : FrameChannelF
        MDCT coefficients after TNS, one channel.
        Shapes:
        - Long frames: (1024,) or (1024, 1)
        - ESH: (128, 8)
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").
    SMR : FloatArray
        Signal-to-Mask Ratio per band.
        Shapes:
        - Long: (NB,) or (NB, 1)
        - ESH: (NB, 8)

    Returns
    -------
    S : QuantizedSymbols
        Quantized symbols S(k), shape (1024, 1) for all frame types
        (ESH subframes packed column-major).
    sfc : ScaleFactors
        DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1),
        with sfc(0) = alpha(0).
        Shapes:
        - Long: (NB, 1)
        - ESH: (NB, 8)
    G : GlobalGain
        Global gain G = alpha(0).
        - Long: scalar float
        - ESH: array shape (1, 8), float64

    Raises
    ------
    ValueError
        If frame_F or SMR shapes are inconsistent with frame_type.
    """
    # Scalefactor bands are assumed identical to the psychoacoustic bands.
    bands = _band_slices(frame_type)
    NB = len(bands)

    X = np.asarray(frame_F, dtype=np.float64)
    SMR = np.asarray(SMR, dtype=np.float64)

    if frame_type == "ESH":
        if X.shape != (128, 8):
            raise ValueError("For ESH, frame_F must have shape (128, 8).")
        if SMR.shape != (NB, 8):
            raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")

        S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
        sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)
        G_arr = np.zeros((1, 8), dtype=np.int64)

        # Each of the 8 short subframes is quantized independently.
        for j in range(8):
            Xj = X[:, j].reshape(128)
            SMRj = SMR[:, j].reshape(NB)

            # Per-band thresholds T(b) = P(b) / SMR(b).
            T = _threshold_T_from_SMR(Xj, SMRj, bands)

            # The same initial alpha (Eq. 14) seeds every band.
            alpha0 = _alpha_initial_from_frame_max(Xj)
            alpha = np.full((NB,), alpha0, dtype=np.int64)

            # Grow each band's alpha while the error power stays <= T(b).
            for b, (lo, hi) in enumerate(bands):
                alpha[b] = _best_alpha_for_band(
                    X=Xj, lo=lo, hi=hi, T_b=float(T[b]),
                    alpha0=int(alpha[b]),
                    alpha_min=-4096,
                    alpha_max=4096,
                )

            # Limit neighbor jumps so the DPCM stays within +/- MAX_SFC_DIFF.
            alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)

            # DPCM: sfc(0) = alpha(0), sfc(b) = alpha(b) - alpha(b-1).
            G_arr[0, j] = int(alpha[0])
            sfc[0, j] = int(alpha[0])
            for b in range(1, NB):
                sfc[b, j] = int(alpha[b] - alpha[b - 1])

            Sj = np.zeros((128,), dtype=np.int64)
            for b, (lo, hi) in enumerate(bands):
                Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b]))

            # Place this subframe in the packed output (column-major subframe
            # layout). The reshape of the contiguous column is a view, so the
            # assignment writes through to S_out.
            S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj

        return S_out, sfc, G_arr.astype(np.float64)

    # Long frames: accept (1024,) or (1024, 1) and work on a flat view.
    if X.shape == (1024,):
        Xv = X
    elif X.shape == (1024, 1):
        Xv = X[:, 0]
    else:
        raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).")

    if SMR.shape == (NB,):
        SMRv = SMR
    elif SMR.shape == (NB, 1):
        SMRv = SMR[:, 0]
    else:
        raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")

    # Same pipeline as the ESH branch, applied once to the 1024-line frame.
    T = _threshold_T_from_SMR(Xv, SMRv, bands)

    alpha0 = _alpha_initial_from_frame_max(Xv)
    alpha = np.full((NB,), alpha0, dtype=np.int64)

    for b, (lo, hi) in enumerate(bands):
        alpha[b] = _best_alpha_for_band(
            X=Xv, lo=lo, hi=hi, T_b=float(T[b]),
            alpha0=int(alpha[b]),
            alpha_min=-4096,
            alpha_max=4096,
        )

    alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)

    # DPCM-coded scalefactors; sfc(0) carries alpha(0) itself.
    sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64)
    sfc[0, 0] = int(alpha[0])
    for b in range(1, NB):
        sfc[b, 0] = int(alpha[b] - alpha[b - 1])

    G: float = float(alpha[0])

    S_vec = np.zeros((1024,), dtype=np.int64)
    for b, (lo, hi) in enumerate(bands):
        S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b]))

    return S_vec.reshape(1024, 1), sfc, G
|
||||||
|
|
||||||
|
|
||||||
|
def aac_i_quantizer(
    S: QuantizedSymbols,
    sfc: ScaleFactors,
    G: GlobalGain,
    frame_type: FrameType,
) -> FrameChannelF:
    """
    Inverse quantizer (iQuantizer) for one channel.

    Undoes `aac_quantizer`: rebuilds per-band alphas from the DPCM-coded
    scalefactors (sfc(0) holds alpha(0) itself) and dequantizes the symbols.

    Parameters
    ----------
    S : QuantizedSymbols
        Quantized symbols; any array with exactly 1024 elements.
    sfc : ScaleFactors
        DPCM-coded scalefactors.
        Shapes:
        - Long: (NB, 1)
        - ESH: (NB, 8)
    G : GlobalGain
        Global gain; unused here because sfc already carries alpha(0).
        Kept for API compatibility with the assignment.
    frame_type : FrameType
        AAC frame type.

    Returns
    -------
    FrameChannelF
        Reconstructed MDCT coefficients:
        - ESH: (128, 8)
        - Long: (1024, 1)

    Raises
    ------
    ValueError
        If S or sfc have inconsistent shapes for the given frame_type.
    """
    bands = _band_slices(frame_type)
    NB = len(bands)

    symbols = np.asarray(S, dtype=np.int64).reshape(-1)
    if symbols.shape[0] != 1024:
        raise ValueError("S must contain 1024 symbols.")

    sfc = np.asarray(sfc, dtype=np.int64)

    if frame_type == "ESH":
        if sfc.shape != (NB, 8):
            raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).")

        sym_128x8 = _esh_unpack_from_1024(symbols)
        Xrec = np.zeros((128, 8), dtype=np.float64)

        for j in range(8):
            # Undo DPCM: alpha(0) = sfc(0), alpha(b) = alpha(b-1) + sfc(b),
            # i.e. a running sum of the differences.
            alpha = np.cumsum(sfc[:, j]).astype(np.int64)

            for b, (lo, hi) in enumerate(bands):
                Xrec[lo : hi + 1, j] = _dequantize_symbol(
                    sym_128x8[lo : hi + 1, j].astype(np.int64), float(alpha[b])
                )

        return Xrec

    if sfc.shape != (NB, 1):
        raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).")

    # Undo DPCM for the single long block.
    alpha = np.cumsum(sfc[:, 0]).astype(np.int64)

    Xrec = np.zeros((1024,), dtype=np.float64)
    for b, (lo, hi) in enumerate(bands):
        Xrec[lo : hi + 1] = _dequantize_symbol(symbols[lo : hi + 1], float(alpha[b]))

    return Xrec.reshape(1024, 1)
|
||||||
217
source/level_3/core/aac_ssc.py
Normal file
217
source/level_3/core/aac_ssc.py
Normal file
@ -0,0 +1,217 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Sequence Segmentation Control module
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Sequence Segmentation Control module (SSC).
|
||||||
|
# Selects and returns the frame type based on input parameters.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Dict, Tuple
|
||||||
|
from core.aac_types import FrameType, FrameT, FrameChannelT
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Private helpers for SSC
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Stereo merge table (Table 1 in mm-2025-hw-v0.1.pdf).
# Maps the pair of per-channel frame-type decisions (left, right) to the
# single common frame type used for both channels.  ESH dominates, and a
# mixed start/stop transition pair (LSS with LPS) resolves to ESH.
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
|
||||||
|
"""
|
||||||
|
Detect whether the *next* frame (single channel) implies an attack, i.e. ESH
|
||||||
|
according to the assignment's criterion.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
next_frame_channel : FrameChannelT
|
||||||
|
One channel of next_frame_T (expected shape: (2048,)).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
True if an attack is detected (=> next frame predicted ESH), else False.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The criterion is implemented as described in the spec:
|
||||||
|
|
||||||
|
1) Apply the high-pass filter:
|
||||||
|
H(z) = (1 - z^-1) / (1 - 0.5 z^-1)
|
||||||
|
implemented in the time domain as:
|
||||||
|
y[n] = x[n] - x[n-1] + 0.5*y[n-1]
|
||||||
|
|
||||||
|
2) Split y into 16 segments of length 128 and compute segment energies s[l].
|
||||||
|
|
||||||
|
3) Compute the ratio:
|
||||||
|
ds[l] = s[l] / s[l-1]
|
||||||
|
|
||||||
|
4) An attack exists if there exists l in {1..7} such that:
|
||||||
|
s[l] > 1e-3 and ds[l] > 10
|
||||||
|
"""
|
||||||
|
# Local alias; expected to be a 1-D array of length 2048.
|
||||||
|
x = next_frame_channel
|
||||||
|
|
||||||
|
# High-pass filter reference implementation (scalar recurrence).
|
||||||
|
y = np.zeros_like(x)
|
||||||
|
prev_x = 0.0
|
||||||
|
prev_y = 0.0
|
||||||
|
for n in range(x.shape[0]):
|
||||||
|
xn = float(x[n])
|
||||||
|
yn = (xn - prev_x) + 0.5 * prev_y
|
||||||
|
y[n] = yn
|
||||||
|
prev_x = xn
|
||||||
|
prev_y = yn
|
||||||
|
|
||||||
|
# Segment energies over 16 blocks of 128 samples.
|
||||||
|
s = np.empty(16, dtype=np.float64)
|
||||||
|
for l in range(16):
|
||||||
|
a = l * 128
|
||||||
|
b = (l + 1) * 128
|
||||||
|
seg = y[a:b]
|
||||||
|
s[l] = float(np.sum(seg * seg))
|
||||||
|
|
||||||
|
# ds[l] for l>=1. For l=0 not defined, keep 0.
|
||||||
|
ds = np.zeros(16, dtype=np.float64)
|
||||||
|
eps = 1e-12 # Avoid division by zero without materially changing the logic.
|
||||||
|
for l in range(1, 16):
|
||||||
|
ds[l] = s[l] / max(s[l - 1], eps)
|
||||||
|
|
||||||
|
# Spec: check l in {1..7}.
|
||||||
|
for l in range(1, 8):
|
||||||
|
if (s[l] > 1e-3) and (ds[l] > 10.0):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
|
||||||
|
"""
|
||||||
|
Decide the current frame type for a single channel based on the previous
|
||||||
|
frame type and whether the next frame is predicted to be ESH.
|
||||||
|
|
||||||
|
Rules (spec):
|
||||||
|
|
||||||
|
- If prev is "LSS" => current is "ESH"
|
||||||
|
- If prev is "LPS" => current is "OLS"
|
||||||
|
- If prev is "OLS" => current is "LSS" if attack else "OLS"
|
||||||
|
- If prev is "ESH" => current is "ESH" if attack else "LPS"
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
prev_frame_type : FrameType
|
||||||
|
Previous frame type (one of "OLS", "LSS", "ESH", "LPS").
|
||||||
|
attack : bool
|
||||||
|
True if the next frame is predicted ESH for this channel.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
FrameType
|
||||||
|
The per-channel decision for the current frame.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if prev_frame_type == "LSS":
|
||||||
|
return "ESH"
|
||||||
|
if prev_frame_type == "LPS":
|
||||||
|
return "OLS"
|
||||||
|
if prev_frame_type == "OLS":
|
||||||
|
return "LSS" if attack else "OLS"
|
||||||
|
if prev_frame_type == "ESH":
|
||||||
|
return "ESH" if attack else "LPS"
|
||||||
|
|
||||||
|
raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Merge the per-channel frame-type decisions into one common frame type
    using the stereo merge table from the spec.

    Parameters
    ----------
    ft_l : FrameType
        Left-channel decision.
    ft_r : FrameType
        Right-channel decision.

    Returns
    -------
    FrameType
        The merged common frame type.

    Raises
    ------
    ValueError
        If the pair is not present in the merge table.
    """
    merged = STEREO_MERGE_TABLE.get((ft_l, ft_r))
    if merged is None:
        raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}")
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public Function prototypes
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).

    Chooses the frame type for the current frame (i) from:
    - the current stereo time-domain frame,
    - the next stereo frame (used only for attack detection),
    - the previous frame's type.

    Parameters
    ----------
    frame_T : FrameT
        Current time-domain frame i, shape (2048, 2).
    next_frame_T : FrameT
        Next time-domain frame i+1, shape (2048, 2); drives the
        transition decisions to/from ESH.
    prev_frame_type : FrameType
        Frame type chosen for the previous frame (i-1).

    Returns
    -------
    FrameType
        One of: "OLS", "LSS", "ESH", "LPS".

    Raises
    ------
    ValueError
        If either frame does not have shape (2048, 2).
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != (2048, 2):
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Attacks are detected independently per channel of the *next* frame;
    # both channels share the same prev_frame_type for their decision.
    per_channel = [
        _decide_frame_type(prev_frame_type, _detect_attack(next_frame_T[:, ch]))
        for ch in range(2)
    ]

    # Combine the two decisions through the spec's stereo merge table.
    return _stereo_merge(per_channel[0], per_channel[1])
|
||||||
514
source/level_3/core/aac_tns.py
Normal file
514
source/level_3/core/aac_tns.py
Normal file
@ -0,0 +1,514 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Temporal Noise Shaping (TNS)
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Temporal Noise Shaping (TNS) module (Level 2).
|
||||||
|
#
|
||||||
|
# Public API:
|
||||||
|
# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type)
|
||||||
|
# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs)
|
||||||
|
#
|
||||||
|
# Notes (per assignment):
|
||||||
|
# - TNS is applied per channel (not stereo).
|
||||||
|
# - For ESH, TNS is applied independently to each of the 8 short subframes.
|
||||||
|
# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short)
|
||||||
|
# provided in TableB219.mat.
|
||||||
|
# - Predictor order is fixed to p = 4.
|
||||||
|
# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1.
|
||||||
|
# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||||
|
# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from core.aac_utils import load_b219_tables
|
||||||
|
from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Private helpers
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _band_ranges_for_kcount(k_count: int) -> BandRanges:
    """
    Bark band index ranges [start, end] (inclusive) for a given MDCT line count.

    Parameters
    ----------
    k_count : int
        Number of MDCT lines: 1024 for long frames, 128 for short (ESH)
        subframes.

    Returns
    -------
    BandRanges (list[tuple[int, int]])
        (start_k, end_k) inclusive tuples, one per Bark band.

    Raises
    ------
    ValueError
        For an unsupported k_count or a band table inconsistent with it.
    """
    tables = load_b219_tables()
    if k_count == 1024:
        tbl = tables["B219a"]
    elif k_count == 128:
        tbl = tables["B219b"]
    else:
        raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).")

    ranges: list[tuple[int, int]] = [
        (int(s), int(e))
        for s, e in zip(tbl[:, 1].astype(int), tbl[:, 2].astype(int))
    ]

    # Sanity-check the table against the requested line count.
    for start_k, end_k in ranges:
        if start_k < 0 or end_k < start_k or end_k >= k_count:
            raise ValueError("Invalid band table ranges for given k_count.")

    return ranges
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Core DSP helpers
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _smooth_sw_inplace(sw: MdctCoeffs) -> None:
|
||||||
|
"""
|
||||||
|
Smooth Sw(k) to reduce discontinuities between adjacent Bark bands.
|
||||||
|
|
||||||
|
The assignment applies two passes:
|
||||||
|
- Backward: Sw(k) = (Sw(k) + Sw(k+1))/2
|
||||||
|
- Forward: Sw(k) = (Sw(k) + Sw(k-1))/2
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
sw : MdctCoeffs
|
||||||
|
1-D array of length K (float64). Modified in-place.
|
||||||
|
"""
|
||||||
|
k_count = int(sw.shape[0])
|
||||||
|
|
||||||
|
for k in range(k_count - 2, -1, -1):
|
||||||
|
sw[k] = 0.5 * (sw[k] + sw[k + 1])
|
||||||
|
|
||||||
|
for k in range(1, k_count):
|
||||||
|
sw[k] = 0.5 * (sw[k] + sw[k - 1])
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_sw(x: MdctCoeffs) -> MdctCoeffs:
    """
    Compute Sw(k) from band energies P(j) and apply boundary smoothing.

    Every MDCT line inside Bark band j receives the same value sqrt(P(j)),
    where P(j) is the band's energy; the result is then smoothed across
    band boundaries.

    Parameters
    ----------
    x : MdctCoeffs
        1-D MDCT line array, length K.

    Returns
    -------
    MdctCoeffs
        Sw(k), 1-D array of length K, float64.
    """
    vec = np.asarray(x, dtype=np.float64).reshape(-1)
    n_lines = int(vec.shape[0])

    sw = np.zeros(n_lines, dtype=np.float64)
    for lo, hi in _band_ranges_for_kcount(n_lines):
        band = vec[lo : hi + 1]
        # Band energy P(j) -> constant envelope value sqrt(P(j)) across the band.
        sw[lo : hi + 1] = float(np.sqrt(float(np.sum(band * band))))

    _smooth_sw_inplace(sw)
    return sw
|
||||||
|
|
||||||
|
|
||||||
|
def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs:
|
||||||
|
"""
|
||||||
|
Autocorrelation r(m) for m=0..p.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : MdctCoeffs
|
||||||
|
1-D signal.
|
||||||
|
p : int
|
||||||
|
Maximum lag.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
MdctCoeffs
|
||||||
|
r, shape (p+1,), float64.
|
||||||
|
"""
|
||||||
|
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||||
|
n = int(x.shape[0])
|
||||||
|
|
||||||
|
r = np.zeros(p + 1, dtype=np.float64)
|
||||||
|
for m in range(p + 1):
|
||||||
|
r[m] = float(np.dot(x[m:], x[: n - m]))
|
||||||
|
return r
|
||||||
|
|
||||||
|
|
||||||
|
def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs:
    """
    Solve Yule-Walker normal equations for LPC coefficients of order p.

    Parameters
    ----------
    xw : MdctCoeffs
        1-D normalized sequence Xw(k).
    p : int
        Predictor order.

    Returns
    -------
    MdctCoeffs
        LPC coefficients a[0..p-1], shape (p,), float64.
    """
    r = _autocorr(xw, p)

    # Symmetric Toeplitz matrix R[i, j] = r(|i - j|), built with one
    # vectorized fancy-index instead of the original O(p^2) Python loops.
    idx = np.arange(p)
    R = r[np.abs(idx[:, None] - idx[None, :])]

    rhs = r[1 : p + 1]

    # Tiny Tikhonov regularization keeps the solve well-posed even for
    # (near-)singular autocorrelation matrices (e.g. silent input).
    R_reg = R + 1e-12 * np.eye(p, dtype=np.float64)
    return np.linalg.solve(R_reg, rhs)
|
||||||
|
|
||||||
|
|
||||||
|
def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs:
    """
    Quantize LPC coefficients with a uniform symmetric quantizer and clamp.

    Parameters
    ----------
    a : MdctCoeffs
        LPC coefficient array, shape (p,).

    Returns
    -------
    MdctCoeffs
        Quantized coefficients, shape (p,), float64.
    """
    coeffs = np.asarray(a, dtype=np.float64).reshape(-1)
    # Snap each value to the QUANT_STEP grid, then clamp to the representable range.
    on_grid = QUANT_STEP * np.round(coeffs / QUANT_STEP)
    clamped = np.clip(on_grid, -QUANT_MAX, QUANT_MAX)
    return clamped.astype(np.float64, copy=False)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_inverse_stable(a_q: MdctCoeffs) -> bool:
|
||||||
|
"""
|
||||||
|
Check stability of the inverse TNS filter H_TNS^{-1}.
|
||||||
|
|
||||||
|
Forward filter:
|
||||||
|
H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||||
|
|
||||||
|
Inverse filter poles are roots of:
|
||||||
|
A(z) = 1 - a1 z^-1 - ... - ap z^-p
|
||||||
|
Multiply by z^p:
|
||||||
|
z^p - a1 z^{p-1} - ... - ap = 0
|
||||||
|
|
||||||
|
Stability condition:
|
||||||
|
all roots satisfy |z| < 1.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
a_q : MdctCoeffs
|
||||||
|
Quantized predictor coefficients, shape (p,).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
True if stable, else False.
|
||||||
|
"""
|
||||||
|
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||||
|
p = int(a_q.shape[0])
|
||||||
|
|
||||||
|
# Polynomial in z: z^p - a1 z^{p-1} - ... - ap
|
||||||
|
poly = np.empty(p + 1, dtype=np.float64)
|
||||||
|
poly[0] = 1.0
|
||||||
|
poly[1:] = -a_q
|
||||||
|
|
||||||
|
roots = np.roots(poly)
|
||||||
|
|
||||||
|
# Strictly inside unit circle for stability. Add tiny margin for numeric safety.
|
||||||
|
margin = 1e-12
|
||||||
|
return bool(np.all(np.abs(roots) < (1.0 - margin)))
|
||||||
|
|
||||||
|
|
||||||
|
def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs:
    """
    Make quantized predictor coefficients stable for inverse filtering.

    Policy:
    - If already stable: return as-is.
    - Else: shrink by progressively stronger factors gamma and re-quantize
      to the 0.1 grid, returning the first stable candidate.
    - If no candidate is stable: return all zeros (disables TNS).

    Parameters
    ----------
    a_q : MdctCoeffs
        Quantized predictor coefficients, shape (p,).

    Returns
    -------
    MdctCoeffs
        Stable quantized coefficients, shape (p,).
    """
    coeffs = np.asarray(a_q, dtype=np.float64).reshape(-1)

    if _is_inverse_stable(coeffs):
        return coeffs

    # Re-quantize after shrinking so the candidates stay on the coder grid.
    stable = next(
        (
            cand
            for g in (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)
            if _is_inverse_stable(cand := _quantize_coeffs(g * coeffs))
        ),
        None,
    )
    if stable is not None:
        return stable

    # Last resort: all-zero coefficients disable TNS for this vector.
    return np.zeros_like(coeffs, dtype=np.float64)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
|
||||||
|
"""
|
||||||
|
Apply forward TNS FIR filter:
|
||||||
|
y[k] = x[k] - sum_{l=1..p} a_l * x[k-l]
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : MdctCoeffs
|
||||||
|
1-D MDCT lines, length K.
|
||||||
|
a_q : MdctCoeffs
|
||||||
|
Quantized LPC coefficients, shape (p,).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
MdctCoeffs
|
||||||
|
Filtered MDCT lines y, length K.
|
||||||
|
"""
|
||||||
|
x = np.asarray(x, dtype=np.float64).reshape(-1)
|
||||||
|
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||||
|
p = int(a_q.shape[0])
|
||||||
|
k_count = int(x.shape[0])
|
||||||
|
|
||||||
|
y = np.zeros(k_count, dtype=np.float64)
|
||||||
|
for k in range(k_count):
|
||||||
|
acc = x[k]
|
||||||
|
for l in range(1, p + 1):
|
||||||
|
if k - l >= 0:
|
||||||
|
acc -= a_q[l - 1] * x[k - l]
|
||||||
|
y[k] = acc
|
||||||
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs:
|
||||||
|
"""
|
||||||
|
Apply inverse TNS IIR filter:
|
||||||
|
x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l]
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
y : MdctCoeffs
|
||||||
|
1-D MDCT lines after TNS, length K.
|
||||||
|
a_q : MdctCoeffs
|
||||||
|
Quantized LPC coefficients, shape (p,).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
MdctCoeffs
|
||||||
|
Reconstructed MDCT lines x_hat, length K.
|
||||||
|
"""
|
||||||
|
y = np.asarray(y, dtype=np.float64).reshape(-1)
|
||||||
|
a_q = np.asarray(a_q, dtype=np.float64).reshape(-1)
|
||||||
|
p = int(a_q.shape[0])
|
||||||
|
k_count = int(y.shape[0])
|
||||||
|
|
||||||
|
x_hat = np.zeros(k_count, dtype=np.float64)
|
||||||
|
for k in range(k_count):
|
||||||
|
acc = y[k]
|
||||||
|
for l in range(1, p + 1):
|
||||||
|
if k - l >= 0:
|
||||||
|
acc += a_q[l - 1] * x_hat[k - l]
|
||||||
|
x_hat[k] = acc
|
||||||
|
return x_hat
|
||||||
|
|
||||||
|
|
||||||
|
def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]:
    """
    TNS for a single MDCT vector (one long frame or one short subframe).

    Pipeline:
    1) Sw(k) from Bark band energies, with boundary smoothing.
    2) Normalize: Xw(k) = X(k) / Sw(k) (entries stay 0 where Sw ~ 0).
    3) LPC coefficients of order PRED_ORDER on Xw.
    4) Quantize (4-bit symmetric, step QUANT_STEP) and enforce
       inverse-filter stability (assignment requirement).
    5) FIR-filter the ORIGINAL X(k) with the quantized coefficients.

    Parameters
    ----------
    x : MdctCoeffs
        1-D MDCT vector.

    Returns
    -------
    y : MdctCoeffs
        TNS-processed MDCT vector (same length).
    a_q : MdctCoeffs
        Quantized LPC coefficients, shape (PRED_ORDER,).
    """
    vec = np.asarray(x, dtype=np.float64).reshape(-1)

    envelope = _compute_sw(vec)

    # Safe normalization: where the envelope is (near) zero, Xw stays zero.
    normalized = np.zeros_like(vec, dtype=np.float64)
    np.divide(vec, envelope, out=normalized, where=envelope > 1e-12)

    quantized = _quantize_coeffs(_lpc_coeffs(normalized, PRED_ORDER))
    quantized = _stabilize_quantized_coeffs(quantized)

    return _apply_tns_fir(vec, quantized), quantized
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public Functions
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]:
    """
    Temporal Noise Shaping (TNS) for ONE channel.

    Parameters
    ----------
    frame_F_in : FrameChannelF
        Per-channel MDCT coefficients.
        Expected shapes:
        - "ESH": (128, 8)
        - else:  (1024, 1) or (1024,)
    frame_type : FrameType
        Frame type code ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    frame_F_out : FrameChannelF
        MDCT coefficients after TNS; same shape convention as the input.
    tns_coeffs : TnsCoeffs
        Quantized TNS predictor coefficients:
        - "ESH": (PRED_ORDER, 8)
        - else:  (PRED_ORDER, 1)
    """
    x = np.asarray(frame_F_in, dtype=np.float64)

    if frame_type == "ESH":
        if x.shape != (128, 8):
            raise ValueError("For ESH, frame_F_in must have shape (128, 8).")

        out = np.empty_like(x, dtype=np.float64)
        coeffs = np.empty((PRED_ORDER, 8), dtype=np.float64)
        # Each of the 8 short subframes is processed independently.
        for sub in range(8):
            out[:, sub], coeffs[:, sub] = _tns_one_vector(x[:, sub])
        return out, coeffs

    # Long frames: accept both the flat and the MATLAB-style column-vector
    # convention, and mirror the input convention on output.
    if x.shape == (1024,):
        y_vec, a_q = _tns_one_vector(x)
        return y_vec, a_q.reshape(PRED_ORDER, 1)
    if x.shape == (1024, 1):
        y_vec, a_q = _tns_one_vector(x[:, 0])
        return y_vec.reshape(1024, 1), a_q.reshape(PRED_ORDER, 1)

    raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
|
||||||
|
|
||||||
|
|
||||||
|
def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF:
    """
    Inverse Temporal Noise Shaping (iTNS) for ONE channel.

    Parameters
    ----------
    frame_F_in : FrameChannelF
        Per-channel MDCT coefficients after TNS.
        Expected shapes:
        - "ESH": (128, 8)
        - else:  (1024, 1) or (1024,)
    frame_type : FrameType
        Frame type code ("OLS", "LSS", "ESH", "LPS").
    tns_coeffs : TnsCoeffs
        Quantized TNS predictor coefficients:
        - "ESH": (PRED_ORDER, 8)
        - else:  (PRED_ORDER, 1)

    Returns
    -------
    FrameChannelF
        MDCT coefficients after inverse TNS; same shape convention as
        frame_F_in.
    """
    x = np.asarray(frame_F_in, dtype=np.float64)
    coeffs = np.asarray(tns_coeffs, dtype=np.float64)

    if frame_type == "ESH":
        if x.shape != (128, 8):
            raise ValueError("For ESH, frame_F_in must have shape (128, 8).")
        if coeffs.shape != (PRED_ORDER, 8):
            raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).")

        out = np.empty_like(x, dtype=np.float64)
        # Invert each short subframe with its own coefficient column.
        for sub in range(8):
            out[:, sub] = _apply_itns_iir(x[:, sub], coeffs[:, sub])
        return out

    if coeffs.shape != (PRED_ORDER, 1):
        raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).")

    # Long frames: mirror the input's flat / column-vector convention.
    if x.shape == (1024,):
        return _apply_itns_iir(x, coeffs[:, 0])
    if x.shape == (1024, 1):
        return _apply_itns_iir(x[:, 0], coeffs[:, 0]).reshape(1024, 1)

    raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).')
|
||||||
411
source/level_3/core/aac_types.py
Normal file
411
source/level_3/core/aac_types.py
Normal file
@ -0,0 +1,411 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Public Type Aliases
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# This module implements Public Type aliases
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import List, Literal, TypeAlias, TypedDict
|
||||||
|
import numpy as np
|
||||||
|
from numpy.typing import NDArray
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Code enums (for readability; not intended to enforce shapes/lengths)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"]
|
||||||
|
"""
|
||||||
|
Frame type codes (AAC):
|
||||||
|
- "OLS": ONLY_LONG_SEQUENCE
|
||||||
|
- "LSS": LONG_START_SEQUENCE
|
||||||
|
- "ESH": EIGHT_SHORT_SEQUENCE
|
||||||
|
- "LPS": LONG_STOP_SEQUENCE
|
||||||
|
"""
|
||||||
|
|
||||||
|
WinType: TypeAlias = Literal["KBD", "SIN"]
|
||||||
|
"""
|
||||||
|
Window type codes (AAC):
|
||||||
|
- "KBD": Kaiser-Bessel-Derived
|
||||||
|
- "SIN": sinusoid
|
||||||
|
"""
|
||||||
|
|
||||||
|
ChannelKey: TypeAlias = Literal["chl", "chr"]
|
||||||
|
"""Channel dictionary keys used in Level payloads."""
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Array “semantic” aliases
|
||||||
|
#
|
||||||
|
# Goal: communicate meaning (time/frequency/window, stereo/channel) without
|
||||||
|
# forcing strict shapes in the type system.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
FloatArray: TypeAlias = NDArray[np.float64]
|
||||||
|
"""
|
||||||
|
Generic float64 NumPy array.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
- We standardize internal numeric computations to float64 for stability and
|
||||||
|
reproducibility. External I/O can still be float32, but we convert at the
|
||||||
|
boundaries.
|
||||||
|
"""
|
||||||
|
|
||||||
|
Window: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain window (weighting sequence), 1-D.
|
||||||
|
|
||||||
|
Typical lengths in this assignment:
|
||||||
|
- Long: 2048
|
||||||
|
- Short: 256
|
||||||
|
- Window sequences for LSS/LPS are also 2048
|
||||||
|
|
||||||
|
Expected shape: (N,)
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
TimeSignal: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain signal samples, typically 1-D.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
- Windowed MDCT input: shape (N,)
|
||||||
|
- IMDCT output: shape (N,)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
StereoSignal: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain stereo signal stream.
|
||||||
|
|
||||||
|
Expected (typical) shape: (N, 2)
|
||||||
|
- axis 0: time samples
|
||||||
|
- axis 1: channels [L, R]
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
MdctCoeffs: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
MDCT coefficient vector, typically 1-D.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
- Long: shape (1024,)
|
||||||
|
- Short: shape (128,)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
MdctFrameChannel: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Per-channel MDCT container used in Level-1/2 sequences.
|
||||||
|
|
||||||
|
Typical shapes:
|
||||||
|
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
|
||||||
|
- If frame_type == "ESH": (128, 8) (8 short subframes for one channel)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
Some parts of the assignment store long-frame coefficients as a column vector
|
||||||
|
(1024, 1) to match MATLAB conventions. Internally you may also use (1024,)
|
||||||
|
when convenient, but the semantic meaning is identical.
|
||||||
|
"""
|
||||||
|
|
||||||
|
TnsCoeffs: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Quantized TNS predictor coefficients (one channel).
|
||||||
|
|
||||||
|
Typical shapes (Level 2):
|
||||||
|
- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes)
|
||||||
|
- Else: (4, 1) (order p=4 for the long frame)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1.
|
||||||
|
We store the quantized coefficient values as float64 (typically multiples of 0.1)
|
||||||
|
to keep the pipeline simple and readable.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
FrameT: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain frame (stereo), as used by the filterbank input/output.
|
||||||
|
|
||||||
|
Expected (typical) shape for stereo: (2048, 2)
|
||||||
|
- axis 0: time samples
|
||||||
|
- axis 1: channels [L, R]
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
FrameChannelT: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Time-domain single-channel frame.
|
||||||
|
|
||||||
|
Expected (typical) shape: (2048,)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
FrameF: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Frequency-domain frame (MDCT coefficients), stereo container.
|
||||||
|
|
||||||
|
Typical shapes (Level 1):
|
||||||
|
- If frame_type in {"OLS","LSS","LPS"}: (1024, 2)
|
||||||
|
- If frame_type == "ESH": (128, 16)
|
||||||
|
|
||||||
|
Rationale for ESH (128, 16):
|
||||||
|
- 8 short subframes per channel => 8 * 2 = 16 columns total
|
||||||
|
- Each short subframe per stereo is (128, 2), flattened into columns
|
||||||
|
in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R]
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
FrameChannelF: TypeAlias = MdctFrameChannel
|
||||||
|
"""
|
||||||
|
Frequency-domain single-channel MDCT coefficients.
|
||||||
|
|
||||||
|
Typical shapes (Level 1/2):
|
||||||
|
- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,)
|
||||||
|
- If frame_type == "ESH": (128, 8)
|
||||||
|
|
||||||
|
dtype: float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
BandRanges: TypeAlias = list[tuple[int, int]]
|
||||||
|
"""
|
||||||
|
Bark-band index ranges [start, end] (inclusive) for MDCT lines.
|
||||||
|
|
||||||
|
Used by TNS to map MDCT indices k to Bark bands.
|
||||||
|
"""
|
||||||
|
|
||||||
|
BarkTable: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Psychoacoustic Bark band table loaded from TableB219.mat.
|
||||||
|
|
||||||
|
Typical shapes:
|
||||||
|
- Long: (69, 6)
|
||||||
|
- Short: (42, 6)
|
||||||
|
"""
|
||||||
|
|
||||||
|
BandIndexArray: TypeAlias = NDArray[np.int_]
|
||||||
|
"""
|
||||||
|
Array of FFT bin indices per psychoacoustic band.
|
||||||
|
"""
|
||||||
|
|
||||||
|
BandValueArray: TypeAlias = FloatArray
|
||||||
|
"""
|
||||||
|
Per-band psychoacoustic values (e.g. Bark position, thresholds).
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Quantizer-related semantic aliases
|
||||||
|
|
||||||
|
QuantizedSymbols: TypeAlias = NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
Quantized MDCT symbols S(k).
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols).
|
||||||
|
"""
|
||||||
|
|
||||||
|
ScaleFactors: TypeAlias = NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1).
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- Long frames: (NB, 1)
|
||||||
|
- ESH frames: (NB, 8)
|
||||||
|
"""
|
||||||
|
|
||||||
|
GlobalGain: TypeAlias = float | NDArray[np.generic]
|
||||||
|
"""
|
||||||
|
Global gain G = alpha(0).
|
||||||
|
|
||||||
|
- Long frames: scalar float
|
||||||
|
- ESH frames: array shape (1, 8)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Huffman semantic aliases
|
||||||
|
|
||||||
|
HuffmanBitstream: TypeAlias = str
|
||||||
|
"""Huffman-coded bitstream stored as a string of '0'/'1'."""
|
||||||
|
|
||||||
|
HuffmanCodebook: TypeAlias = int
|
||||||
|
"""Huffman codebook id (e.g., 0..11)."""
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 1 AAC sequence payload types
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AACChannelFrameF(TypedDict):
    """
    Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1).

    Keys
    ----
    frame_F:
        The MDCT coefficients for ONE channel.
        Typical shapes:
        - ESH: (128, 8) (8 short subframes)
        - else: (1024, 1) or (1024,)
    """
    # MDCT coefficients for one channel; see the docstring for the shape convention.
    frame_F: FrameChannelF
|
||||||
|
|
||||||
|
|
||||||
|
class AACSeq1Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_1 (Level 1).
    """
    frame_type: FrameType  # frame type code ("OLS", "LSS", "ESH", "LPS")
    win_type: WinType      # window type code ("KBD" or "SIN")
    chl: AACChannelFrameF  # left-channel MDCT payload
    chr: AACChannelFrameF  # right-channel MDCT payload
|
||||||
|
|
||||||
|
|
||||||
|
AACSeq1: TypeAlias = List[AACSeq1Frame]
|
||||||
|
"""
|
||||||
|
AAC sequence for Level 1:
|
||||||
|
List of length K (K = number of frames).
|
||||||
|
|
||||||
|
Each element is a dict with keys:
|
||||||
|
- "frame_type", "win_type", "chl", "chr"
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 2 AAC sequence payload types (TNS)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AACChannelFrameF2(TypedDict):
    """
    Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2).

    Keys
    ----
    frame_F:
        The TNS-processed MDCT coefficients for ONE channel.
        Typical shapes:
        - ESH: (128, 8)
        - else: (1024, 1) or (1024,)
    tns_coeffs:
        Quantized TNS predictor coefficients for ONE channel.
        Typical shapes:
        - ESH: (PRED_ORDER, 8)
        - else: (PRED_ORDER, 1)
    """
    # MDCT coefficients AFTER applying TNS for this channel.
    frame_F: FrameChannelF
    # Quantized TNS predictor coefficients for this channel.
    tns_coeffs: TnsCoeffs
|
||||||
|
|
||||||
|
|
||||||
|
class AACSeq2Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_2 (Level 2).
    """
    frame_type: FrameType   # frame type code ("OLS", "LSS", "ESH", "LPS")
    win_type: WinType       # window type code ("KBD" or "SIN")
    chl: AACChannelFrameF2  # left-channel payload (TNS-processed MDCT + coefficients)
    chr: AACChannelFrameF2  # right-channel payload (TNS-processed MDCT + coefficients)
|
||||||
|
|
||||||
|
|
||||||
|
AACSeq2: TypeAlias = List[AACSeq2Frame]
|
||||||
|
"""
|
||||||
|
AAC sequence for Level 2:
|
||||||
|
List of length K (K = number of frames).
|
||||||
|
|
||||||
|
Each element is a dict with keys:
|
||||||
|
- "frame_type", "win_type", "chl", "chr"
|
||||||
|
|
||||||
|
Level 2 adds:
|
||||||
|
- per-channel "tns_coeffs"
|
||||||
|
and stores:
|
||||||
|
- per-channel "frame_F" after applying TNS.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Level 3 AAC sequence payload types (Quantizer + Huffman)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AACChannelFrameF3(TypedDict):
    """
    Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3).

    Keys
    ----
    tns_coeffs:
        Quantized TNS predictor coefficients for ONE channel.
        Shapes:
        - ESH: (PRED_ORDER, 8)
        - else: (PRED_ORDER, 1)

    T:
        Psychoacoustic thresholds per band.
        Shapes:
        - ESH: (NB, 8)
        - else: (NB, 1)
        Note: Stored for completeness / debugging; not entropy-coded.

    G:
        Quantized global gains.
        Shapes:
        - ESH: (1, 8) (one per short subframe)
        - else: scalar (or compatible np scalar)

    sfc:
        Huffman-coded scalefactor differences (DPCM sequence).

    stream:
        Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols).

    codebook:
        Huffman codebook id used for MDCT symbols (stream).
        (Scalefactors typically use fixed codebook 11 and do not need to store it.)
    """
    # Quantized TNS predictor coefficients (see docstring for shapes).
    tns_coeffs: TnsCoeffs
    # Psychoacoustic thresholds per band; stored for debugging, not entropy-coded.
    T: FloatArray
    # Quantized global gain(s): array (1, 8) for ESH, scalar otherwise.
    G: FloatArray | float
    # Huffman bitstream ('0'/'1' string) of DPCM scalefactor differences.
    sfc: HuffmanBitstream
    # Huffman bitstream ('0'/'1' string) of the quantized MDCT symbols.
    stream: HuffmanBitstream
    # Huffman codebook id used for the MDCT symbol stream.
    codebook: HuffmanCodebook
|
||||||
|
|
||||||
|
|
||||||
|
class AACSeq3Frame(TypedDict):
    """
    One frame dictionary element of aac_seq_3 (Level 3).
    """
    frame_type: FrameType   # frame type code ("OLS", "LSS", "ESH", "LPS")
    win_type: WinType       # window type code ("KBD" or "SIN")
    chl: AACChannelFrameF3  # left-channel entropy-coded payload
    chr: AACChannelFrameF3  # right-channel entropy-coded payload
|
||||||
|
|
||||||
|
|
||||||
|
AACSeq3: TypeAlias = List[AACSeq3Frame]
|
||||||
|
"""
|
||||||
|
AAC sequence for Level 3:
|
||||||
|
List of length K (K = number of frames).
|
||||||
|
|
||||||
|
Each element is a dict with keys:
|
||||||
|
- "frame_type", "win_type", "chl", "chr"
|
||||||
|
|
||||||
|
Level 3 adds (per channel):
|
||||||
|
- "tns_coeffs"
|
||||||
|
- "T" thresholds (not entropy-coded)
|
||||||
|
- "G" global gain(s)
|
||||||
|
- "sfc" Huffman-coded scalefactor differences
|
||||||
|
- "stream" Huffman-coded MDCT quantized symbols
|
||||||
|
- "codebook" Huffman codebook for MDCT symbols
|
||||||
|
"""
|
||||||
270
source/level_3/core/aac_utils.py
Normal file
270
source/level_3/core/aac_utils.py
Normal file
@ -0,0 +1,270 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - AAC Utilities
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Shared utility functions used across AAC encoder/decoder levels.
|
||||||
|
#
|
||||||
|
# This module currently provides:
|
||||||
|
# - MDCT / IMDCT conversions
|
||||||
|
# - Signal-to-Noise Ratio (SNR) computation in dB
|
||||||
|
# - Loading and access helpers for psychoacoustic band tables
|
||||||
|
# (TableB219.mat, Tables B.2.1.9a / B.2.1.9b of the AAC specification)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from scipy.io import loadmat
|
||||||
|
from core.aac_types import *
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Global cached data
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Cached contents of TableB219.mat to avoid repeated disk I/O.
|
||||||
|
# Keys:
|
||||||
|
# - "B219a": long-window psychoacoustic bands (69 bands, FFT size 2048)
|
||||||
|
# - "B219b": short-window psychoacoustic bands (42 bands, FFT size 256)
|
||||||
|
B219_CACHE: dict[str, BarkTable] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# MDCT / IMDCT
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
def mdct(s: TimeSignal) -> MdctCoeffs:
    """
    MDCT (direct form) as specified in the assignment.

    Parameters
    ----------
    s : TimeSignal
        Windowed time samples, 1-D array of length N (N = 2048 or 256).

    Returns
    -------
    MdctCoeffs
        MDCT coefficients, 1-D array of length N/2.

    Definition
    ----------
    X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
    where n0 = (N/2 + 1)/2.
    """
    samples = np.asarray(s, dtype=np.float64).reshape(-1)
    length = int(samples.shape[0])
    if length not in (2048, 256):
        raise ValueError("MDCT input length must be 2048 or 256.")

    n0 = (length / 2.0 + 1.0) / 2.0
    time_idx = np.arange(length, dtype=np.float64) + n0
    freq_idx = np.arange(length // 2, dtype=np.float64) + 0.5

    # Direct form: one (N, N/2) cosine basis matrix and a single matvec.
    basis = np.cos((2.0 * np.pi / length) * np.outer(time_idx, freq_idx))
    return 2.0 * (samples @ basis)
|
||||||
|
|
||||||
|
|
||||||
|
def imdct(X: "MdctCoeffs") -> "TimeSignal":
    """
    Direct-form IMDCT as specified in the assignment.

    Parameters
    ----------
    X : MdctCoeffs
        MDCT coefficients, 1-D array of length K (K = 1024 or 128).

    Returns
    -------
    TimeSignal
        Reconstructed time samples, 1-D array of length N = 2K.

    Notes
    -----
    s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)),
    where n0 = (N/2 + 1)/2.
    """
    coeffs = np.asarray(X, dtype=np.float64).reshape(-1)
    half = int(coeffs.shape[0])
    if half not in (1024, 128):
        raise ValueError("IMDCT input length must be 1024 or 128.")

    full = 2 * half
    n0 = (full / 2.0 + 1.0) / 2.0

    time_idx = n0 + np.arange(full, dtype=np.float64)
    freq_idx = 0.5 + np.arange(half, dtype=np.float64)

    # Same cosine kernel as the forward transform, applied from the left.
    kernel = np.cos((2.0 * np.pi / full) * np.outer(time_idx, freq_idx))
    return (2.0 / full) * (kernel @ coeffs)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Signal quality metrics
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def snr_db(x_ref: "StereoSignal", x_hat: "StereoSignal") -> float:
    """
    Compute the overall Signal-to-Noise Ratio (SNR) in dB.

    Both signals are conservatively aligned to their common length and
    channel count before the power ratio is computed over all samples.

    Parameters
    ----------
    x_ref : StereoSignal
        Reference (original) signal, typically shape (N, 2).
    x_hat : StereoSignal
        Reconstructed/processed signal, typically shape (M, 2).

    Returns
    -------
    float
        SNR in dB; +inf for zero noise power (perfect reconstruction),
        -inf for zero reference power.
    """
    ref = np.asarray(x_ref, dtype=np.float64)
    rec = np.asarray(x_hat, dtype=np.float64)

    # Promote mono vectors to a (samples, 1) column layout.
    if ref.ndim == 1:
        ref = ref[:, np.newaxis]
    if rec.ndim == 1:
        rec = rec[:, np.newaxis]

    # Truncate both to the overlapping region.
    rows = min(ref.shape[0], rec.shape[0])
    cols = min(ref.shape[1], rec.shape[1])
    ref = ref[:rows, :cols]
    rec = rec[:rows, :cols]

    noise = ref - rec
    p_signal = float(np.sum(ref * ref))
    p_noise = float(np.sum(noise * noise))

    if p_noise <= 0.0:
        return float("inf")
    if p_signal <= 0.0:
        return float("-inf")
    return float(10.0 * np.log10(p_signal / p_noise))
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Psychoacoustic band tables (TableB219.mat)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def load_b219_tables() -> "dict[str, BarkTable]":
    """
    Load and cache the psychoacoustic band tables from TableB219.mat.

    The project layout assumes a 'material' directory in the current
    working directory. The tables are read from disk once; subsequent
    calls return the module-level cache.

    Returns
    -------
    dict[str, BarkTable]
        - "B219a": long-window table (69 bands, FFT size 2048)
        - "B219b": short-window table (42 bands, FFT size 256)
    """
    global B219_CACHE
    if B219_CACHE is not None:
        return B219_CACHE

    mat_path = Path("material") / "TableB219.mat"
    if not mat_path.exists():
        raise FileNotFoundError(
            "Could not locate material/TableB219.mat in the current working directory."
        )

    data = loadmat(str(mat_path))
    if "B219a" not in data or "B219b" not in data:
        raise ValueError(
            "TableB219.mat missing required variables 'B219a' and/or 'B219b'."
        )

    # Normalize to float64 ndarrays and populate the cache.
    B219_CACHE = {
        key: np.asarray(data[key], dtype=np.float64)
        for key in ("B219a", "B219b")
    }
    return B219_CACHE
|
||||||
|
|
||||||
|
|
||||||
|
def get_table(frame_type: "FrameType") -> "tuple[BarkTable, int]":
    """
    Select the psychoacoustic band table and FFT size for a frame type.

    Parameters
    ----------
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    table : BarkTable
        B219b for ESH (short) frames, B219a for every long frame type.
    N : int
        Matching FFT size: 256 for ESH, 2048 otherwise.
    """
    key, fft_size = ("B219b", 256) if frame_type == "ESH" else ("B219a", 2048)
    return load_b219_tables()[key], fft_size
|
||||||
|
|
||||||
|
|
||||||
|
def band_limits(
    table: "BarkTable",
) -> "tuple[BandIndexArray, BandIndexArray, BandValueArray, BandValueArray]":
    """
    Extract per-band metadata from a TableB2.1.9 psychoacoustic table.

    Column layout follows TableB219.mat / AAC tables B.2.1.9a-b.

    Parameters
    ----------
    table : BarkTable
        Psychoacoustic band table (B219a or B219b).

    Returns
    -------
    wlow : BandIndexArray
        Lower FFT bin index (inclusive) per band.
    whigh : BandIndexArray
        Upper FFT bin index (inclusive) per band.
    bval : BandValueArray
        Bark-scale band position values (used by the spreading function).
    qthr_db : BandValueArray
        Threshold in quiet per band, in dB.
    """
    # Named column positions within the .mat table.
    COL_WLOW, COL_WHIGH, COL_BVAL, COL_QTHR = 1, 2, 4, 5

    wlow = table[:, COL_WLOW].astype(int)
    whigh = table[:, COL_WHIGH].astype(int)
    bval = table[:, COL_BVAL].astype(np.float64)
    qthr_db = table[:, COL_QTHR].astype(np.float64)
    return wlow, whigh, bval, qthr_db
|
||||||
237
source/level_3/level_3.py
Normal file
237
source/level_3/level_3.py
Normal file
@ -0,0 +1,237 @@
|
|||||||
|
# ------------------------------------------------------------
|
||||||
|
# AAC Coder/Decoder - Level 3 Wrappers + Demo
|
||||||
|
#
|
||||||
|
# Multimedia course at Aristotle University of
|
||||||
|
# Thessaloniki (AUTh)
|
||||||
|
#
|
||||||
|
# Author:
|
||||||
|
# Christos Choutouridis (ΑΕΜ 8997)
|
||||||
|
# cchoutou@ece.auth.gr
|
||||||
|
#
|
||||||
|
# Description:
|
||||||
|
# Level 3 wrapper module.
|
||||||
|
#
|
||||||
|
# This file provides:
|
||||||
|
# - Thin wrappers for Level 3 API functions (encode/decode) that delegate
|
||||||
|
# to the corresponding core implementations.
|
||||||
|
# - A demo function that runs end-to-end and computes:
|
||||||
|
# * SNR
|
||||||
|
# * bitrate (coded)
|
||||||
|
# * compression ratio
|
||||||
|
# - A small CLI entrypoint for convenience.
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
|
import os
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from core.aac_types import AACSeq3, StereoSignal
|
||||||
|
from core.aac_coder import aac_coder_3 as core_aac_coder_3
|
||||||
|
from core.aac_coder import aac_read_wav_stereo_48k
|
||||||
|
from core.aac_decoder import aac_decoder_3 as core_aac_decoder_3
|
||||||
|
from core.aac_utils import snr_db
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Helpers (Level 3 metrics)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _wav_duration_seconds(wav_path: Path) -> float:
    """Return WAV duration in seconds using soundfile metadata."""
    info = sf.info(str(wav_path))
    rate = info.samplerate
    frames = info.frames
    if rate <= 0:
        raise ValueError("Invalid samplerate in WAV header.")
    if frames < 0:
        raise ValueError("Invalid frame count in WAV header.")
    return float(frames) / float(rate)
|
||||||
|
|
||||||
|
|
||||||
|
def _bitrate_before_from_file(wav_path: Path) -> float:
    """
    Compute the input bitrate (bits/s) from file size and duration.

    Note:
        File-size based estimate (the WAV header is included), which is
        acceptable for a simple compression-ratio metric.
    """
    duration = _wav_duration_seconds(wav_path)
    if duration <= 0.0:
        raise ValueError("Non-positive WAV duration.")
    size_bits = float(os.path.getsize(wav_path)) * 8.0
    return size_bits / duration
|
||||||
|
|
||||||
|
|
||||||
|
def _bitrate_after_from_aacseq(aac_seq_3: AACSeq3, duration_sec: float) -> float:
|
||||||
|
"""
|
||||||
|
Compute coded bitrate (bits/s) from Huffman streams stored in AACSeq3.
|
||||||
|
|
||||||
|
We count bits from:
|
||||||
|
- scalefactor Huffman bitstream ("sfc")
|
||||||
|
- MDCT symbols Huffman bitstream ("stream")
|
||||||
|
for both channels and all frames.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
We intentionally ignore side-info overhead (frame_type, G, T, TNS coeffs,
|
||||||
|
codebook ids, etc.). This matches a common simplified metric in demos.
|
||||||
|
"""
|
||||||
|
if duration_sec <= 0.0:
|
||||||
|
raise ValueError("Non-positive duration for bitrate computation.")
|
||||||
|
|
||||||
|
total_bits = 0
|
||||||
|
for fr in aac_seq_3:
|
||||||
|
total_bits += len(fr["chl"]["sfc"])
|
||||||
|
total_bits += len(fr["chl"]["stream"])
|
||||||
|
total_bits += len(fr["chr"]["sfc"])
|
||||||
|
total_bits += len(fr["chr"]["stream"])
|
||||||
|
|
||||||
|
return float(total_bits) / float(duration_sec)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Public Level 3 API (wrappers)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def aac_coder_3(
    filename_in: Union[str, Path],
    filename_aac_coded: Optional[Union[str, Path]] = None,
) -> AACSeq3:
    """
    Level-3 AAC encoder (wrapper).

    Thin delegation to the core implementation with verbose output on.

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename. Assumption: stereo audio, sampling rate 48 kHz.
    filename_aac_coded : Optional[Union[str, Path]]
        Optional filename to store the encoded AAC sequence (e.g., .mat).

    Returns
    -------
    AACSeq3
        List of encoded frames (Level 3 schema).
    """
    # All encoding logic lives in core; this wrapper only fixes verbosity.
    encoded = core_aac_coder_3(filename_in, filename_aac_coded, verbose=True)
    return encoded
|
||||||
|
|
||||||
|
|
||||||
|
def i_aac_coder_3(
    aac_seq_3: AACSeq3,
    filename_out: Union[str, Path],
) -> StereoSignal:
    """
    Level-3 AAC decoder (wrapper).

    Thin delegation to the core implementation with verbose output on.

    Parameters
    ----------
    aac_seq_3 : AACSeq3
        Encoded sequence as produced by aac_coder_3().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded time-domain samples, stereo, shape (N, 2), dtype float64.
    """
    # All decoding logic lives in core; this wrapper only fixes verbosity.
    decoded = core_aac_decoder_3(aac_seq_3, filename_out, verbose=True)
    return decoded
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Demo (Level 3)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def demo_aac_3(
    filename_in: Union[str, Path],
    filename_out: Union[str, Path],
    filename_aac_coded: Optional[Union[str, Path]] = None,
) -> Tuple[float, float, float]:
    """
    Demonstration for the Level-3 codec.

    Runs the full encode/decode round trip:
      - aac_coder_3(filename_in, filename_aac_coded)
      - i_aac_coder_3(aac_seq_3, filename_out)
    and computes:
      - total SNR between original and decoded audio
      - coded bitrate (bits/s) based on the Huffman streams
      - compression ratio (bitrate_before / bitrate_after)

    Parameters
    ----------
    filename_in : Union[str, Path]
        Input WAV filename (stereo, 48 kHz).
    filename_out : Union[str, Path]
        Output WAV filename (stereo, 48 kHz).
    filename_aac_coded : Optional[Union[str, Path]]
        Optional filename to store the encoded AAC sequence (e.g., .mat).

    Returns
    -------
    Tuple[float, float, float]
        (SNR_dB, bitrate_after_bits_per_s, compression_ratio)
    """
    in_path = Path(filename_in)
    out_path = Path(filename_out)
    coded_path = Path(filename_aac_coded) if filename_aac_coded else None

    # Reference audio, validated the same way the codec validates input.
    x_ref, fs_ref = aac_read_wav_stereo_48k(in_path)
    if int(fs_ref) != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    # Encode / decode round trip.
    aac_seq_3 = aac_coder_3(in_path, coded_path)
    x_hat = i_aac_coder_3(aac_seq_3, out_path)

    # Sanity check: the decoded file exists, is readable, and is 48 kHz.
    _, fs_hat = sf.read(str(out_path), always_2d=True)
    if int(fs_hat) != 48000:
        raise ValueError("Decoded output sampling rate must be 48 kHz.")

    # Metrics.
    snr_value = snr_db(x_ref, x_hat)
    duration = _wav_duration_seconds(in_path)
    bitrate_before = _bitrate_before_from_file(in_path)
    bitrate_after = _bitrate_after_from_aacseq(aac_seq_3, duration)
    if bitrate_after <= 0.0:
        compression = float("inf")
    else:
        compression = bitrate_before / bitrate_after

    return float(snr_value), float(bitrate_after), float(compression)
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entrypoint. Example usage (from the level_3 directory):
    #   python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav
    #   python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav aac_seq_3.mat
    import sys

    if len(sys.argv) not in (3, 4):
        raise SystemExit("Usage: python -m level_3 <input.wav> <output.wav> [aac_seq_3.mat]")

    in_wav = Path(sys.argv[1])
    out_wav = Path(sys.argv[2])
    aac_mat = Path(sys.argv[3]) if len(sys.argv) == 4 else None

    print(f"Encoding/Decoding {in_wav} to {out_wav}")
    if aac_mat is not None:
        print(f"Storing coded sequence to {aac_mat}")

    snr, bitrate, compression = demo_aac_3(in_wav, out_wav, aac_mat)
    print(f"SNR = {snr:.3f} dB")
    print(f"Bitrate (coded) = {bitrate:.2f} bits/s")
    print(f"Compression ratio = {compression:.4f}")
|
||||||
BIN
source/level_3/material/LicorDeCalandraca.wav
Normal file
BIN
source/level_3/material/LicorDeCalandraca.wav
Normal file
Binary file not shown.
BIN
source/level_3/material/LicorDeCalandraca_out2.wav
Normal file
BIN
source/level_3/material/LicorDeCalandraca_out2.wav
Normal file
Binary file not shown.
BIN
source/level_3/material/TableB219.mat
Normal file
BIN
source/level_3/material/TableB219.mat
Normal file
Binary file not shown.
BIN
source/level_3/material/aac_seq_3.mat
Normal file
BIN
source/level_3/material/aac_seq_3.mat
Normal file
Binary file not shown.
BIN
source/level_3/material/huffCodebooks.mat
Normal file
BIN
source/level_3/material/huffCodebooks.mat
Normal file
Binary file not shown.
400
source/level_3/material/huff_utils.py
Normal file
400
source/level_3/material/huff_utils.py
Normal file
@ -0,0 +1,400 @@
|
|||||||
|
import numpy as np
|
||||||
|
import scipy.io as sio
|
||||||
|
import os
|
||||||
|
|
||||||
|
# ------------------ LOAD LUT ------------------
|
||||||
|
|
||||||
|
def load_LUT(mat_filename=None):
    """
    Load the list of Huffman codebooks (LUTs) from huffCodebooks.mat.

    Parameters
    ----------
    mat_filename : str, optional
        Path to huffCodebooks.mat; defaults to the file next to this module.

    Returns
    -------
    huffLUT : list
        Codebook descriptor dicts at indices 1..11 (index 0 unused).
    """
    if mat_filename is None:
        here = os.path.dirname(os.path.abspath(__file__))
        mat_filename = os.path.join(here, "huffCodebooks.mat")

    mat = sio.loadmat(mat_filename)
    raw = mat['huffCodebooks'].squeeze()
    codebooks = [np.array(raw[i]) for i in range(11)]

    # Build one inverse VLC decoding table per codebook.
    inv_tables = []
    for cb in codebooks:
        code_vals = cb[:, 2].astype(int)   # column 3: codeword value
        code_lens = cb[:, 1].astype(int)   # column 2: codeword length (bits)
        words = [format(v, f'0{l}b') for v, l in zip(code_vals, code_lens)]
        inv_tables.append(vlc_table(words))

    # (nTupleSize, maxAbsCodeVal, signedValues) for codebooks 1..11.
    params = [
        (4, 1, True),
        (4, 1, True),
        (4, 2, False),
        (4, 2, False),
        (2, 4, True),
        (2, 4, True),
        (2, 7, False),
        (2, 7, False),
        (2, 12, False),
        (2, 12, False),
        (2, 16, False),
    ]

    huffLUT = [None] * 12  # index 0 intentionally unused
    for i, (n_tuple, max_abs, signed) in enumerate(params, start=1):
        huffLUT[i] = {
            'LUT': codebooks[i - 1],
            'invTable': inv_tables[i - 1],
            'codebook': i,
            'nTupleSize': n_tuple,
            'maxAbsCodeVal': max_abs,
            'signedValues': signed,
        }

    return huffLUT
|
||||||
|
|
||||||
|
def vlc_table(code_array):
|
||||||
|
"""
|
||||||
|
codeArray: list of strings, each string is a Huffman codeword (e.g. '0101')
|
||||||
|
returns:
|
||||||
|
h : NumPy array of shape (num_nodes, 3)
|
||||||
|
columns:
|
||||||
|
[ next_if_0 , next_if_1 , symbol_index ]
|
||||||
|
"""
|
||||||
|
h = np.zeros((1, 3), dtype=int)
|
||||||
|
|
||||||
|
for code_index, code in enumerate(code_array, start=1):
|
||||||
|
word = [int(bit) for bit in code]
|
||||||
|
h_index = 0
|
||||||
|
|
||||||
|
for bit in word:
|
||||||
|
k = bit
|
||||||
|
next_node = h[h_index, k]
|
||||||
|
if next_node == 0:
|
||||||
|
h = np.vstack([h, [0, 0, 0]])
|
||||||
|
new_index = h.shape[0] - 1
|
||||||
|
h[h_index, k] = new_index
|
||||||
|
h_index = new_index
|
||||||
|
else:
|
||||||
|
h_index = next_node
|
||||||
|
|
||||||
|
h[h_index, 2] = code_index
|
||||||
|
|
||||||
|
return h
|
||||||
|
|
||||||
|
# ------------------ ENCODE ------------------
|
||||||
|
|
||||||
|
def encode_huff(coeff_sec, huff_LUT_list, force_codebook=None):
    """
    Huffman-encode a sequence of quantized coefficients.

    Selects the Huffman codebook from the maximum absolute coefficient
    value, encodes the coefficients into a '0'/'1' bitstream, and returns
    the bitstream together with the selected codebook index. Python
    equivalent of the MATLAB `encodeHuff.m`; coefficients are grouped
    into fixed-size tuples per the chosen LUT (zero-padded if needed).

    Parameters
    ----------
    coeff_sec : array_like of int
        1-D quantized integer coefficients (a section / scalefactor band).
    huff_LUT_list : list
        Huffman LUT dicts as returned by `load_LUT()`; indices 1..11 are
        valid codebooks, index 0 is unused.
    force_codebook : int, optional
        If given, encode with exactly this codebook.
        NOTE(review): this path returns only the bitstream (no codebook
        index), unlike the automatic path which returns a 2-tuple —
        confirm against callers before unifying.

    Returns
    -------
    huffSec : str
        Huffman bitstream as a string of '0'/'1' characters.
    huffCodebook : int
        Codebook index used (1..11); 0 marks an all-zero section.
    """
    if force_codebook is not None:
        return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec)

    def _pick_shorter(cb_a, cb_b):
        # Encode with both candidate codebooks and keep the shorter
        # stream (ties favor the first candidate, as in the original).
        sec_a = huff_LUT_code_1(huff_LUT_list[cb_a], coeff_sec)
        sec_b = huff_LUT_code_1(huff_LUT_list[cb_b], coeff_sec)
        return (sec_a, cb_a) if len(sec_a) <= len(sec_b) else (sec_b, cb_b)

    maxAbsVal = np.max(np.abs(coeff_sec))

    # Codebook choice by maximum magnitude, per the AAC codebook ranges.
    if maxAbsVal == 0:
        return huff_LUT_code_0(), 0
    if maxAbsVal == 1:
        return _pick_shorter(1, 2)
    if maxAbsVal == 2:
        return _pick_shorter(3, 4)
    if maxAbsVal <= 4:
        return _pick_shorter(5, 6)
    if maxAbsVal <= 7:
        return _pick_shorter(7, 8)
    if maxAbsVal <= 12:
        return _pick_shorter(9, 10)
    if maxAbsVal <= 15:
        return huff_LUT_code_1(huff_LUT_list[11], coeff_sec), 11
    # Values above 15 need codebook 11 with escape sequences.
    return huff_LUT_code_ESC(huff_LUT_list[11], coeff_sec), 11
|
||||||
|
|
||||||
|
def huff_LUT_code_1(huff_LUT, coeff_sec):
    """
    Encode coefficients using a single Huffman codebook LUT.

    Coefficients are grouped into n-tuples (zero-padded), each tuple is
    mapped to a LUT index in mixed radix, and the codeword bits are
    emitted. For unsigned codebooks one sign bit per coefficient follows
    each codeword; signed codebooks embed the sign in the symbol.
    """
    lut = huff_LUT['LUT']
    tuple_size = huff_LUT['nTupleSize']
    max_abs = huff_LUT['maxAbsCodeVal']
    signed = huff_LUT['signedValues']

    n_tuples = int(np.ceil(len(coeff_sec) / tuple_size))

    if signed:
        shifted = coeff_sec + max_abs          # shift into [0, 2*max_abs]
        base = 2 * max_abs + 1
    else:
        shifted = coeff_sec
        base = max_abs + 1

    padded = np.zeros(n_tuples * tuple_size, dtype=int)
    padded[:len(shifted)] = shifted

    # Mixed-radix weights: first tuple element is the most significant.
    weights = base ** np.arange(tuple_size - 1, -1, -1)

    pieces = []
    for t in range(n_tuples):
        group = padded[t * tuple_size:(t + 1) * tuple_size]
        idx = int(np.abs(group) @ weights)

        code_val = int(lut[idx, 2])
        code_len = int(lut[idx, 1])
        word = format(code_val, f'0{code_len}b')

        if signed:
            pieces.append(word)
        else:
            sign_bits = ''.join('1' if v < 0 else '0' for v in group)
            pieces.append(word + sign_bits)

    return ''.join(pieces)
|
||||||
|
|
||||||
|
def huff_LUT_code_0():
    """Codebook 0 (all-zero section): no bits are emitted."""
    return ''
|
||||||
|
|
||||||
|
def huff_LUT_code_ESC(huff_LUT, coeff_sec):
    """
    Encode with the escape codebook (codebook 11).

    Magnitudes above 15 are clipped to the escape marker 16 for the LUT
    lookup; the true magnitude is transmitted as an escape sequence of
    N ones, a '0' separator, and an N4-bit remainder (N4 = N + 4).
    Sign bits (one per coefficient) precede the escape sequences.
    """
    lut = huff_LUT['LUT']
    tuple_size = huff_LUT['nTupleSize']
    max_abs = huff_LUT['maxAbsCodeVal']

    n_tuples = int(np.ceil(len(coeff_sec) / tuple_size))
    base = max_abs + 1

    padded = np.zeros(n_tuples * tuple_size, dtype=int)
    padded[:len(coeff_sec)] = coeff_sec

    weights = base ** np.arange(tuple_size - 1, -1, -1)

    pieces = []
    for t in range(n_tuples):
        group = padded[t * tuple_size:(t + 1) * tuple_size]

        # eps guards log2(0); N4/N size the escape prefix and remainder.
        safe = group.astype(float)
        safe[safe == 0] = np.finfo(float).eps
        n4 = np.maximum(0, np.floor(np.log2(np.abs(safe))).astype(int))
        n_ones = np.maximum(0, n4 - 4)
        is_esc = np.abs(group) > 15

        # Clip escaped values to the +/-16 marker for the LUT index.
        clipped = group.copy()
        clipped[is_esc] = np.sign(clipped[is_esc]) * 16

        idx = int(np.abs(clipped) @ weights)
        word = format(int(lut[idx, 2]), f'0{int(lut[idx, 1])}b')

        esc_bits = ''
        for k in range(tuple_size):
            if is_esc[k]:
                esc_bits += '1' * n_ones[k]
                esc_bits += '0'
                esc_bits += format(abs(group[k]) - (1 << n4[k]), f'0{n4[k]}b')

        sign_bits = ''.join('1' if v < 0 else '0' for v in group)
        pieces.append(word + sign_bits + esc_bits)

    return ''.join(pieces)
|
||||||
|
|
||||||
|
# ------------------ DECODE ------------------
|
||||||
|
|
||||||
|
def decode_huff(huff_sec, huff_LUT):
    """
    Decode a Huffman-encoded stream.

    Parameters
    ----------
    huff_sec : array-like of int or str
        Huffman encoded stream of 0/1 values (string or list/array).
    huff_LUT : dict
        Huffman lookup table with keys:
        - 'invTable': inverse VLC table (numpy array)
        - 'codebook': codebook number
        - 'nTupleSize': tuple size
        - 'maxAbsCodeVal': maximum absolute code value
        - 'signedValues': True/False

    Returns
    -------
    decCoeffs : list
        Decoded quantized coefficients.

    Notes
    -----
    BUG FIX vs the previous version: sign bits are emitted by the encoder
    (see huff_LUT_code_1) ONLY for unsigned codebooks; signed codebooks
    embed the sign in the symbol itself. Consuming sign bits
    unconditionally desynchronized the stream for signed codebooks, so
    sign-bit and escape handling are now guarded by `not signedValues`.
    (Escapes only exist in codebook 11, which is unsigned.)
    """
    h = huff_LUT['invTable']
    codebook = huff_LUT['codebook']
    tuple_size = huff_LUT['nTupleSize']
    max_abs = huff_LUT['maxAbsCodeVal']
    signed = huff_LUT['signedValues']

    # Normalize a '0'/'1' string to an int array.
    if isinstance(huff_sec, str):
        huff_sec = np.array([int(b) for b in huff_sec])

    dec_coeffs = []
    stream_index = 0

    while stream_index < len(huff_sec):
        # Walk the inverse VLC table bit by bit until a leaf is reached.
        node = 0
        word_bits = 0
        while True:
            bit = huff_sec[stream_index + word_bits]
            word_bits += 1
            node = h[node, bit]
            if h[node, 0] == 0 and h[node, 1] == 0:
                symbol_index = h[node, 2] - 1  # table stores 1-based symbols
                stream_index += word_bits
                break

        # Demap the symbol index to an n-tuple of values (mixed radix;
        # signed codebooks store values shifted by +max_abs).
        base = (2 * max_abs + 1) if signed else (max_abs + 1)
        n_tuple = []
        remainder = int(symbol_index)
        for p in reversed(range(tuple_size)):
            digit = remainder // (base ** p)
            remainder %= base ** p
            n_tuple.append(digit - max_abs if signed else digit)
        n_tuple = np.array(n_tuple)

        if not signed:
            # One sign bit per coefficient follows the codeword
            # (unsigned codebooks only — see Notes).
            sign_bits = huff_sec[stream_index:stream_index + tuple_size]
            signs = -(np.sign(sign_bits - 0.5))  # bit 1 -> -1, bit 0 -> +1
            stream_index += tuple_size
            n_tuple = n_tuple * signs

            # Escape sequences: magnitude 16 is the escape marker
            # carrying (N ones, '0', N4-bit remainder) with N4 = N + 4.
            esc_positions = np.where(np.abs(n_tuple) == 16)[0]
            if codebook == 11 and esc_positions.size > 0:
                for pos in esc_positions:
                    n_run = 0
                    while huff_sec[stream_index + n_run]:
                        n_run += 1
                    stream_index += n_run + 1
                    n4 = n_run + 4
                    esc_word = huff_sec[stream_index:stream_index + n4]
                    n_tuple[pos] = 2 ** n4 + int("".join(map(str, esc_word)), 2)
                    stream_index += n4
                # The escape value overwrote the magnitude; reapply signs.
                n_tuple[esc_positions] *= signs[esc_positions]

        dec_coeffs.extend(n_tuple.tolist())

    return dec_coeffs
|
||||||
|
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user