167 lines
5.3 KiB
Python
167 lines
5.3 KiB
Python
# ------------------------------------------------------------
|
||
# AAC Coder/Decoder - Inverse AAC Coder (Core)
|
||
#
|
||
# Multimedia course at Aristotle University of
|
||
# Thessaloniki (AUTh)
|
||
#
|
||
# Author:
|
||
# Christos Choutouridis (ΑΕΜ 8997)
|
||
# cchoutou@ece.auth.gr
|
||
#
|
||
# Description:
|
||
# Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
|
||
# Keeps the same functional behavior as the original level_1 implementation:
|
||
# - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank()
|
||
# - IMDCT synthesis per frame
|
||
# - Overlap-add with hop=1024
|
||
# - Remove encoder boundary padding: hop at start and hop at end
|
||
#
|
||
# Note:
|
||
# aac_decoder_1() returns the reconstructed samples and also writes them to
# filename_out as a 48 kHz stereo WAV file.
|
||
# ------------------------------------------------------------
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
from typing import Union
|
||
|
||
import soundfile as sf
|
||
|
||
from core.aac_filterbank import aac_i_filter_bank
|
||
from core.aac_types import *
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
# Public helpers (useful for level_x demo wrappers)
|
||
# -----------------------------------------------------------------------------
|
||
|
||
def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Interleave the left/right channel spectra of one Level-1 frame into the
    stereo FrameF layout consumed by aac_i_filter_bank().

    Parameters
    ----------
    frame_type : FrameType
        Frame type tag. Only the value "ESH" selects the short-block layout;
        every other value is treated as a long-block frame.
    chl_f : FrameChannelF
        Left channel coefficients, shape (128, 8) for ESH, (1024, 1) otherwise.
    chr_f : FrameChannelF
        Right channel coefficients, same shape rules as chl_f.

    Returns
    -------
    FrameF
        Freshly allocated float64 array:
            - ESH:  (128, 16), columns ordered [L0 R0 L1 R1 ... L7 R7]
            - else: (1024, 2), columns ordered [L R]

    Raises
    ------
    ValueError
        If either channel does not have the shape required by frame_type.
    """
    short_blocks = frame_type == "ESH"
    expected_shape = (128, 8) if short_blocks else (1024, 1)

    if not (chl_f.shape == expected_shape and chr_f.shape == expected_shape):
        if short_blocks:
            raise ValueError("ESH channel frame_F must have shape (128, 8).")
        raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")

    rows, cols_per_channel = expected_shape
    packed = np.empty((rows, 2 * cols_per_channel), dtype=np.float64)

    # Even columns carry the left channel, odd columns the right channel.
    # With a single column per channel this is simply [L, R].
    packed[:, 0::2] = chl_f
    packed[:, 1::2] = chr_f
    return packed
|
||
|
||
|
||
def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Strip the boundary padding that the Level-1 encoder adds:
    hop samples at the start and hop samples at the end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024). Must be non-negative.

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).
        This is a slice of y_pad, not a copy.

    Raises
    ------
    ValueError
        If hop is negative, or if y_pad has fewer than 2*hop rows.
    """
    if hop < 0:
        raise ValueError("hop must be non-negative.")

    n_pad = y_pad.shape[0]
    if n_pad < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")

    # Use an explicit end index instead of -hop: with hop == 0 the slice
    # [0:-0] is [0:0], which would silently drop every sample.
    return y_pad[hop:n_pad - hop, :]
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
# Level 1 decoder (core)
|
||
# -----------------------------------------------------------------------------
|
||
|
||
def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Decode a Level-1 AAC sequence (inverse of aac_coder_1()).

    Processing steps:
        1. For every frame, re-pack the per-channel spectra into a stereo
           FrameF and synthesize it with aac_i_filter_bank().
        2. Overlap-add the synthesized frames with a hop of 1024 samples into
           a buffer that still contains the encoder's boundary padding.
        3. Drop one hop of padding at each end via aac_remove_padding().
        4. Write the result to filename_out with a sample rate of 48000.

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1(). Each element is read
        through the keys "frame_type", "win_type", "chl"/"frame_F" and
        "chr"/"frame_F".
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded time-domain samples with the padding removed; the same array
        that is written to filename_out.
    """
    out_path = Path(filename_out)

    hop_size = 1024
    frame_len = 2048
    num_frames = len(aac_seq_1)

    # The last frame begins at (num_frames - 1) * hop_size and is frame_len
    # samples long, which fixes the length of the padded buffer.
    padded_len = frame_len + hop_size * (num_frames - 1)
    padded: StereoSignal = np.zeros((padded_len, 2), dtype=np.float64)

    offset = 0
    for frame in aac_seq_1:
        frame_type: FrameType = frame["frame_type"]
        win_type: WinType = frame["win_type"]

        left_spec = np.asarray(frame["chl"]["frame_F"], dtype=np.float64)
        right_spec = np.asarray(frame["chr"]["frame_F"], dtype=np.float64)

        stereo_spec: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, left_spec, right_spec)
        # Synthesized time-domain frame; expected to be (2048, 2).
        synthesized: FrameT = aac_i_filter_bank(stereo_spec, frame_type, win_type)

        # Overlap-add: consecutive frames overlap by frame_len - hop_size samples.
        padded[offset:offset + frame_len, :] += synthesized
        offset += hop_size

    decoded: StereoSignal = aac_remove_padding(padded, hop=hop_size)

    # Level 1 assumption: 48 kHz output.
    sf.write(str(out_path), decoded, 48000)

    return decoded
|