# ------------------------------------------------------------ # AAC Coder/Decoder - Inverse AAC Coder (Core) # # Multimedia course at Aristotle University of # Thessaloniki (AUTh) # # Author: # Christos Choutouridis (ΑΕΜ 8997) # cchoutou@ece.auth.gr # # Description: # Level 1 AAC decoder orchestration (inverse of aac_coder_1()). # Keeps the same functional behavior as the original level_1 implementation: # - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank() # - IMDCT synthesis per frame # - Overlap-add with hop=1024 # - Remove encoder boundary padding: hop at start and hop at end # # Note: # This core module returns the reconstructed samples. Writing to disk is kept # in level_x demos. # ------------------------------------------------------------ from __future__ import annotations from pathlib import Path from typing import Union import soundfile as sf from core.aac_filterbank import aac_i_filter_bank from core.aac_types import * # ----------------------------------------------------------------------------- # Public helpers (useful for level_x demo wrappers) # ----------------------------------------------------------------------------- def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: """ Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo FrameF container expected by aac_i_filter_bank(). Parameters ---------- frame_type : FrameType "OLS" | "LSS" | "ESH" | "LPS". chl_f : FrameChannelF Left channel coefficients: - ESH: (128, 8) - else: (1024, 1) chr_f : FrameChannelF Right channel coefficients: - ESH: (128, 8) - else: (1024, 1) Returns ------- FrameF Stereo coefficients: - ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7] - else: (1024, 2) """ if frame_type == "ESH": if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): raise ValueError("ESH channel frame_F must have shape (128, 8).") frame_f = np.empty((128, 16), dtype=np.float64) for j in range(8): frame_f[:, 2 * j + 0] = chl_f[:, j] frame_f[:, 2 * j + 1] = chr_f[:, j] return frame_f # Non-ESH: expected (1024, 1) per channel in Level-1 schema. if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1): raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).") frame_f = np.empty((1024, 2), dtype=np.float64) frame_f[:, 0] = chl_f[:, 0] frame_f[:, 1] = chr_f[:, 0] return frame_f def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal: """ Remove the boundary padding that the Level-1 encoder adds: hop samples at start and hop samples at end. Parameters ---------- y_pad : StereoSignal (np.ndarray) Reconstructed padded stream, shape (N_pad, 2). hop : int Hop size in samples (default 1024). Returns ------- StereoSignal (np.ndarray) Unpadded reconstructed stream, shape (N_pad - 2*hop, 2). Raises ------ ValueError If y_pad is too short to unpad. """ if y_pad.shape[0] < 2 * hop: raise ValueError("Decoded stream too short to unpad.") return y_pad[hop:-hop, :] # ----------------------------------------------------------------------------- # Level 1 decoder (core) # ----------------------------------------------------------------------------- def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: """ Level-1 AAC decoder (inverse of aac_coder_1()). This function preserves the behavior of the original level_1 implementation: - Reconstruct the full padded stream by overlap-adding K synthesized frames - Remove hop padding at the beginning and hop padding at the end - Write the reconstructed stereo WAV file (48 kHz) - Return reconstructed stereo samples as float64 Parameters ---------- aac_seq_1 : AACSeq1 Encoded sequence as produced by aac_coder_1(). filename_out : Union[str, Path] Output WAV filename. Assumption: 48 kHz, stereo. Returns ------- StereoSignal Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. """ filename_out = Path(filename_out) hop = 1024 win = 2048 K = len(aac_seq_1) # Output includes the encoder padding region, so we reconstruct the full padded stream. # For K frames: last frame starts at (K-1)*hop and spans win, # so total length = (K-1)*hop + win. n_pad = (K - 1) * hop + win y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) for i, fr in enumerate(aac_seq_1): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) start = i * hop y_pad[start:start + win, :] += frame_t_hat y: StereoSignal = aac_remove_padding(y_pad, hop=hop) # Level 1 assumption: 48 kHz output. sf.write(str(filename_out), y, 48000) return y