# Multimedia_AAC_Project/source/core/aac_decoder.py

# ------------------------------------------------------------
# AAC Coder/Decoder - Inverse AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
#   Keeps the same functional behavior as the original level_1 implementation:
#   - Re-pack per-channel spectra into FrameF expected by aac_i_filter_bank()
#   - IMDCT synthesis per frame
#   - Overlap-add with hop=1024
#   - Remove encoder boundary padding: hop at start and hop at end
#
#   Note:
#   This core module returns the reconstructed samples. Writing to disk is kept
#   in level_x demos.
# ------------------------------------------------------------
from __future__ import annotations

from pathlib import Path
from typing import Union

import soundfile as sf

from core.aac_filterbank import aac_i_filter_bank
from core.aac_types import *


# -----------------------------------------------------------------------------
# Public helpers (useful for level_x demo wrappers)
# -----------------------------------------------------------------------------

def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Merge the left/right per-channel spectra of one frame into a single
    stereo coefficient array.

    Parameters
    ----------
    frame_type : FrameType
        Frame type tag. Only the value "ESH" selects the short-block layout;
        every other value is handled as a long-block frame.
    chl_f : FrameChannelF
        Left channel coefficients. Must be (128, 8) for "ESH",
        (1024, 1) otherwise.
    chr_f : FrameChannelF
        Right channel coefficients. Same shape rules as chl_f.

    Returns
    -------
    FrameF
        Newly allocated float64 array:
        - "ESH": (128, 16), columns interleaved as [L0 R0 L1 R1 ... L7 R7]
        - otherwise: (1024, 2), column 0 = left, column 1 = right

    Raises
    ------
    ValueError
        If either channel does not have the shape required by frame_type.
    """
    if frame_type == "ESH":
        short_shape = (128, 8)
        if not (chl_f.shape == short_shape and chr_f.shape == short_shape):
            raise ValueError("ESH channel frame_F must have shape (128, 8).")

        packed = np.empty((128, 16), dtype=np.float64)
        # Even columns carry the 8 left subframes, odd columns the 8 right
        # subframes, which yields the L0 R0 L1 R1 ... interleaving.
        packed[:, 0::2] = chl_f
        packed[:, 1::2] = chr_f
        return packed

    # Long-block frames: one column of 1024 coefficients per channel.
    long_shape = (1024, 1)
    if not (chl_f.shape == long_shape and chr_f.shape == long_shape):
        raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")

    packed = np.empty((1024, 2), dtype=np.float64)
    packed[:, 0:1] = chl_f
    packed[:, 1:2] = chr_f
    return packed


def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Remove the boundary padding that the Level-1 encoder adds:
    hop samples at start and hop samples at end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024). Must be non-negative; hop == 0
        means there is no padding and the whole stream is returned.

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2), obtained by
        slicing y_pad along its first axis.

    Raises
    ------
    ValueError
        If hop is negative, or if y_pad is too short to unpad.
    """
    if hop < 0:
        raise ValueError("hop must be non-negative.")

    n_total = y_pad.shape[0]
    if n_total < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")

    # Use an explicit end index instead of -hop: for hop == 0 the slice
    # [0:-0] is [0:0], which would wrongly return an empty stream.
    return y_pad[hop:n_total - hop, :]


# -----------------------------------------------------------------------------
# Level 1 decoder (core)
# -----------------------------------------------------------------------------

def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Decode a Level-1 AAC sequence back to time-domain stereo samples.

    Processing steps:
    - For every frame, merge the "chl"/"chr" spectra into one stereo array
      and run it through aac_i_filter_bank().
    - Overlap-add the synthesized frames at a spacing of 1024 samples into
      a zero-initialized buffer.
    - Strip 1024 samples from each end of the buffer with
      aac_remove_padding().
    - Write the result to filename_out via soundfile at 48000 Hz.

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Sequence of frame records. Each record is read through the keys
        "frame_type", "win_type", and "chl"/"chr" -> "frame_F".
    filename_out : Union[str, Path]
        Destination audio file path.

    Returns
    -------
    StereoSignal
        The decoded samples that were written to disk, float64, two columns.

    Raises
    ------
    ValueError
        Propagated from the helper functions, e.g. when a channel spectrum
        has an unexpected shape or the overlap-added stream is too short
        to be unpadded.
    """
    out_path = Path(filename_out)

    sample_rate = 48000  # Level 1 assumption: 48 kHz output.
    hop = 1024
    frame_len = 2048
    n_frames = len(aac_seq_1)

    # The last frame begins at (n_frames - 1) * hop and is frame_len samples
    # long, which fixes the size of the padded overlap-add buffer.
    padded_len = (n_frames - 1) * hop + frame_len
    padded = np.zeros((padded_len, 2), dtype=np.float64)

    for idx, record in enumerate(aac_seq_1):
        frame_type = record["frame_type"]
        win_type = record["win_type"]

        left_spec = np.asarray(record["chl"]["frame_F"], dtype=np.float64)
        right_spec = np.asarray(record["chr"]["frame_F"], dtype=np.float64)

        stereo_spec = aac_unpack_seq_channels_to_frame_f(frame_type, left_spec, right_spec)
        # Synthesized time-domain frame; expected to be (2048, 2).
        synth = aac_i_filter_bank(stereo_spec, frame_type, win_type)

        offset = idx * hop
        padded[offset:offset + frame_len, :] += synth

    decoded = aac_remove_padding(padded, hop=hop)

    sf.write(str(out_path), decoded, sample_rate)

    return decoded