Multimedia_AAC_Project/source/core/aac_decoder.py

# ------------------------------------------------------------
# AAC Coder/Decoder - Inverse AAC Coder (Core)
#
# Multimedia course at Aristotle University of
# Thessaloniki (AUTh)
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   - Level 1 AAC decoder orchestration (inverse of aac_coder_1()).
#   - Level 2 AAC decoder orchestration (inverse of aac_coder_1()).
#
# ------------------------------------------------------------
from __future__ import annotations

from pathlib import Path
from typing import Union

import soundfile as sf

from core.aac_filterbank import aac_i_filter_bank
from core.aac_tns import aac_i_tns
from core.aac_types import *


# -----------------------------------------------------------------------------
# Public helpers
# -----------------------------------------------------------------------------

def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF:
    """
    Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo
    FrameF container expected by aac_i_filter_bank().

    Parameters
    ----------
    frame_type : FrameType
        "OLS" | "LSS" | "ESH" | "LPS".
    chl_f : FrameChannelF
        Left channel coefficients:
        - ESH: (128, 8)
        - else: (1024, 1)
    chr_f : FrameChannelF
        Right channel coefficients:
        - ESH: (128, 8)
        - else: (1024, 1)

    Returns
    -------
    FrameF
        Stereo coefficients:
        - ESH: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
        - else: (1024, 2)
    """
    if frame_type == "ESH":
        if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
            raise ValueError("ESH channel frame_F must have shape (128, 8).")

        frame_f = np.empty((128, 16), dtype=np.float64)
        for j in range(8):
            frame_f[:, 2 * j + 0] = chl_f[:, j]
            frame_f[:, 2 * j + 1] = chr_f[:, j]
        return frame_f

    # Non-ESH: expected (1024, 1) per channel in Level-1 schema.
    if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
        raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")

    frame_f = np.empty((1024, 2), dtype=np.float64)
    frame_f[:, 0] = chl_f[:, 0]
    frame_f[:, 1] = chr_f[:, 0]
    return frame_f


def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal:
    """
    Remove the boundary padding that the Level-1 encoder adds:
    hop samples at start and hop samples at end.

    Parameters
    ----------
    y_pad : StereoSignal (np.ndarray)
        Reconstructed padded stream, shape (N_pad, 2).
    hop : int
        Hop size in samples (default 1024).

    Returns
    -------
    StereoSignal (np.ndarray)
        Unpadded reconstructed stream, shape (N_pad - 2*hop, 2).

    Raises
    ------
    ValueError
        If y_pad is too short to unpad.
    """
    if y_pad.shape[0] < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")
    return y_pad[hop:-hop, :]


# -----------------------------------------------------------------------------
# Level 1 decoder
# -----------------------------------------------------------------------------

def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    This function preserves the behavior of the original level_1 implementation:
    - Reconstruct the full padded stream by overlap-adding K synthesized frames
    - Remove hop padding at the beginning and hop padding at the end
    - Write the reconstructed stereo WAV file (48 kHz)
    - Return reconstructed stereo samples as float64

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1().
    filename_out : Union[str, Path]
        Output WAV filename. Assumption: 48 kHz, stereo.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
    """
    filename_out = Path(filename_out)

    hop = 1024
    win = 2048
    K = len(aac_seq_1)

    # Output includes the encoder padding region, so we reconstruct the full padded stream.
    # For K frames: last frame starts at (K-1)*hop and spans win,
    # so total length = (K-1)*hop + win.
    n_pad = (K - 1) * hop + win
    y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64)

    for i, fr in enumerate(aac_seq_1):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]

        chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)

        frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f)
        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)  # (2048, 2)

        start = i * hop
        y_pad[start:start + win, :] += frame_t_hat

    y: StereoSignal = aac_remove_padding(y_pad, hop=hop)

    # Level 1 assumption: 48 kHz output.
    sf.write(str(filename_out), y, 48000)

    return y


# -----------------------------------------------------------------------------
# Level 2 decoder
# -----------------------------------------------------------------------------

def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal:
    """
    Level-2 AAC decoder (inverse of aac_coder_2).

    Behavior matches Level 1 decoder pipeline, with additional iTNS stage:
    - Per frame/channel: inverse TNS using stored coefficients
    - Re-pack to stereo frame_F
    - IMDCT + windowing
    - Overlap-add over frames
    - Remove Level-1 padding (hop samples start/end)
    - Write output WAV (48 kHz)

    Parameters
    ----------
    aac_seq_2 : AACSeq2
        Encoded sequence as produced by aac_coder_2().
    filename_out : Union[str, Path]
        Output WAV filename.

    Returns
    -------
    StereoSignal
        Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64.
    """
    filename_out = Path(filename_out)

    hop = 1024
    win = 2048
    K = len(aac_seq_2)

    if K <= 0:
        raise ValueError("aac_seq_2 must contain at least one frame.")

    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)

    for i, fr in enumerate(aac_seq_2):
        frame_type: FrameType = fr["frame_type"]
        win_type: WinType = fr["win_type"]

        chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)

        chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64)
        chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64)

        # Inverse TNS per channel
        chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs)
        chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs)

        # Re-pack to the stereo container expected by aac_i_filter_bank
        if frame_type == "ESH":
            if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
                raise ValueError("ESH channel frame_F must have shape (128, 8).")

            frame_f: FrameF = np.empty((128, 16), dtype=np.float64)
            for j in range(8):
                frame_f[:, 2 * j + 0] = chl_f[:, j]
                frame_f[:, 2 * j + 1] = chr_f[:, j]
        else:
            # Accept either (1024,1) or (1024,) from your internal convention.
            if chl_f.shape == (1024,):
                chl_col = chl_f.reshape(1024, 1)
            elif chl_f.shape == (1024, 1):
                chl_col = chl_f
            else:
                raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).")

            if chr_f.shape == (1024,):
                chr_col = chr_f.reshape(1024, 1)
            elif chr_f.shape == (1024, 1):
                chr_col = chr_f
            else:
                raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).")

            frame_f = np.empty((1024, 2), dtype=np.float64)
            frame_f[:, 0] = chl_col[:, 0]
            frame_f[:, 1] = chr_col[:, 0]

        frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type)

        start = i * hop
        y_pad[start : start + win, :] += frame_t_hat

    y = aac_remove_padding(y_pad, hop=hop)

    sf.write(str(filename_out), y, 48000)
    return y