#! /usr/bin/env python
from __future__ import annotations

from functools import lru_cache
from pathlib import Path
from typing import Dict, Tuple, List, Literal, TypedDict, Union

import numpy as np
from scipy.signal import lfilter
from scipy.signal.windows import kaiser

# NOTE: soundfile is imported lazily inside the I/O functions so the pure-DSP
# core (SSC / filterbank) stays importable without the optional dependency.

# --------------------------------
# Public Type aliases (Level 1)
# --------------------------------

FrameType = Literal["OLS", "LSS", "ESH", "LPS"]
"""
Frame type codes:
- "OLS": ONLY_LONG_SEQUENCE
- "LSS": LONG_START_SEQUENCE
- "ESH": EIGHT_SHORT_SEQUENCE
- "LPS": LONG_STOP_SEQUENCE
"""

WinType = Literal["KBD", "SIN"]
"""
Window type codes:
- "KBD": Kaiser-Bessel-Derived
- "SIN": sinusoid
"""

FrameT = np.ndarray
"""
Time-domain frame.
Expected shape: (2048, 2) for stereo (two channels).
dtype: float (e.g., float32/float64).
"""

FrameChannelT = np.ndarray
"""
Time-domain single channel frame.
Expected shape: (2048,).
dtype: float (e.g., float32/float64).
"""

FrameF = np.ndarray
"""
Frequency-domain frame (MDCT coefficients).
As per spec (Level 1):
- If frame_type in {"OLS","LSS","LPS"}: shape (1024, 2)
- If frame_type == "ESH": shape (128, 16) where 8 subframes x 2 channels
  are placed in columns according to the subframe order
  (i.e., each subframe is (128,2)).
"""

ChannelKey = Literal["chl", "chr"]


class AACChannelFrameF(TypedDict):
    """Channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1)."""

    # frame_F for one channel:
    # - ESH: shape (128, 8)
    # - else: shape (1024, 1)
    frame_F: np.ndarray


class AACSeq1Frame(TypedDict):
    """One frame dictionary of aac_seq_1 (Level 1)."""

    frame_type: FrameType
    win_type: WinType
    chl: AACChannelFrameF
    chr: AACChannelFrameF


AACSeq1 = List[AACSeq1Frame]
"""AAC sequence for Level 1: List of length K (K = number of frames).
Each element is a dict with keys:
- "frame_type", "win_type", "chl", "chr"
"""

# Global Options
# -----------------------------------------------------------------------------

# Window type
# Options: "SIN", "KBD"
WIN_TYPE: WinType = "SIN"

# Private helpers for SSC
# -----------------------------------------------------------------------------

# See Table 1 in mm-2025-hw-v0.1.pdf
STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = {
    ("OLS", "OLS"): "OLS",
    ("OLS", "LSS"): "LSS",
    ("OLS", "ESH"): "ESH",
    ("OLS", "LPS"): "LPS",
    ("LSS", "OLS"): "LSS",
    ("LSS", "LSS"): "LSS",
    ("LSS", "ESH"): "ESH",
    ("LSS", "LPS"): "ESH",
    ("ESH", "OLS"): "ESH",
    ("ESH", "LSS"): "ESH",
    ("ESH", "ESH"): "ESH",
    ("ESH", "LPS"): "ESH",
    ("LPS", "OLS"): "LPS",
    ("LPS", "LSS"): "ESH",
    ("LPS", "ESH"): "ESH",
    ("LPS", "LPS"): "LPS",
}


def _detect_attack(next_frame_channel: FrameChannelT) -> bool:
    """
    Detect if next frame (single channel) implies ESH according to the
    spec's attack criterion.

    Parameters
    ----------
    next_frame_channel : FrameChannelT
        One channel of next_frame_T (shape: (2048,), dtype float).

    Returns
    -------
    attack : bool
        True if an attack is detected (=> next frame predicted ESH),
        else False.

    Notes
    -----
    The spec describes:
    - High-pass filter applied to next_frame_channel
    - Split into 16 segments of length 128
    - Compute segment energies s(l)
    - Compute ds(l) = s(l) / s(l-1)
    - Attack exists if there exists l in {1..7} such that:
        s(l) > 1e-3 and ds(l) > 10
    """
    # Force double precision so the segment energies are computed
    # identically regardless of the caller's input dtype.
    x = np.asarray(next_frame_channel, dtype=np.float64)

    # High-pass filter H(z) = (1 - z^-1) / (1 - 0.5 z^-1), zero initial state.
    # Difference equation: y[n] = x[n] - x[n-1] + 0.5*y[n-1]
    y = lfilter([1.0, -1.0], [1.0, -0.5], x)

    # Segment energies over 16 blocks of 128 samples
    # (raises ValueError if the input is not 2048 samples long).
    s = np.sum(y.reshape(16, 128) ** 2, axis=1)

    # Spec: check l in {1..7}; ds(l) = s(l)/s(l-1).
    eps = 1e-12  # avoid division by zero without changing logic materially
    for l in range(1, 8):
        ds_l = s[l] / max(s[l - 1], eps)
        if (s[l] > 1e-3) and (ds_l > 10.0):
            return True
    return False


def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType:
    """
    Decide current frame type for a single channel based on prev_frame_type
    and next-frame attack.

    Parameters
    ----------
    prev_frame_type : FrameType
        Previous frame type (one of "OLS","LSS","ESH","LPS").
    attack : bool
        Whether next frame is predicted ESH for this channel.

    Returns
    -------
    frame_type : FrameType
        The per-channel decision for the current frame.

    Rules (spec)
    ------------
    - If prev is "LSS" => current is "ESH" (fixed)
    - If prev is "LPS" => current is "OLS" (fixed)
    - If prev is "OLS" => current is "LSS" if attack else "OLS"
    - If prev is "ESH" => current is "ESH" if attack else "LPS"
    """
    if prev_frame_type == "LSS":
        return "ESH"
    if prev_frame_type == "LPS":
        return "OLS"
    if prev_frame_type == "OLS":
        return "LSS" if attack else "OLS"
    if prev_frame_type == "ESH":
        return "ESH" if attack else "LPS"
    raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}")


def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType:
    """
    Merge per-channel frame types into one common frame type using the
    spec table.

    Parameters
    ----------
    ft_l : FrameType
        Frame type decision for channel 0 (left).
    ft_r : FrameType
        Frame type decision for channel 1 (right).

    Returns
    -------
    common : FrameType
        The common final frame type.
    """
    try:
        return STEREO_MERGE_TABLE[(ft_l, ft_r)]
    except KeyError as e:
        raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}") from e


# Private helpers for Filterbank
# -----------------------------------------------------------------------------


def _sin_window(N: int) -> np.ndarray:
    """
    Sine window (full length N).

    w[n] = sin(pi/N * (n + 0.5)), 0 <= n < N
    """
    n = np.arange(N, dtype=np.float64)
    return np.sin((np.pi / N) * (n + 0.5))


def _kbd_window(N: int, alpha: float) -> np.ndarray:
    """
    Kaiser-Bessel-Derived (KBD) window (full length N).

    This follows the standard KBD construction:
    - Build Kaiser kernel of length N/2 + 1
    - Use cumulative sum and sqrt normalization to form left and right halves
    """
    half = N // 2
    # Kaiser kernel length: half + 1 samples (0 .. half)
    # beta = pi * alpha per the usual correspondence with the ISO definition
    kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64)
    csum = np.cumsum(kernel)
    denom = csum[-1]
    w_left = np.sqrt(csum[:-1] / denom)  # length half, n = 0 .. half-1
    w_right = w_left[::-1]               # mirror for second half
    return np.concatenate([w_left, w_right])


def _long_window(win_type: WinType) -> np.ndarray:
    """
    Long window (length 2048) for the selected win_type.
    """
    if win_type == "SIN":
        return _sin_window(2048)
    if win_type == "KBD":
        # Assignment-specific alpha values
        return _kbd_window(2048, alpha=6.0)
    raise ValueError(f"Invalid win_type: {win_type!r}")


def _short_window(win_type: WinType) -> np.ndarray:
    """
    Short window (length 256) for the selected win_type.
    """
    if win_type == "SIN":
        return _sin_window(256)
    if win_type == "KBD":
        # Assignment-specific alpha values
        return _kbd_window(256, alpha=4.0)
    raise ValueError(f"Invalid win_type: {win_type!r}")


def _window_sequence(frame_type: FrameType, win_type: WinType) -> np.ndarray:
    """
    Build the 2048-sample window sequence for OLS/LSS/LPS.

    We follow the simplified assumption:
    - The same window shape (KBD or SIN) is used globally (no mixed halves).
    - Therefore, the left and right halves are drawn from the same family.
    """
    wL = _long_window(win_type)    # length 2048
    wS = _short_window(win_type)   # length 256

    if frame_type == "OLS":
        return wL
    if frame_type == "LSS":
        # 0..1023:    left half of long window
        # 1024..1471: ones (448 samples)
        # 1472..1599: right half of short window (128 samples)
        # 1600..2047: zeros (448 samples)
        out = np.zeros(2048, dtype=np.float64)
        out[0:1024] = wL[0:1024]
        out[1024:1472] = 1.0
        out[1472:1600] = wS[128:256]
        return out
    if frame_type == "LPS":
        # 0..447:     zeros (448)
        # 448..575:   left half of short window (128)
        # 576..1023:  ones (448)
        # 1024..2047: right half of long window (1024)
        out = np.zeros(2048, dtype=np.float64)
        out[448:576] = wS[0:128]
        out[576:1024] = 1.0
        out[1024:2048] = wL[1024:2048]
        return out
    raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}")


@lru_cache(maxsize=4)
def _mdct_matrix(N: int) -> np.ndarray:
    """
    Cosine basis matrix of shape (N, N/2) shared by MDCT and IMDCT.

    C[n, k] = cos(2*pi/N * (n + n0) * (k + 1/2)), n0 = (N/2 + 1)/2

    Cached because only N in {256, 2048} occurs and the matrix is reused for
    every frame/channel/subwindow.
    """
    n0 = (N / 2.0 + 1.0) / 2.0
    n = np.arange(N, dtype=np.float64) + n0
    k = np.arange(N // 2, dtype=np.float64) + 0.5
    return np.cos((2.0 * np.pi / N) * np.outer(n, k))


def _mdct(s: np.ndarray) -> np.ndarray:
    """
    MDCT (direct form) as given in the assignment.

    Input:  s: windowed time samples of length N (N = 2048 or 256)
    Output: X: MDCT coefficients of length N/2

    Definition:
        X[k] = 2 * sum_{n=0 .. N-1} s[n] * cos(2*pi/N * (n + n0) * (k + 1/2))
        where n0 = (N/2 + 1)/2
    """
    s = np.asarray(s, dtype=np.float64)
    N = int(s.shape[0])
    if N not in (2048, 256):
        raise ValueError("MDCT input length must be 2048 or 256.")
    return 2.0 * (s @ _mdct_matrix(N))


def _imdct(X: np.ndarray) -> np.ndarray:
    """
    IMDCT (direct form) as given in the assignment.

    Input:  X: MDCT coefficients of length N/2 (N = 2048 or 256)
    Output: s: time samples of length N

    Definition:
        s[n] = (2/N) * sum_{k=0 .. N/2-1} X[k] * cos(2*pi/N * (n + n0) * (k + 1/2))
        where n0 = (N/2 + 1)/2
    """
    X = np.asarray(X, dtype=np.float64).reshape(-1)
    K = int(X.shape[0])
    if K not in (1024, 128):
        raise ValueError("IMDCT input length must be 1024 or 128.")
    N = 2 * K
    return (2.0 / N) * (_mdct_matrix(N) @ X)


def _filter_bank_esh_channel(x_ch: np.ndarray, win_type: WinType) -> np.ndarray:
    """
    ESH analysis for one channel.

    Returns:
        X_esh: shape (128, 8), where each column is the 128 MDCT coeffs
        of one short window.
    """
    wS = _short_window(win_type)
    X_esh = np.empty((128, 8), dtype=np.float64)
    # ESH subwindows are taken from the central region:
    # start positions: 448 + 128*j, j = 0..7
    for j in range(8):
        start = 448 + 128 * j
        X_esh[:, j] = _mdct(x_ch[start:start + 256] * wS)
    return X_esh


def _unpack_esh(frame_F: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8).

    Mapping is the inverse of the packing used in filter_bank():
        out[:, 2*j]   = left[:, j]
        out[:, 2*j+1] = right[:, j]
    """
    if frame_F.shape != (128, 16):
        raise ValueError("ESH frame_F must have shape (128, 16).")
    # Even columns are the left channel, odd columns the right channel.
    left = np.ascontiguousarray(frame_F[:, 0::2], dtype=np.float64)
    right = np.ascontiguousarray(frame_F[:, 1::2], dtype=np.float64)
    return left, right


def _i_filter_bank_esh_channel(X_esh: np.ndarray, win_type: WinType) -> np.ndarray:
    """
    ESH synthesis for one channel.

    Input:
        X_esh: (128, 8) MDCT coeffs for 8 short windows
    Output:
        x_ch: (2048,) time-domain frame contribution (windowed),
        ready for OLA at the caller level.
    """
    if X_esh.shape != (128, 8):
        raise ValueError("X_esh must have shape (128, 8).")
    wS = _short_window(win_type)
    out = np.zeros(2048, dtype=np.float64)
    # Each short IMDCT returns 256 samples. Place them at:
    # start = 448 + 128*j, j=0..7 (50% overlap)
    for j in range(8):
        seg = _imdct(X_esh[:, j]) * wS  # (256,)
        start = 448 + 128 * j
        out[start:start + 256] += seg
    return out


# -----------------------------------------------------------------------------
# Public Function prototypes (Level 1)
# -----------------------------------------------------------------------------


def SSC(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType:
    """
    Sequence Segmentation Control (SSC).
    Selects and returns the frame type for the current frame (i) based on
    input parameters.

    Parameters
    ----------
    frame_T : FrameT
        current time-domain frame i, stereo, shape (2048, 2)
    next_frame_T : FrameT
        next time-domain frame (i+1), stereo, shape (2048, 2)
        (used to decide transitions to/from ESH)
    prev_frame_type : FrameType
        frame type chosen for the previous frame (i-1)

    Returns
    -------
    frame_type : FrameType
        - "OLS" (ONLY_LONG_SEQUENCE)
        - "LSS" (LONG_START_SEQUENCE)
        - "ESH" (EIGHT_SHORT_SEQUENCE)
        - "LPS" (LONG_STOP_SEQUENCE)
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")
    if next_frame_T.shape != (2048, 2):
        raise ValueError("next_frame_T must have shape (2048, 2).")

    # Detect attack independently per channel on next frame.
    attack_l = _detect_attack(next_frame_T[:, 0])
    attack_r = _detect_attack(next_frame_T[:, 1])

    # Decide per-channel type based on shared prev_frame_type.
    ft_l = _decide_frame_type(prev_frame_type, attack_l)
    ft_r = _decide_frame_type(prev_frame_type, attack_r)

    # Stereo merge as per Table 1.
    return _stereo_merge(ft_l, ft_r)


def filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF:
    """
    Filterbank stage (MDCT analysis).

    Parameters
    ----------
    frame_T : FrameT
        Time-domain frame, stereo, shape (2048, 2).
    frame_type : FrameType
        Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN") used for the current frame.

    Returns
    -------
    frame_F : FrameF
        Frequency-domain MDCT coefficients:
        - If frame_type in {"OLS","LSS","LPS"}: array shape (1024, 2)
          containing MDCT coefficients for both channels.
        - If frame_type == "ESH": contains 8 subframes, each subframe
          has shape (128,2), placed in columns according to subframe
          order, i.e. overall shape (128, 16).
    """
    if frame_T.shape != (2048, 2):
        raise ValueError("frame_T must have shape (2048, 2).")

    xL = frame_T[:, 0].astype(np.float64, copy=False)
    xR = frame_T[:, 1].astype(np.float64, copy=False)

    if frame_type in ("OLS", "LSS", "LPS"):
        w = _window_sequence(frame_type, win_type)  # length 2048
        out = np.empty((1024, 2), dtype=np.float64)
        out[:, 0] = _mdct(xL * w)
        out[:, 1] = _mdct(xR * w)
        return out

    if frame_type == "ESH":
        # Pack into (128, 16): each subframe as (128, 2) placed in columns,
        # i.e. columns alternate [L0 R0 L1 R1 ... L7 R7].
        out = np.empty((128, 16), dtype=np.float64)
        out[:, 0::2] = _filter_bank_esh_channel(xL, win_type)  # (128, 8)
        out[:, 1::2] = _filter_bank_esh_channel(xR, win_type)  # (128, 8)
        return out

    raise ValueError(f"Invalid frame_type: {frame_type!r}")


def i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT:
    """
    Inverse filterbank (IMDCT synthesis).

    Parameters
    ----------
    frame_F : FrameF
        Frequency-domain MDCT coefficients as produced by filter_bank().
    frame_type : FrameType
        Frame type ("OLS"|"LSS"|"ESH"|"LPS").
    win_type : WinType
        Window type ("KBD" or "SIN").

    Returns
    -------
    frame_T : FrameT
        Reconstructed time-domain frame, stereo, shape (2048, 2).
        Note: frames are windowed contributions; the caller performs
        overlap-add across consecutive frames.
    """
    if frame_type in ("OLS", "LSS", "LPS"):
        if frame_F.shape != (1024, 2):
            raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).")
        w = _window_sequence(frame_type, win_type)
        out = np.empty((2048, 2), dtype=np.float64)
        out[:, 0] = _imdct(frame_F[:, 0]) * w
        out[:, 1] = _imdct(frame_F[:, 1]) * w
        return out

    if frame_type == "ESH":
        if frame_F.shape != (128, 16):
            raise ValueError("For ESH, frame_F must have shape (128, 16).")
        Xl, Xr = _unpack_esh(frame_F)
        out = np.empty((2048, 2), dtype=np.float64)
        out[:, 0] = _i_filter_bank_esh_channel(Xl, win_type)
        out[:, 1] = _i_filter_bank_esh_channel(Xr, win_type)
        return out

    raise ValueError(f"Invalid frame_type: {frame_type!r}")


def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1:
    """
    Level-1 AAC encoder.

    Parameters
    ----------
    filename_in : str | Path
        Input WAV filename.
        Assumption: stereo audio, sampling rate 48 kHz.

    Returns
    -------
    aac_seq_1 : AACSeq1
        List of K encoded frames. For each i:
        - aac_seq_1[i]["frame_type"]: FrameType
        - aac_seq_1[i]["win_type"]: WinType
        - aac_seq_1[i]["chl"]["frame_F"]:
            - ESH: shape (128, 8)
            - else: shape (1024, 1)
        - aac_seq_1[i]["chr"]["frame_F"]:
            - ESH: shape (128, 8)
            - else: shape (1024, 1)
    """
    import soundfile as sf  # lazy: optional I/O dependency

    filename_in = Path(filename_in)
    x, fs = sf.read(str(filename_in), always_2d=True)
    x = np.asarray(x, dtype=np.float64)
    if x.shape[1] != 2:
        raise ValueError("Input must be stereo (2 channels).")
    if fs != 48000:
        raise ValueError("Input sampling rate must be 48 kHz.")

    hop = 1024
    win = 2048

    # Pad at the beginning to support the first overlap region.
    # Tail padding is kept minimal; next-frame is padded on-the-fly when needed.
    pad_pre = np.zeros((hop, 2), dtype=np.float64)
    pad_post = np.zeros((hop, 2), dtype=np.float64)
    x_pad = np.vstack([pad_pre, x, pad_post])

    # Number of frames such that current frame fits; next frame will be
    # padded if needed.
    K = int((x_pad.shape[0] - win) // hop + 1)
    if K <= 0:
        raise ValueError("Input too short for framing.")

    aac_seq: AACSeq1 = []
    prev_frame_type: FrameType = "OLS"

    for i in range(K):
        start = i * hop
        frame_t: FrameT = x_pad[start:start + win, :]
        if frame_t.shape != (win, 2):
            # This should not happen due to K definition, but we keep it explicit.
            raise ValueError("Internal framing error: frame_t has wrong shape.")

        next_t = x_pad[start + hop:start + hop + win, :]
        # Ensure next_t is always (2048, 2) by zero-padding at the tail.
        if next_t.shape[0] < win:
            tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64)
            next_t = np.vstack([next_t, tail])

        frame_type = SSC(frame_t, next_t, prev_frame_type)
        frame_f = filter_bank(frame_t, frame_type, WIN_TYPE)

        # Store per-channel as required by AACSeq1 schema
        if frame_type == "ESH":
            # frame_f: (128, 16) packed as [L0 R0 L1 R1 ... L7 R7]
            chl_f = np.ascontiguousarray(frame_f[:, 0::2])
            chr_f = np.ascontiguousarray(frame_f[:, 1::2])
        else:
            # frame_f: (1024, 2)
            chl_f = frame_f[:, 0:1].astype(np.float64, copy=False)
            chr_f = frame_f[:, 1:2].astype(np.float64, copy=False)

        aac_seq.append({
            "frame_type": frame_type,
            "win_type": WIN_TYPE,
            "chl": {"frame_F": chl_f},
            "chr": {"frame_F": chr_f},
        })
        prev_frame_type = frame_type

    return aac_seq


def i_aac_coder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> np.ndarray:
    """
    Level-1 AAC decoder (inverse of aac_coder_1()).

    Parameters
    ----------
    aac_seq_1 : AACSeq1
        Encoded sequence as produced by aac_coder_1().
    filename_out : str | Path
        Output WAV filename.
        Assumption: stereo audio, sampling rate 48 kHz.

    Returns
    -------
    x : np.ndarray
        Decoded audio samples (time-domain).
        Expected shape: (N, 2) for stereo (N depends on input length).
    """
    import soundfile as sf  # lazy: optional I/O dependency

    filename_out = Path(filename_out)
    hop = 1024
    win = 2048
    K = len(aac_seq_1)

    # Output includes the encoder padding region, so we reconstruct the full
    # padded stream. For K frames: last frame starts at (K-1)*hop and spans
    # win, so total length = (K-1)*hop + win
    n_pad = (K - 1) * hop + win
    y_pad = np.zeros((n_pad, 2), dtype=np.float64)

    for i, fr in enumerate(aac_seq_1):
        frame_type = fr["frame_type"]
        win_type = fr["win_type"]
        chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64)
        chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64)

        # Re-pack into the format expected by i_filter_bank()
        if frame_type == "ESH":
            if chl_f.shape != (128, 8) or chr_f.shape != (128, 8):
                raise ValueError("ESH channel frame_F must have shape (128, 8).")
            frame_f = np.empty((128, 16), dtype=np.float64)
            frame_f[:, 0::2] = chl_f
            frame_f[:, 1::2] = chr_f
        else:
            if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1):
                raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).")
            frame_f = np.empty((1024, 2), dtype=np.float64)
            frame_f[:, 0] = chl_f[:, 0]
            frame_f[:, 1] = chr_f[:, 0]

        frame_t_hat = i_filter_bank(frame_f, frame_type, win_type)  # (2048, 2)
        start = i * hop
        # Overlap-add of the windowed frame contributions.
        y_pad[start:start + win, :] += frame_t_hat

    # Remove boundary padding that encoder adds: hop samples at start and
    # hop at end.
    if y_pad.shape[0] < 2 * hop:
        raise ValueError("Decoded stream too short to unpad.")
    y = y_pad[hop:-hop, :]

    sf.write(str(filename_out), y, 48000)
    return y


def demo_aac_1(filename_in: Union[str, Path], filename_out: Union[str, Path]) -> float:
    """
    Demonstration for Level-1 codec.

    Runs:
    - aac_coder_1(filename_in)
    - i_aac_coder_1(aac_seq_1, filename_out)
    and computes total SNR between original and decoded audio.

    Parameters
    ----------
    filename_in : str | Path
        Input WAV filename (stereo, 48 kHz).
    filename_out : str | Path
        Output WAV filename (stereo, 48 kHz).

    Returns
    -------
    SNR : float
        Overall Signal-to-Noise Ratio in dB.
    """
    import soundfile as sf  # lazy: optional I/O dependency

    filename_in = Path(filename_in)
    filename_out = Path(filename_out)

    # Read original audio (reference)
    x_ref, fs_ref = sf.read(str(filename_in), always_2d=True)
    x_ref = np.asarray(x_ref, dtype=np.float64)

    # Encode / decode
    aac_seq_1 = aac_coder_1(filename_in)
    x_hat = i_aac_coder_1(aac_seq_1, filename_out)
    x_hat = np.asarray(x_hat, dtype=np.float64)

    # Ensure 2D stereo shape (N, 2)
    if x_hat.ndim == 1:
        x_hat = x_hat.reshape(-1, 1)
    if x_ref.ndim == 1:
        x_ref = x_ref.reshape(-1, 1)

    # Align lengths (use common overlap)
    n = min(x_ref.shape[0], x_hat.shape[0])
    x_ref = x_ref[:n, :]
    x_hat = x_hat[:n, :]

    # Match channel count conservatively (common channels)
    c = min(x_ref.shape[1], x_hat.shape[1])
    x_ref = x_ref[:, :c]
    x_hat = x_hat[:, :c]

    # Compute overall SNR over all samples and channels
    err = x_ref - x_hat
    p_signal = float(np.sum(x_ref * x_ref))
    p_noise = float(np.sum(err * err))
    if p_noise <= 0.0:
        return float("inf")
    if p_signal <= 0.0:
        # Degenerate case: silent input
        return -float("inf")
    snr_db = 10.0 * np.log10(p_signal / p_noise)
    return float(snr_db)


if __name__ == "__main__":
    # Example usage:
    #   python -m level_1.level_1 input.wav output.wav
    import sys

    if len(sys.argv) != 3:
        raise SystemExit("Usage: python -m level_1.level_1 <input.wav> <output.wav>")
    in_wav = sys.argv[1]
    out_wav = sys.argv[2]
    print(f"Encoding/Decoding {in_wav} to {out_wav}")
    snr = demo_aac_1(in_wav, out_wav)
    print(f"SNR = {snr:.3f} dB")