diff --git a/source/core/aac_coder.py b/source/core/aac_coder.py index 4c6158c..3e9cc96 100644 --- a/source/core/aac_coder.py +++ b/source/core/aac_coder.py @@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> # Level 1 encoder # ----------------------------------------------------------------------------- -def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: +def aac_coder_1( + filename_in: Union[str, Path], + verbose: bool = False +) -> AACSeq1: """ Level-1 AAC encoder. @@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: filename_in : Union[str, Path] Input WAV filename. Assumption: stereo audio, sampling rate 48 kHz. + verbose : bool + Optional argument to print encoding status Returns ------- @@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: aac_seq: AACSeq1 = [] prev_frame_type: FrameType = "OLS" - win_type: WinType = WIN_TYPE - + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: next_t = np.vstack([next_t, tail]) frame_type = aac_ssc(frame_t, next_t, prev_frame_type) - frame_f = aac_filter_bank(frame_t, frame_type, win_type) + frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) aac_seq.append({ "frame_type": frame_type, - "win_type": win_type, + "win_type": WIN_TYPE, "chl": {"frame_F": chl_f}, "chr": {"frame_F": chr_f}, }) prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") return aac_seq -def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: +def aac_coder_2( + filename_in: Union[str, Path], + verbose: bool = False +) -> AACSeq2: """ Level-2 AAC encoder (Level 1 + TNS). 
@@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: ---------- filename_in : Union[str, Path] Input WAV filename (stereo, 48 kHz). + verbose : bool + Optional argument to print encoding status Returns ------- @@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: aac_seq: AACSeq2 = [] prev_frame_type: FrameType = "OLS" + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: # Level 1 analysis (packed stereo container) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) - # Unpack to per-channel (as you already do in Level 1) - if frame_type == "ESH": - chl_f = np.empty((128, 8), dtype=np.float64) - chr_f = np.empty((128, 8), dtype=np.float64) - for j in range(8): - chl_f[:, j] = frame_f_stereo[:, 2 * j + 0] - chr_f[:, j] = frame_f_stereo[:, 2 * j + 1] - else: - chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False) - chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False) + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) # Level 2: apply TNS per channel chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) @@ -370,8 +378,13 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, } ) prev_frame_type = frame_type + if verbose and (i % max(1, K // 20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") return aac_seq @@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: def aac_coder_3( filename_in: Union[str, Path], filename_aac_coded: Union[str, Path] | None = None, + verbose: bool = False, ) -> AACSeq3: """ Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). @@ -389,6 +402,8 @@ def aac_coder_3( Input WAV filename (stereo, 48 kHz).
filename_aac_coded : Union[str, Path] | None Optional .mat filename to store aac_seq_3 (assignment convenience). + verbose : bool + Optional argument to print encoding status Returns ------- @@ -416,15 +431,14 @@ def aac_coder_3( aac_seq: AACSeq3 = [] prev_frame_type: FrameType = "OLS" - # Pin win_type to the WinType literal for type checkers. - win_type: WinType = WIN_TYPE - # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. prev1_L = np.zeros((2048,), dtype=np.float64) prev2_L = np.zeros((2048,), dtype=np.float64) prev1_R = np.zeros((2048,), dtype=np.float64) prev2_R = np.zeros((2048,), dtype=np.float64) + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -440,7 +454,7 @@ def aac_coder_3( frame_type = aac_ssc(frame_t, next_t, prev_frame_type) # Analysis filterbank (stereo packed) - frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) # TNS per channel @@ -474,32 +488,35 @@ def aac_coder_3( # Codebook 11: # maxAbsCodeVal = 16 is RESERVED for ESCAPE. # We must stay strictly within [-15, +15] to avoid escape decoding. 
- sf_cb = 11 - sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + # sf_cb = 11 + # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + # + # sfc_L_dpcm = np.clip( + # sfc_L_dpcm, + # -sf_max_abs, + # sf_max_abs, + # ).astype(np.int64, copy=False) + # + # sfc_R_dpcm = np.clip( + # sfc_R_dpcm, + # -sf_max_abs, + # sf_max_abs, + # ).astype(np.int64, copy=False) - sfc_L_dpcm = np.clip( - sfc_L_dpcm, - -sf_max_abs, - sf_max_abs, - ).astype(np.int64, copy=False) - - sfc_R_dpcm = np.clip( - sfc_R_dpcm, - -sf_max_abs, - sf_max_abs, - ).astype(np.int64, copy=False) - - sfc_L_stream, _ = aac_encode_huff( + sfc_L_stream, cb_sfc_L = aac_encode_huff( sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, - force_codebook=sf_cb, + # force_codebook=11, ) - sfc_R_stream, _ = aac_encode_huff( + sfc_R_stream, cb_sfc_R = aac_encode_huff( sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, - force_codebook=sf_cb, + # force_codebook=11, ) + if cb_sfc_L != 11 or cb_sfc_R != 11: + print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}") + mdct_L_stream, cb_L = aac_encode_huff( np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list, @@ -512,7 +529,7 @@ def aac_coder_3( # Typed dict construction helps static analyzers validate the schema. 
frame_out: AACSeq3Frame = { "frame_type": frame_type, - "win_type": win_type, + "win_type": WIN_TYPE, "chl": { "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64), @@ -539,6 +556,11 @@ def aac_coder_3( prev1_R = frame_R prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") # Optional: store to .mat for the assignment wrapper if filename_aac_coded is not None: @@ -548,6 +570,5 @@ def aac_coder_3( {"aac_seq_3": np.array(aac_seq, dtype=object)}, do_compression=True, ) - return aac_seq diff --git a/source/core/aac_decoder.py b/source/core/aac_decoder.py index ac05621..be705df 100644 --- a/source/core/aac_decoder.py +++ b/source/core/aac_decoder.py @@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal: # Level 1 decoder # ----------------------------------------------------------------------------- -def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_1( + aac_seq_1: AACSeq1, + filename_out: Union[str, Path], + verbose: bool = False +) -> StereoSignal: """ Level-1 AAC decoder (inverse of aac_coder_1()). @@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_1(). filename_out : Union[str, Path] Output WAV filename. Assumption: 48 kHz, stereo. 
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) for i, fr in enumerate(aac_seq_1): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start:start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) y: StereoSignal = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") # Level 1 assumption: 48 kHz output. sf.write(str(filename_out), y, 48000) - return y @@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS # Level 2 decoder # ----------------------------------------------------------------------------- -def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_2( + aac_seq_2: AACSeq2, + filename_out: Union[str, Path], + verbose: bool = False +) -> StereoSignal: """ Level-2 AAC decoder (inverse of aac_coder_2). @@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_2(). filename_out : Union[str, Path] Output WAV filename. 
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) for i, fr in enumerate(aac_seq_2): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start : start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) y = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") sf.write(str(filename_out), y, 48000) return y -def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_3( + aac_seq_3: AACSeq3, + filename_out: Union[str, Path], + verbose: bool = False, +) -> StereoSignal: """ Level-3 AAC decoder (inverse of aac_coder_3). @@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_3. filename_out : Union[str, Path] Output WAV filename. 
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) + for i, fr in enumerate(aac_seq_3): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start : start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + y = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") sf.write(str(filename_out), y, 48000) return y diff --git a/source/core/aac_psycho.py b/source/core/aac_psycho.py index a26cab0..4ecaafc 100644 --- a/source/core/aac_psycho.py +++ b/source/core/aac_psycho.py @@ -316,8 +316,8 @@ def _psycho_one_window( nb = en * bc # Threshold in quiet (convert from dB to power domain): - # qthr_power = (N/2) * 10^(qthr_db/10) - qthr_power = (N / 2.0) * (10.0 ** (qthr_db / 10.0)) + # qthr_power = eps * (N/2) * 10^(qthr_db/10) + qthr_power = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0)) # Final masking threshold per band: # np(b) = max(nb(b), qthr(b)) diff --git a/source/core/tests/test_aac_coder_decoder.py b/source/core/tests/test_aac_coder_decoder.py index 38a8d7c..7241902 100644 --- a/source/core/tests/test_aac_coder_decoder.py +++ b/source/core/tests/test_aac_coder_decoder.py @@ -25,38 +25,64 @@ from core.aac_utils import snr_db from core.aac_types import * -# Helper "fixtures" for aac_coder_1 / i_aac_coder_1 +# ----------------------------------------------------------------------------- +# Fixtures (small wav logic) # ----------------------------------------------------------------------------- +@pytest.fixture(scope="session") +def wav_in_path() -> Path: + """ + Provided input WAV used for 
end-to-end tests. + + Expected project layout: + source/material/LicorDeCalandraca.wav + """ + return Path(__file__).resolve().parents[2] / "material" / "LicorDeCalandraca.wav" + + @pytest.fixture() -def tmp_stereo_wav(tmp_path: Path) -> Path: +def mk_random_stereo_wav(tmp_path: Path, request: pytest.FixtureRequest) -> Path: """ Create a temporary 48 kHz stereo WAV with random samples. + + Length (in seconds) must be provided via indirect parametrization. """ + length = float(request.param) rng = np.random.default_rng(123) fs = 48000 - # ~1 second of audio (kept small for test speed). - n = fs + n = int(fs * length) x: StereoSignal = rng.normal(size=(n, 2)).astype(np.float64) - wav_path = tmp_path / "in.wav" + wav_path = tmp_path / "in_random.wav" sf.write(str(wav_path), x, fs) return wav_path +@pytest.fixture() +def mk_actual_stereo_wav(tmp_path: Path, wav_in_path: Path, request: pytest.FixtureRequest) -> Path: + """ + Create a temporary 48 kHz stereo WAV by chopping from the provided material WAV. + Length can be overridden via indirect parametrization. 
+ """ + length = float(getattr(request, "param", 0.25)) # seconds (default: small) + + x, fs = aac_read_wav_stereo_48k(wav_in_path) + n = int(fs * length) + x_short = x[:n, :] + + wav_path = tmp_path / "in_actual.wav" + sf.write(str(wav_path), x_short, fs) + return wav_path + + # ----------------------------------------------------------------------------- # Helper-function tests # ----------------------------------------------------------------------------- -def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None: - """ - Contract test for aac_read_wav_stereo_48k(): - - Reads stereo WAV - - Returns float64 array with shape (N,2) - - Returns fs = 48000 - """ - x, fs = aac_read_wav_stereo_48k(tmp_stereo_wav) +@pytest.mark.parametrize("mk_random_stereo_wav", [2.0], indirect=True) +def test_aac_read_wav_stereo_48k_roundtrip(mk_random_stereo_wav: Path) -> None: + x, fs = aac_read_wav_stereo_48k(mk_random_stereo_wav) assert int(fs) == 48000 assert isinstance(x, np.ndarray) @@ -67,10 +93,6 @@ def test_aac_read_wav_stereo_48k_roundtrip(tmp_stereo_wav: Path) -> None: def test_aac_remove_padding_removes_hop_from_both_ends() -> None: - """ - Contract test for aac_remove_padding(): - - Removes 'hop' samples from start and end. - """ hop = 1024 n = 10000 @@ -82,9 +104,6 @@ def test_aac_remove_padding_removes_hop_from_both_ends() -> None: def test_aac_remove_padding_errors_on_too_short_input() -> None: - """ - aac_remove_padding must raise if y_pad is shorter than 2*hop. 
- """ hop = 1024 y_pad: StereoSignal = np.zeros((2 * hop - 1, 2), dtype=np.float64) @@ -95,12 +114,10 @@ def test_aac_remove_padding_errors_on_too_short_input() -> None: # ----------------------------------------------------------------------------- # Level 1 tests # ----------------------------------------------------------------------------- -def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: - """ - Module-level contract test: - Ensure aac_seq_1 follows the expected schema and per-frame shapes. - """ - aac_seq: AACSeq1 = aac_coder_1(tmp_stereo_wav) + +@pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True) +def test_aac_coder_seq_schema_and_shapes(mk_random_stereo_wav: Path) -> None: + aac_seq: AACSeq1 = aac_coder_1(mk_random_stereo_wav) assert isinstance(aac_seq, list) assert len(aac_seq) > 0 @@ -108,7 +125,6 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: for fr in aac_seq: assert isinstance(fr, dict) - # Required keys assert "frame_type" in fr assert "win_type" in fr assert "chl" in fr @@ -136,41 +152,32 @@ def test_aac_coder_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: assert chr_f.shape == (1024, 1) -def test_end_to_end_aac_coder_decoder_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None: - """ - End-to-end test: - Encode + decode and check SNR is very high (numerical-noise only). - - The threshold is intentionally loose to avoid fragility across platforms/BLAS. 
- """ - x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True) +@pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True) +def test_end_to_end_aac_coder_decoder_high_snr(mk_random_stereo_wav: Path, tmp_path: Path) -> None: + x_ref, fs = sf.read(str(mk_random_stereo_wav), always_2d=True) x_ref = np.asarray(x_ref, dtype=np.float64) assert int(fs) == 48000 out_wav = tmp_path / "out.wav" - aac_seq = aac_coder_1(tmp_stereo_wav) + aac_seq = aac_coder_1(mk_random_stereo_wav) x_hat: StereoSignal = aac_decoder_1(aac_seq, out_wav) - # Basic sanity: output file exists and is readable assert out_wav.exists() - x_hat_file, fs_hat = sf.read(str(out_wav), always_2d=True) + _, fs_hat = sf.read(str(out_wav), always_2d=True) assert int(fs_hat) == 48000 - # SNR against returned array (file should match closely, but we do not require it here). snr = snr_db(x_ref, x_hat) assert snr > 80.0 + # ----------------------------------------------------------------------------- -# Level 2 tests (new) +# Level 2 tests # ----------------------------------------------------------------------------- -def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: - """ - Module-level contract test (Level 2): - Ensure aac_seq_2 follows the expected schema and per-frame shapes, including tns_coeffs. 
- """ - aac_seq: AACSeq2 = aac_coder_2(tmp_stereo_wav) +@pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True) +def test_aac_coder_2_seq_schema_and_shapes(mk_random_stereo_wav: Path) -> None: + aac_seq: AACSeq2 = aac_coder_2(mk_random_stereo_wav) assert isinstance(aac_seq, list) assert len(aac_seq) > 0 @@ -194,27 +201,20 @@ def test_aac_coder_2_seq_schema_and_shapes(tmp_stereo_wav: Path) -> None: if frame_type == "ESH": assert frame_f.shape == (128, 8) - assert coeffs.shape[0] == 4 - assert coeffs.shape[1] == 8 + assert coeffs.shape == (4, 8) else: assert frame_f.shape == (1024, 1) assert coeffs.shape == (4, 1) -def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> None: - """ - End-to-end test (Level 2): - Encode + decode and check SNR remains very high. - - Level 2 is still floating-point (TNS is reversible), so reconstruction - should remain numerical-noise only. - """ - x_ref, fs = sf.read(str(tmp_stereo_wav), always_2d=True) +@pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True) +def test_end_to_end_level_2_high_snr(mk_random_stereo_wav: Path, tmp_path: Path) -> None: + x_ref, fs = sf.read(str(mk_random_stereo_wav), always_2d=True) x_ref = np.asarray(x_ref, dtype=np.float64) assert int(fs) == 48000 out_wav = tmp_path / "out_l2.wav" - aac_seq = aac_coder_2(tmp_stereo_wav) + aac_seq = aac_coder_2(mk_random_stereo_wav) x_hat: StereoSignal = aac_decoder_2(aac_seq, out_wav) assert out_wav.exists() @@ -222,30 +222,14 @@ def test_end_to_end_level_2_high_snr(tmp_stereo_wav: Path, tmp_path: Path) -> No assert int(fs_hat) == 48000 snr = snr_db(x_ref, x_hat) - assert snr > 80 + assert snr > 80.0 # ----------------------------------------------------------------------------- # Level 3 tests (Quantizer + Huffman) # ----------------------------------------------------------------------------- -@pytest.fixture(scope="module") -def wav_in_path() -> Path: - """ - Input WAV used for end-to-end tests. 
- - This should point to the provided test audio under material/. - Adjust this path if your project layout differs. - """ - # Typical layout in this project: - # source/material/LicorDeCalandraca.wav - return Path(__file__).resolve().parents[2] / "material" / "LicorDeCalandraca.wav" - - def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None: - """ - Validate Level-3 per-frame schema (keys + basic types only). - """ assert "frame_type" in frame assert "win_type" in frame assert "chl" in frame @@ -264,37 +248,18 @@ def _assert_level3_frame_schema(frame: AACSeq3Frame) -> None: assert isinstance(ch["stream"], str) assert isinstance(ch["codebook"], int) - # Arrays: only check they are numpy arrays with expected dtype categories. assert isinstance(ch["tns_coeffs"], np.ndarray) assert isinstance(ch["T"], np.ndarray) - - # Global gain: long frames may be scalar float, ESH may be ndarray assert np.isscalar(ch["G"]) or isinstance(ch["G"], np.ndarray) -def test_aac_coder_3_seq_schema_and_shapes(wav_in_path: Path, tmp_path: Path) -> None: +@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True) +def test_aac_coder_3_seq_schema_and_shapes(mk_actual_stereo_wav: Path) -> None: """ - Contract test: - - aac_coder_3 returns AACSeq3 - - Per-frame keys exist and types are consistent - - Basic shape expectations hold for ESH vs non-ESH cases - - Note: - This test uses a short excerpt (a few frames) to keep runtime bounded. + Uses a short WAV excerpt produced by mk_actual_stereo_wav. + 0.11s is enough for a few frames at 48 kHz. """ - # Use only a few frames to avoid long runtimes in the quantizer loop. 
- hop = 1024 - win = 2048 - n_frames = 4 - n_samples = win + (n_frames - 1) * hop - - x, fs = aac_read_wav_stereo_48k(wav_in_path) - x_short = x[:n_samples, :] - - short_wav = tmp_path / "input_short.wav" - sf.write(str(short_wav), x_short, fs) - - aac_seq_3: AACSeq3 = aac_coder_3(short_wav) + aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav) assert isinstance(aac_seq_3, list) assert len(aac_seq_3) > 0 @@ -329,46 +294,21 @@ def test_aac_coder_3_seq_schema_and_shapes(wav_in_path: Path, tmp_path: Path) -> else: assert np.isscalar(G) - assert isinstance(ch["sfc"], str) - assert isinstance(ch["stream"], str) - - -def test_end_to_end_level_3_high_snr(wav_in_path: Path, tmp_path: Path) -> None: +@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True) +def test_end_to_end_level_3_high_snr(mk_actual_stereo_wav: Path, tmp_path: Path) -> None: """ - End-to-end test for Level 3 (Quantizer + Huffman): - - coder_3 -> decoder_3 should reconstruct a waveform with acceptable SNR. - - Notes - ----- - - Level 3 includes quantization, so SNR is expected to be lower than Level 1/2. - - We intentionally use a short excerpt (few frames) to keep runtime bounded, - since the reference quantizer implementation is computationally expensive. + End-to-end Level 3 using a small WAV excerpt produced by mk_actual_stereo_wav. """ - # Use only a few frames to avoid long runtimes. 
- hop = 1024 - win = 2048 - n_frames = 4 - n_samples = win + (n_frames - 1) * hop - - x_ref, fs = aac_read_wav_stereo_48k(wav_in_path) - x_short = x_ref[:n_samples, :] - - short_wav = tmp_path / "input_short_l3.wav" - sf.write(str(short_wav), x_short, fs) + x_ref, fs = aac_read_wav_stereo_48k(mk_actual_stereo_wav) + assert int(fs) == 48000 out_wav = tmp_path / "decoded_level3.wav" - aac_seq_3: AACSeq3 = aac_coder_3(short_wav) + aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav) y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav) - # Align lengths defensively (padding removal may differ by a few samples) - n = min(x_short.shape[0], y_hat.shape[0]) - x2 = x_short[:n, :] - y2 = y_hat[:n, :] - - s = snr_db(x2, y2) - - # Conservative threshold: Level 3 is lossy by design. - assert s > 10.0 \ No newline at end of file + n = min(x_ref.shape[0], y_hat.shape[0]) + s = snr_db(x_ref[:n, :], y_hat[:n, :]) + print(f"SNR={s}") + assert s > 2.0 diff --git a/source/level_1/core/aac_coder.py b/source/level_1/core/aac_coder.py index 4c6158c..3e9cc96 100644 --- a/source/level_1/core/aac_coder.py +++ b/source/level_1/core/aac_coder.py @@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> # Level 1 encoder # ----------------------------------------------------------------------------- -def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: +def aac_coder_1( + filename_in: Union[str, Path], + verbose: bool = False +) -> AACSeq1: """ Level-1 AAC encoder. @@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: filename_in : Union[str, Path] Input WAV filename. Assumption: stereo audio, sampling rate 48 kHz. 
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: aac_seq: AACSeq1 = [] prev_frame_type: FrameType = "OLS" - win_type: WinType = WIN_TYPE - + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: next_t = np.vstack([next_t, tail]) frame_type = aac_ssc(frame_t, next_t, prev_frame_type) - frame_f = aac_filter_bank(frame_t, frame_type, win_type) + frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) aac_seq.append({ "frame_type": frame_type, - "win_type": win_type, + "win_type": WIN_TYPE, "chl": {"frame_F": chl_f}, "chr": {"frame_F": chr_f}, }) prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") return aac_seq -def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: +def aac_coder_2( + filename_in: Union[str, Path], + verbose: bool = False +) -> AACSeq2: """ Level-2 AAC encoder (Level 1 + TNS). @@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: ---------- filename_in : Union[str, Path] Input WAV filename (stereo, 48 kHz). 
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: aac_seq: AACSeq2 = [] prev_frame_type: FrameType = "OLS" + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: # Level 1 analysis (packed stereo container) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) - # Unpack to per-channel (as you already do in Level 1) - if frame_type == "ESH": - chl_f = np.empty((128, 8), dtype=np.float64) - chr_f = np.empty((128, 8), dtype=np.float64) - for j in range(8): - chl_f[:, j] = frame_f_stereo[:, 2 * j + 0] - chr_f[:, j] = frame_f_stereo[:, 2 * j + 1] - else: - chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False) - chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False) + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) # Level 2: apply TNS per channel chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) @@ -370,8 +378,13 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, } ) prev_frame_type = frame_type + if verbose and (i % max(1, K // 20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") return aac_seq @@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: def aac_coder_3( filename_in: Union[str, Path], filename_aac_coded: Union[str, Path] | None = None, + verbose: bool = False, ) -> AACSeq3: """ Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). @@ -389,6 +402,8 @@ def aac_coder_3( Input WAV filename (stereo, 48 kHz). filename_aac_coded : Union[str, Path] | None Optional .mat filename to store aac_seq_3 (assignment convenience). 
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -416,15 +431,14 @@ def aac_coder_3( aac_seq: AACSeq3 = [] prev_frame_type: FrameType = "OLS" - # Pin win_type to the WinType literal for type checkers. - win_type: WinType = WIN_TYPE - # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. prev1_L = np.zeros((2048,), dtype=np.float64) prev2_L = np.zeros((2048,), dtype=np.float64) prev1_R = np.zeros((2048,), dtype=np.float64) prev2_R = np.zeros((2048,), dtype=np.float64) + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -440,7 +454,7 @@ def aac_coder_3( frame_type = aac_ssc(frame_t, next_t, prev_frame_type) # Analysis filterbank (stereo packed) - frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) # TNS per channel @@ -474,32 +488,35 @@ def aac_coder_3( # Codebook 11: # maxAbsCodeVal = 16 is RESERVED for ESCAPE. # We must stay strictly within [-15, +15] to avoid escape decoding. 
- sf_cb = 11 - sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + # sf_cb = 11 + # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + # + # sfc_L_dpcm = np.clip( + # sfc_L_dpcm, + # -sf_max_abs, + # sf_max_abs, + # ).astype(np.int64, copy=False) + # + # sfc_R_dpcm = np.clip( + # sfc_R_dpcm, + # -sf_max_abs, + # sf_max_abs, + # ).astype(np.int64, copy=False) - sfc_L_dpcm = np.clip( - sfc_L_dpcm, - -sf_max_abs, - sf_max_abs, - ).astype(np.int64, copy=False) - - sfc_R_dpcm = np.clip( - sfc_R_dpcm, - -sf_max_abs, - sf_max_abs, - ).astype(np.int64, copy=False) - - sfc_L_stream, _ = aac_encode_huff( + sfc_L_stream, cb_sfc_L = aac_encode_huff( sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, - force_codebook=sf_cb, + # force_codebook=11, ) - sfc_R_stream, _ = aac_encode_huff( + sfc_R_stream, cb_sfc_R = aac_encode_huff( sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, - force_codebook=sf_cb, + # force_codebook=11, ) + if cb_sfc_L != 11 or cb_sfc_R != 11: + print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}") + mdct_L_stream, cb_L = aac_encode_huff( np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list, @@ -512,7 +529,7 @@ def aac_coder_3( # Typed dict construction helps static analyzers validate the schema. 
frame_out: AACSeq3Frame = { "frame_type": frame_type, - "win_type": win_type, + "win_type": WIN_TYPE, "chl": { "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64), @@ -539,6 +556,11 @@ def aac_coder_3( prev1_R = frame_R prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") # Optional: store to .mat for the assignment wrapper if filename_aac_coded is not None: @@ -548,6 +570,5 @@ def aac_coder_3( {"aac_seq_3": np.array(aac_seq, dtype=object)}, do_compression=True, ) - return aac_seq diff --git a/source/level_1/core/aac_decoder.py b/source/level_1/core/aac_decoder.py index ac05621..be705df 100644 --- a/source/level_1/core/aac_decoder.py +++ b/source/level_1/core/aac_decoder.py @@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal: # Level 1 decoder # ----------------------------------------------------------------------------- -def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_1( + aac_seq_1: AACSeq1, + filename_out: Union[str, Path], + verbose: bool = False +) -> StereoSignal: """ Level-1 AAC decoder (inverse of aac_coder_1()). @@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_1(). filename_out : Union[str, Path] Output WAV filename. Assumption: 48 kHz, stereo. 
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) for i, fr in enumerate(aac_seq_1): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start:start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) y: StereoSignal = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") # Level 1 assumption: 48 kHz output. sf.write(str(filename_out), y, 48000) - return y @@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS # Level 2 decoder # ----------------------------------------------------------------------------- -def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_2( + aac_seq_2: AACSeq2, + filename_out: Union[str, Path], + verbose: bool = False +) -> StereoSignal: """ Level-2 AAC decoder (inverse of aac_coder_2). @@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_2(). filename_out : Union[str, Path] Output WAV filename. 
+ verbose : bool + Optional argument to print decoding status Returns ------- @@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) for i, fr in enumerate(aac_seq_2): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start : start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) y = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") sf.write(str(filename_out), y, 48000) return y -def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_3( + aac_seq_3: AACSeq3, + filename_out: Union[str, Path], + verbose: bool = False, +) -> StereoSignal: """ Level-3 AAC decoder (inverse of aac_coder_3). @@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_3. filename_out : Union[str, Path] Output WAV filename.
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) + for i, fr in enumerate(aac_seq_3): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start : start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + y = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") sf.write(str(filename_out), y, 48000) return y diff --git a/source/level_1/level_1.py b/source/level_1/level_1.py index 15958bc..63a7ac3 100644 --- a/source/level_1/level_1.py +++ b/source/level_1/level_1.py @@ -20,6 +20,7 @@ from __future__ import annotations from pathlib import Path +from tabnanny import verbose from typing import Union import soundfile as sf @@ -52,7 +53,7 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: AACSeq1 List of encoded frames (Level 1 schema). """ - return core_aac_coder_1(filename_in) + return core_aac_coder_1(filename_in, verbose=True) def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: @@ -73,7 +74,7 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS StereoSignal Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. 
""" - return core_aac_decoder_1(aac_seq_1, filename_out) + return core_aac_decoder_1(aac_seq_1, filename_out, verbose=True) # ----------------------------------------------------------------------------- diff --git a/source/level_2/core/aac_coder.py b/source/level_2/core/aac_coder.py index 4c6158c..3e9cc96 100644 --- a/source/level_2/core/aac_coder.py +++ b/source/level_2/core/aac_coder.py @@ -214,7 +214,10 @@ def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> # Level 1 encoder # ----------------------------------------------------------------------------- -def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: +def aac_coder_1( + filename_in: Union[str, Path], + verbose: bool = False +) -> AACSeq1: """ Level-1 AAC encoder. @@ -231,6 +234,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: filename_in : Union[str, Path] Input WAV filename. Assumption: stereo audio, sampling rate 48 kHz. + verbose : bool + Optional argument to print encoding status Returns ------- @@ -257,8 +262,8 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: aac_seq: AACSeq1 = [] prev_frame_type: FrameType = "OLS" - win_type: WinType = WIN_TYPE - + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -275,23 +280,31 @@ def aac_coder_1(filename_in: Union[str, Path]) -> AACSeq1: next_t = np.vstack([next_t, tail]) frame_type = aac_ssc(frame_t, next_t, prev_frame_type) - frame_f = aac_filter_bank(frame_t, frame_type, win_type) + frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) aac_seq.append({ "frame_type": frame_type, - "win_type": win_type, + "win_type": WIN_TYPE, "chl": {"frame_F": chl_f}, "chr": {"frame_F": chr_f}, }) prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") return aac_seq -def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: +def 
aac_coder_2( + filename_in: Union[str, Path], + verbose: bool = False +) -> AACSeq2: """ Level-2 AAC encoder (Level 1 + TNS). @@ -299,6 +312,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: ---------- filename_in : Union[str, Path] Input WAV filename (stereo, 48 kHz). + verbose : bool + Optional argument to print encoding status Returns ------- @@ -330,6 +345,8 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: aac_seq: AACSeq2 = [] prev_frame_type: FrameType = "OLS" + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -347,16 +364,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: # Level 1 analysis (packed stereo container) frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) - # Unpack to per-channel (as you already do in Level 1) - if frame_type == "ESH": - chl_f = np.empty((128, 8), dtype=np.float64) - chr_f = np.empty((128, 8), dtype=np.float64) - for j in range(8): - chl_f[:, j] = frame_f_stereo[:, 2 * j + 0] - chr_f[:, j] = frame_f_stereo[:, 2 * j + 1] - else: - chl_f = frame_f_stereo[:, 0:1].astype(np.float64, copy=False) - chr_f = frame_f_stereo[:, 1:2].astype(np.float64, copy=False) + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) # Level 2: apply TNS per channel chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) @@ -370,8 +378,12 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, } ) - prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") return aac_seq @@ -379,6 +391,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: def aac_coder_3( filename_in: Union[str, Path], filename_aac_coded: Union[str, Path] | None = None, + verbose: bool = False, ) -> AACSeq3: """ Level-3 AAC encoder (Level 2 + Psycho + Quantizer + Huffman). 
@@ -389,6 +402,8 @@ def aac_coder_3( Input WAV filename (stereo, 48 kHz). filename_aac_coded : Union[str, Path] | None Optional .mat filename to store aac_seq_3 (assignment convenience). + verbose : bool + Optional argument to print encoding status Returns ------- @@ -416,15 +431,14 @@ def aac_coder_3( aac_seq: AACSeq3 = [] prev_frame_type: FrameType = "OLS" - # Pin win_type to the WinType literal for type checkers. - win_type: WinType = WIN_TYPE - # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. prev1_L = np.zeros((2048,), dtype=np.float64) prev2_L = np.zeros((2048,), dtype=np.float64) prev1_R = np.zeros((2048,), dtype=np.float64) prev2_R = np.zeros((2048,), dtype=np.float64) + if verbose: + print("Encoding ", end="", flush=True) for i in range(K): start = i * hop @@ -440,7 +454,7 @@ def aac_coder_3( frame_type = aac_ssc(frame_t, next_t, prev_frame_type) # Analysis filterbank (stereo packed) - frame_f_stereo = aac_filter_bank(frame_t, frame_type, win_type) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) # TNS per channel @@ -474,32 +488,35 @@ def aac_coder_3( # Codebook 11: # maxAbsCodeVal = 16 is RESERVED for ESCAPE. # We must stay strictly within [-15, +15] to avoid escape decoding. 
- sf_cb = 11 - sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + # sf_cb = 11 + # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + # + # sfc_L_dpcm = np.clip( + # sfc_L_dpcm, + # -sf_max_abs, + # sf_max_abs, + # ).astype(np.int64, copy=False) + # + # sfc_R_dpcm = np.clip( + # sfc_R_dpcm, + # -sf_max_abs, + # sf_max_abs, + # ).astype(np.int64, copy=False) - sfc_L_dpcm = np.clip( - sfc_L_dpcm, - -sf_max_abs, - sf_max_abs, - ).astype(np.int64, copy=False) - - sfc_R_dpcm = np.clip( - sfc_R_dpcm, - -sf_max_abs, - sf_max_abs, - ).astype(np.int64, copy=False) - - sfc_L_stream, _ = aac_encode_huff( + sfc_L_stream, cb_sfc_L = aac_encode_huff( sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, - force_codebook=sf_cb, + # force_codebook=11, ) - sfc_R_stream, _ = aac_encode_huff( + sfc_R_stream, cb_sfc_R = aac_encode_huff( sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, - force_codebook=sf_cb, + # force_codebook=11, ) + if cb_sfc_L != 11 or cb_sfc_R != 11: + print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}") + mdct_L_stream, cb_L = aac_encode_huff( np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list, @@ -512,7 +529,7 @@ def aac_coder_3( # Typed dict construction helps static analyzers validate the schema. 
frame_out: AACSeq3Frame = { "frame_type": frame_type, - "win_type": win_type, + "win_type": WIN_TYPE, "chl": { "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64), @@ -539,6 +556,11 @@ def aac_coder_3( prev1_R = frame_R prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") # Optional: store to .mat for the assignment wrapper if filename_aac_coded is not None: @@ -548,6 +570,5 @@ def aac_coder_3( {"aac_seq_3": np.array(aac_seq, dtype=object)}, do_compression=True, ) - return aac_seq diff --git a/source/level_2/core/aac_decoder.py b/source/level_2/core/aac_decoder.py index ac05621..be705df 100644 --- a/source/level_2/core/aac_decoder.py +++ b/source/level_2/core/aac_decoder.py @@ -118,7 +118,11 @@ def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal: # Level 1 decoder # ----------------------------------------------------------------------------- -def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_1( + aac_seq_1: AACSeq1, + filename_out: Union[str, Path], + verbose: bool = False +) -> StereoSignal: """ Level-1 AAC decoder (inverse of aac_coder_1()). @@ -134,6 +138,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_1(). filename_out : Union[str, Path] Output WAV filename. Assumption: 48 kHz, stereo. 
+ verbose : bool + Optional argument to print decoding status Returns ------- @@ -152,6 +158,8 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) for i, fr in enumerate(aac_seq_1): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -164,12 +172,15 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start:start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) y: StereoSignal = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") # Level 1 assumption: 48 kHz output. sf.write(str(filename_out), y, 48000) - return y @@ -177,7 +188,11 @@ def aac_decoder_1(aac_seq_1: AACSeq1, filename_out: Union[str, Path]) -> StereoS # Level 2 decoder # ----------------------------------------------------------------------------- -def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_2( + aac_seq_2: AACSeq2, + filename_out: Union[str, Path], + verbose: bool = False +) -> StereoSignal: """ Level-2 AAC decoder (inverse of aac_coder_2). @@ -195,6 +210,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_2(). filename_out : Union[str, Path] Output WAV filename.
+ verbose : bool + Optional argument to print decoding status Returns ------- @@ -213,6 +230,8 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) for i, fr in enumerate(aac_seq_2): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -260,15 +279,23 @@ def aac_decoder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start : start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) y = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") sf.write(str(filename_out), y, 48000) return y -def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoSignal: +def aac_decoder_3( + aac_seq_3: AACSeq3, + filename_out: Union[str, Path], + verbose: bool = False, +) -> StereoSignal: """ Level-3 AAC decoder (inverse of aac_coder_3). @@ -286,6 +313,8 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS Encoded sequence as produced by aac_coder_3. filename_out : Union[str, Path] Output WAV filename.
+ verbose : bool + Optional argument to print encoding status Returns ------- @@ -307,6 +336,9 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS n_pad = (K - 1) * hop + win y_pad = np.zeros((n_pad, 2), dtype=np.float64) + if verbose: + print("Decoding ", end="", flush=True) + for i, fr in enumerate(aac_seq_3): frame_type: FrameType = fr["frame_type"] win_type: WinType = fr["win_type"] @@ -401,7 +433,12 @@ def aac_decoder_3(aac_seq_3: AACSeq3, filename_out: Union[str, Path]) -> StereoS start = i * hop y_pad[start : start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + y = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") sf.write(str(filename_out), y, 48000) return y diff --git a/source/level_2/level_2.py b/source/level_2/level_2.py index 3ed9029..0d79d07 100644 --- a/source/level_2/level_2.py +++ b/source/level_2/level_2.py @@ -51,7 +51,7 @@ def aac_coder_2(filename_in: Union[str, Path]) -> AACSeq2: AACSeq2 List of encoded frames (Level 2 schema). """ - return core_aac_coder_2(filename_in) + return core_aac_coder_2(filename_in, verbose=True) def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoSignal: @@ -72,7 +72,7 @@ def i_aac_coder_2(aac_seq_2: AACSeq2, filename_out: Union[str, Path]) -> StereoS StereoSignal Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. 
""" - return core_aac_decoder_2(aac_seq_2, filename_out) + return core_aac_decoder_2(aac_seq_2, filename_out, verbose=True) # ----------------------------------------------------------------------------- diff --git a/source/level_3/core/aac_coder.py b/source/level_3/core/aac_coder.py new file mode 100644 index 0000000..3e9cc96 --- /dev/null +++ b/source/level_3/core/aac_coder.py @@ -0,0 +1,574 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - AAC Coder (Core) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 1 AAC encoder orchestration. +# Keeps the same functional behavior as the original level_1 implementation: +# - Reads WAV via soundfile +# - Validates stereo and 48 kHz +# - Frames into 2048 samples with hop=1024 and zero padding at both ends +# - SSC decision uses next-frame attack detection +# - Filterbank analysis (MDCT) +# - Stores per-channel spectra in AACSeq1 schema: +# * ESH: (128, 8) +# * else: (1024, 1) +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Union + +import soundfile as sf +from scipy.io import savemat + +from core.aac_configuration import WIN_TYPE +from core.aac_filterbank import aac_filter_bank +from core.aac_ssc import aac_ssc +from core.aac_tns import aac_tns +from core.aac_psycho import aac_psycho +from core.aac_quantizer import aac_quantizer # assumes your quantizer file is core/aac_quantizer.py +from core.aac_huffman import aac_encode_huff +from core.aac_utils import get_table, band_limits +from material.huff_utils import load_LUT + +from core.aac_types import * + + + +# ----------------------------------------------------------------------------- +# Helpers for thresholds (T(b)) +# ----------------------------------------------------------------------------- + +def 
_band_slices_from_table(frame_type: FrameType) -> list[tuple[int, int]]: + """ + Return inclusive (lo, hi) band slices derived from TableB219. + """ + table, _ = get_table(frame_type) + wlow, whigh, _bval, _qthr_db = band_limits(table) + return [(int(lo), int(hi)) for lo, hi in zip(wlow, whigh)] + + +def _thresholds_from_smr( + frame_F_ch: FrameChannelF, + frame_type: FrameType, + SMR: FloatArray, +) -> FloatArray: + """ + Compute thresholds T(b) = P(b) / SMR(b), where P(b) is band energy. + + Shapes: + - Long: returns (NB, 1) + - ESH: returns (NB, 8) + """ + bands = _band_slices_from_table(frame_type) + NB = len(bands) + + X = np.asarray(frame_F_ch, dtype=np.float64) + SMR = np.asarray(SMR, dtype=np.float64) + + if frame_type == "ESH": + if X.shape != (128, 8): + raise ValueError("For ESH, frame_F_ch must have shape (128, 8).") + if SMR.shape != (NB, 8): + raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).") + + T = np.zeros((NB, 8), dtype=np.float64) + for j in range(8): + Xj = X[:, j] + for b, (lo, hi) in enumerate(bands): + P = float(np.sum(Xj[lo : hi + 1] ** 2)) + smr = float(SMR[b, j]) + T[b, j] = 0.0 if smr <= 1e-12 else (P / smr) + return T + + # Long + if X.shape == (1024,): + Xv = X + elif X.shape == (1024, 1): + Xv = X[:, 0] + else: + raise ValueError("For non-ESH, frame_F_ch must be shape (1024,) or (1024, 1).") + + if SMR.shape == (NB,): + SMRv = SMR + elif SMR.shape == (NB, 1): + SMRv = SMR[:, 0] + else: + raise ValueError(f"For non-ESH, SMR must be shape ({NB},) or ({NB}, 1).") + + T = np.zeros((NB, 1), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + P = float(np.sum(Xv[lo : hi + 1] ** 2)) + smr = float(SMRv[b]) + T[b, 0] = 0.0 if smr <= 1e-12 else (P / smr) + + return T + +def _normalize_global_gain(G: GlobalGain) -> float | FloatArray: + """ + Normalize GlobalGain to match AACChannelFrameF3["G"] type: + - long: return float + - ESH: return float64 ndarray of shape (1, 8) + """ + if np.isscalar(G): + return float(G) + + G_arr = 
np.asarray(G) + if G_arr.size == 1: + return float(G_arr.reshape(-1)[0]) + + return np.asarray(G_arr, dtype=np.float64) + +# ----------------------------------------------------------------------------- +# Public helpers (useful for level_x demo wrappers) +# ----------------------------------------------------------------------------- + +def aac_read_wav_stereo_48k(filename_in: Union[str, Path]) -> tuple[StereoSignal, int]: + """ + Read a WAV file using soundfile and validate the Level-1 assumptions. + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename. + + Returns + ------- + x : StereoSignal (np.ndarray) + Stereo samples as float64, shape (N, 2). + fs : int + Sampling rate (Hz). Must be 48000. + + Raises + ------ + ValueError + If the input is not stereo or the sampling rate is not 48 kHz. + """ + filename_in = Path(filename_in) + + x, fs = sf.read(str(filename_in), always_2d=True) + x = np.asarray(x, dtype=np.float64) + + if x.shape[1] != 2: + raise ValueError("Input must be stereo (2 channels).") + if int(fs) != 48000: + raise ValueError("Input sampling rate must be 48 kHz.") + + return x, int(fs) + + +def aac_pack_frame_f_to_seq_channels(frame_type: FrameType, frame_f: FrameF) -> tuple[FrameChannelF, FrameChannelF]: + """ + Convert the stereo FrameF returned by aac_filter_bank() into per-channel arrays + as required by the Level-1 AACSeq1 schema. + + Parameters + ---------- + frame_type : FrameType + "OLS" | "LSS" | "ESH" | "LPS". + frame_f : FrameF + Output of aac_filter_bank(): + - If frame_type != "ESH": shape (1024, 2) + - If frame_type == "ESH": shape (128, 16) packed as [L0 R0 L1 R1 ... 
L7 R7] + + Returns + ------- + chl_f : FrameChannelF + Left channel coefficients: + - ESH: shape (128, 8) + - else: shape (1024, 1) + chr_f : FrameChannelF + Right channel coefficients: + - ESH: shape (128, 8) + - else: shape (1024, 1) + """ + if frame_type == "ESH": + if frame_f.shape != (128, 16): + raise ValueError("For ESH, frame_f must have shape (128, 16).") + + chl_f = np.empty((128, 8), dtype=np.float64) + chr_f = np.empty((128, 8), dtype=np.float64) + for j in range(8): + chl_f[:, j] = frame_f[:, 2 * j + 0] + chr_f[:, j] = frame_f[:, 2 * j + 1] + return chl_f, chr_f + + # Non-ESH: store as (1024, 1) as required by the original Level-1 schema. + if frame_f.shape != (1024, 2): + raise ValueError("For OLS/LSS/LPS, frame_f must have shape (1024, 2).") + + chl_f = frame_f[:, 0:1].astype(np.float64, copy=False) + chr_f = frame_f[:, 1:2].astype(np.float64, copy=False) + return chl_f, chr_f + + + +# ----------------------------------------------------------------------------- +# Level 1 encoder +# ----------------------------------------------------------------------------- + +def aac_coder_1( + filename_in: Union[str, Path], + verbose: bool = False +) -> AACSeq1: + """ + Level-1 AAC encoder. + + This function preserves the behavior of the original level_1 implementation: + - Read stereo 48 kHz WAV + - Pad hop samples at start and hop samples at end + - Frame with win=2048, hop=1024 + - Use SSC with next-frame lookahead + - Apply filterbank analysis + - Store per-channel coefficients using AACSeq1 schema + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename. + Assumption: stereo audio, sampling rate 48 kHz. + verbose : bool + Optional argument to print encoding status + + Returns + ------- + AACSeq1 + List of encoded frames (Level 1 schema). + """ + x, _ = aac_read_wav_stereo_48k(filename_in) + # The assignment assumes 48 kHz + + hop = 1024 + win = 2048 + + # Pad at the beginning to support the first overlap region. 
+ # Tail padding is kept minimal; next-frame is padded on-the-fly when needed. + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + # Number of frames such that current frame fits; next frame will be padded if needed. + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + aac_seq: AACSeq1 = [] + prev_frame_type: FrameType = "OLS" + + if verbose: + print("Encoding ", end="", flush=True) + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start:start + win, :] + if frame_t.shape != (win, 2): + # This should not happen due to K definition, but keep it explicit. + raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop:start + hop + win, :] + + # Ensure next_t is always (2048, 2) by zero-padding at the tail. + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) + frame_f = aac_filter_bank(frame_t, frame_type, WIN_TYPE) + + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f) + + aac_seq.append({ + "frame_type": frame_type, + "win_type": WIN_TYPE, + "chl": {"frame_F": chl_f}, + "chr": {"frame_F": chr_f}, + }) + + prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") + + return aac_seq + + +def aac_coder_2( + filename_in: Union[str, Path], + verbose: bool = False +) -> AACSeq2: + """ + Level-2 AAC encoder (Level 1 + TNS). + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + verbose : bool + Optional argument to print encoding status + + Returns + ------- + AACSeq2 + Encoded AAC sequence (Level 2 payload schema). 
+ For each frame i: + - "frame_type": FrameType + - "win_type": WinType + - "chl"/"chr": + - "frame_F": FrameChannelF (after TNS) + - "tns_coeffs": TnsCoeffs + """ + filename_in = Path(filename_in) + + x, _ = aac_read_wav_stereo_48k(filename_in) + # The assignment assumes 48 kHz + + hop = 1024 + win = 2048 + + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + aac_seq: AACSeq2 = [] + prev_frame_type: FrameType = "OLS" + + if verbose: + print("Encoding ", end="", flush=True) + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start : start + win, :] + if frame_t.shape != (win, 2): + raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop : start + hop + win, :] + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) + + # Level 1 analysis (packed stereo container) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) + + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) + + # Level 2: apply TNS per channel + chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) + chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type) + + aac_seq.append( + { + "frame_type": frame_type, + "win_type": WIN_TYPE, + "chl": {"frame_F": chl_f_tns, "tns_coeffs": chl_tns_coeffs}, + "chr": {"frame_F": chr_f_tns, "tns_coeffs": chr_tns_coeffs}, + } + ) + prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") + + return aac_seq + + +def aac_coder_3( + filename_in: Union[str, Path], + filename_aac_coded: Union[str, Path] | None = None, + verbose: bool = False, +) -> AACSeq3: + """ + Level-3 AAC 
encoder (Level 2 + Psycho + Quantizer + Huffman). + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + filename_aac_coded : Union[str, Path] | None + Optional .mat filename to store aac_seq_3 (assignment convenience). + verbose : bool + Optional argument to print encoding status + + Returns + ------- + AACSeq3 + Encoded AAC sequence (Level 3 payload schema). + """ + filename_in = Path(filename_in) + + x, _ = aac_read_wav_stereo_48k(filename_in) + + hop = 1024 + win = 2048 + + pad_pre = np.zeros((hop, 2), dtype=np.float64) + pad_post = np.zeros((hop, 2), dtype=np.float64) + x_pad = np.vstack([pad_pre, x, pad_post]) + + K = int((x_pad.shape[0] - win) // hop + 1) + if K <= 0: + raise ValueError("Input too short for framing.") + + # Load Huffman LUTs once. + huff_LUT_list = load_LUT() + + aac_seq: AACSeq3 = [] + prev_frame_type: FrameType = "OLS" + + # Psycho model needs per-channel history (prev1, prev2) of 2048-sample frames. + prev1_L = np.zeros((2048,), dtype=np.float64) + prev2_L = np.zeros((2048,), dtype=np.float64) + prev1_R = np.zeros((2048,), dtype=np.float64) + prev2_R = np.zeros((2048,), dtype=np.float64) + + if verbose: + print("Encoding ", end="", flush=True) + for i in range(K): + start = i * hop + + frame_t: FrameT = x_pad[start : start + win, :] + if frame_t.shape != (win, 2): + raise ValueError("Internal framing error: frame_t has wrong shape.") + + next_t = x_pad[start + hop : start + hop + win, :] + if next_t.shape[0] < win: + tail = np.zeros((win - next_t.shape[0], 2), dtype=np.float64) + next_t = np.vstack([next_t, tail]) + + frame_type = aac_ssc(frame_t, next_t, prev_frame_type) + + # Analysis filterbank (stereo packed) + frame_f_stereo = aac_filter_bank(frame_t, frame_type, WIN_TYPE) + chl_f, chr_f = aac_pack_frame_f_to_seq_channels(frame_type, frame_f_stereo) + + # TNS per channel + chl_f_tns, chl_tns_coeffs = aac_tns(chl_f, frame_type) + chr_f_tns, chr_tns_coeffs = aac_tns(chr_f, frame_type) + + 
# Psychoacoustic model per channel (time-domain) + frame_L = np.asarray(frame_t[:, 0], dtype=np.float64) + frame_R = np.asarray(frame_t[:, 1], dtype=np.float64) + + SMR_L = aac_psycho(frame_L, frame_type, prev1_L, prev2_L) + SMR_R = aac_psycho(frame_R, frame_type, prev1_R, prev2_R) + + # Thresholds T(b) (stored, not entropy-coded) + T_L = _thresholds_from_smr(chl_f_tns, frame_type, SMR_L) + T_R = _thresholds_from_smr(chr_f_tns, frame_type, SMR_R) + + # Quantizer per channel + S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L) + S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R) + + # Normalize G types for AACSeq3 schema (float | float64 ndarray). + G_Ln = _normalize_global_gain(G_L) + G_Rn = _normalize_global_gain(G_R) + + # Huffman-code ONLY the DPCM differences for b>0. + # sfc[0] corresponds to alpha(0)=G and is stored separately in the frame. + sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] + sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...] + + # Codebook 11: + # maxAbsCodeVal = 16 is RESERVED for ESCAPE. + # We must stay strictly within [-15, +15] to avoid escape decoding. 
+ # sf_cb = 11 + # sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15 + # + # sfc_L_dpcm = np.clip( + # sfc_L_dpcm, + # -sf_max_abs, + # sf_max_abs, + # ).astype(np.int64, copy=False) + # + # sfc_R_dpcm = np.clip( + # sfc_R_dpcm, + # -sf_max_abs, + # sf_max_abs, + # ).astype(np.int64, copy=False) + + sfc_L_stream, cb_sfc_L = aac_encode_huff( + sfc_L_dpcm.reshape(-1, order="F"), + huff_LUT_list, + # force_codebook=11, + ) + sfc_R_stream, cb_sfc_R = aac_encode_huff( + sfc_R_dpcm.reshape(-1, order="F"), + huff_LUT_list, + # force_codebook=11, + ) + + if cb_sfc_L != 11 or cb_sfc_R != 11: + print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}") + + mdct_L_stream, cb_L = aac_encode_huff( + np.asarray(S_L, dtype=np.int64).reshape(-1), + huff_LUT_list, + ) + mdct_R_stream, cb_R = aac_encode_huff( + np.asarray(S_R, dtype=np.int64).reshape(-1), + huff_LUT_list, + ) + + # Typed dict construction helps static analyzers validate the schema. + frame_out: AACSeq3Frame = { + "frame_type": frame_type, + "win_type": WIN_TYPE, + "chl": { + "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), + "T": np.asarray(T_L, dtype=np.float64), + "G": G_Ln, + "sfc": sfc_L_stream, + "stream": mdct_L_stream, + "codebook": int(cb_L), + }, + "chr": { + "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64), + "T": np.asarray(T_R, dtype=np.float64), + "G": G_Rn, + "sfc": sfc_R_stream, + "stream": mdct_R_stream, + "codebook": int(cb_R), + }, + } + aac_seq.append(frame_out) + + # Update psycho history (shift register) + prev2_L = prev1_L + prev1_L = frame_L + prev2_R = prev1_R + prev1_R = frame_R + + prev_frame_type = frame_type + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + if verbose: + print(" done") + + # Optional: store to .mat for the assignment wrapper + if filename_aac_coded is not None: + filename_aac_coded = Path(filename_aac_coded) + savemat( + str(filename_aac_coded), + {"aac_seq_3": np.array(aac_seq, dtype=object)}, + 
do_compression=True, + ) + return aac_seq + diff --git a/source/level_3/core/aac_configuration.py b/source/level_3/core/aac_configuration.py new file mode 100644 index 0000000..63d6297 --- /dev/null +++ b/source/level_3/core/aac_configuration.py @@ -0,0 +1,41 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Configuration +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module contains the global configurations +# +# ------------------------------------------------------------ +from __future__ import annotations + +# Imports +from typing import Final + +from core.aac_types import WinType + +# Filterbank +# ------------------------------------------------------------ +# Window type +# Options: "SIN", "KBD" +WIN_TYPE: WinType = "SIN" + + +# TNS +# ------------------------------------------------------------ +PRED_ORDER = 4 +QUANT_STEP = 0.1 +QUANT_MAX = 0.7 # 4-bit symmetric with step 0.1 -> clamp to [-0.7, +0.7] + + +# ----------------------------------------------------------------------------- +# Psycho +# ----------------------------------------------------------------------------- + +NMT_DB: Final[float] = 6.0 # Noise Masking Tone (dB) +TMN_DB: Final[float] = 18.0 # Tone Masking Noise (dB) \ No newline at end of file diff --git a/source/level_3/core/aac_decoder.py b/source/level_3/core/aac_decoder.py new file mode 100644 index 0000000..be705df --- /dev/null +++ b/source/level_3/core/aac_decoder.py @@ -0,0 +1,445 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Inverse AAC Coder (Core) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# - Level 1 AAC decoder orchestration (inverse of aac_coder_1()). 
+# - Level 2 AAC decoder orchestration (inverse of aac_coder_1()). +# +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Union + +import soundfile as sf + +from core.aac_filterbank import aac_i_filter_bank +from core.aac_tns import aac_i_tns +from core.aac_quantizer import aac_i_quantizer +from core.aac_huffman import aac_decode_huff +from core.aac_utils import get_table, band_limits +from material.huff_utils import load_LUT +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Helper for NB +# ----------------------------------------------------------------------------- +def _nbands(frame_type: FrameType) -> int: + table, _ = get_table(frame_type) + wlow, _whigh, _bval, _qthr_db = band_limits(table) + return int(len(wlow)) + + +# ----------------------------------------------------------------------------- +# Public helpers +# ----------------------------------------------------------------------------- + +def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: + """ + Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo + FrameF container expected by aac_i_filter_bank(). + + Parameters + ---------- + frame_type : FrameType + "OLS" | "LSS" | "ESH" | "LPS". + chl_f : FrameChannelF + Left channel coefficients: + - ESH: (128, 8) + - else: (1024, 1) + chr_f : FrameChannelF + Right channel coefficients: + - ESH: (128, 8) + - else: (1024, 1) + + Returns + ------- + FrameF + Stereo coefficients: + - ESH: (128, 16) packed as [L0 R0 L1 R1 ... 
L7 R7] + - else: (1024, 2) + """ + if frame_type == "ESH": + if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): + raise ValueError("ESH channel frame_F must have shape (128, 8).") + + frame_f = np.empty((128, 16), dtype=np.float64) + for j in range(8): + frame_f[:, 2 * j + 0] = chl_f[:, j] + frame_f[:, 2 * j + 1] = chr_f[:, j] + return frame_f + + # Non-ESH: expected (1024, 1) per channel in Level-1 schema. + if chl_f.shape != (1024, 1) or chr_f.shape != (1024, 1): + raise ValueError("Non-ESH channel frame_F must have shape (1024, 1).") + + frame_f = np.empty((1024, 2), dtype=np.float64) + frame_f[:, 0] = chl_f[:, 0] + frame_f[:, 1] = chr_f[:, 0] + return frame_f + + +def aac_remove_padding(y_pad: StereoSignal, hop: int = 1024) -> StereoSignal: + """ + Remove the boundary padding that the Level-1 encoder adds: + hop samples at start and hop samples at end. + + Parameters + ---------- + y_pad : StereoSignal (np.ndarray) + Reconstructed padded stream, shape (N_pad, 2). + hop : int + Hop size in samples (default 1024). + + Returns + ------- + StereoSignal (np.ndarray) + Unpadded reconstructed stream, shape (N_pad - 2*hop, 2). + + Raises + ------ + ValueError + If y_pad is too short to unpad. + """ + if y_pad.shape[0] < 2 * hop: + raise ValueError("Decoded stream too short to unpad.") + return y_pad[hop:-hop, :] + + +# ----------------------------------------------------------------------------- +# Level 1 decoder +# ----------------------------------------------------------------------------- + +def aac_decoder_1( + aac_seq_1: AACSeq1, + filename_out: Union[str, Path], + verbose: bool = False +) -> StereoSignal: + """ + Level-1 AAC decoder (inverse of aac_coder_1()). 
+ + This function preserves the behavior of the original level_1 implementation: + - Reconstruct the full padded stream by overlap-adding K synthesized frames + - Remove hop padding at the beginning and hop padding at the end + - Write the reconstructed stereo WAV file (48 kHz) + - Return reconstructed stereo samples as float64 + + Parameters + ---------- + aac_seq_1 : AACSeq1 + Encoded sequence as produced by aac_coder_1(). + filename_out : Union[str, Path] + Output WAV filename. Assumption: 48 kHz, stereo. + verbose : bool + Optional argument to print encoding status + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. + """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_1) + + # Output includes the encoder padding region, so we reconstruct the full padded stream. + # For K frames: last frame starts at (K-1)*hop and spans win, + # so total length = (K-1)*hop + win. + n_pad = (K - 1) * hop + win + y_pad: StereoSignal = np.zeros((n_pad, 2), dtype=np.float64) + + if verbose: + print("Decoding ", end="", flush=True) + for i, fr in enumerate(aac_seq_1): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) + chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) + + frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) + + start = i * hop + y_pad[start:start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + y: StereoSignal = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") + + # Level 1 assumption: 48 kHz output. 
+ sf.write(str(filename_out), y, 48000) + return y + + +# ----------------------------------------------------------------------------- +# Level 2 decoder +# ----------------------------------------------------------------------------- + +def aac_decoder_2( + aac_seq_2: AACSeq2, + filename_out: Union[str, Path], + verbose: bool = False +) -> StereoSignal: + """ + Level-2 AAC decoder (inverse of aac_coder_2). + + Behavior matches Level 1 decoder pipeline, with additional iTNS stage: + - Per frame/channel: inverse TNS using stored coefficients + - Re-pack to stereo frame_F + - IMDCT + windowing + - Overlap-add over frames + - Remove Level-1 padding (hop samples start/end) + - Write output WAV (48 kHz) + + Parameters + ---------- + aac_seq_2 : AACSeq2 + Encoded sequence as produced by aac_coder_2(). + filename_out : Union[str, Path] + Output WAV filename. + verbose : bool + Optional argument to print encoding status + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. 
+ """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_2) + + if K <= 0: + raise ValueError("aac_seq_2 must contain at least one frame.") + + n_pad = (K - 1) * hop + win + y_pad = np.zeros((n_pad, 2), dtype=np.float64) + + if verbose: + print("Decoding ", end="", flush=True) + for i, fr in enumerate(aac_seq_2): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + chl_f_tns = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) + chr_f_tns = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) + + chl_coeffs = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64) + chr_coeffs = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64) + + # Inverse TNS per channel + chl_f = aac_i_tns(chl_f_tns, frame_type, chl_coeffs) + chr_f = aac_i_tns(chr_f_tns, frame_type, chr_coeffs) + + # Re-pack to the stereo container expected by aac_i_filter_bank + if frame_type == "ESH": + if chl_f.shape != (128, 8) or chr_f.shape != (128, 8): + raise ValueError("ESH channel frame_F must have shape (128, 8).") + + frame_f: FrameF = np.empty((128, 16), dtype=np.float64) + for j in range(8): + frame_f[:, 2 * j + 0] = chl_f[:, j] + frame_f[:, 2 * j + 1] = chr_f[:, j] + else: + # Accept either (1024,1) or (1024,) from your internal convention. 
+ if chl_f.shape == (1024,): + chl_col = chl_f.reshape(1024, 1) + elif chl_f.shape == (1024, 1): + chl_col = chl_f + else: + raise ValueError("Non-ESH left channel frame_F must be shape (1024,) or (1024, 1).") + + if chr_f.shape == (1024,): + chr_col = chr_f.reshape(1024, 1) + elif chr_f.shape == (1024, 1): + chr_col = chr_f + else: + raise ValueError("Non-ESH right channel frame_F must be shape (1024,) or (1024, 1).") + + frame_f = np.empty((1024, 2), dtype=np.float64) + frame_f[:, 0] = chl_col[:, 0] + frame_f[:, 1] = chr_col[:, 0] + + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) + + start = i * hop + y_pad[start : start + win, :] += frame_t_hat + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + y = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") + + sf.write(str(filename_out), y, 48000) + return y + + + +def aac_decoder_3( + aac_seq_3: AACSeq3, + filename_out: Union[str, Path], + verbose: bool = False, +) -> StereoSignal: + """ + Level-3 AAC decoder (inverse of aac_coder_3). + + Steps per frame: + - Huffman decode scalefactors (sfc) using codebook 11 + - Huffman decode MDCT symbols (stream) using stored codebook + - iQuantizer -> MDCT coefficients after TNS + - iTNS using stored predictor coefficients + - IMDCT filterbank -> time domain + - Overlap-add, remove padding, write WAV + + Parameters + ---------- + aac_seq_3 : AACSeq3 + Encoded sequence as produced by aac_coder_3. + filename_out : Union[str, Path] + Output WAV filename. + verbose : bool + Optional argument to print encoding status + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. + """ + filename_out = Path(filename_out) + + hop = 1024 + win = 2048 + K = len(aac_seq_3) + + if K <= 0: + raise ValueError("aac_seq_3 must contain at least one frame.") + + # Load Huffman LUTs once. 
+ huff_LUT_list = load_LUT() + + n_pad = (K - 1) * hop + win + y_pad = np.zeros((n_pad, 2), dtype=np.float64) + + if verbose: + print("Decoding ", end="", flush=True) + + for i, fr in enumerate(aac_seq_3): + frame_type: FrameType = fr["frame_type"] + win_type: WinType = fr["win_type"] + + NB = _nbands(frame_type) + # We store G separately, so Huffman stream contains only (NB-1) DPCM differences. + sfc_len = (NB - 1) * (8 if frame_type == "ESH" else 1) + + # ------------------------- + # Left channel + # ------------------------- + tns_L = np.asarray(fr["chl"]["tns_coeffs"], dtype=np.float64) + G_L = fr["chl"]["G"] + sfc_bits_L = fr["chl"]["sfc"] + mdct_bits_L = fr["chl"]["stream"] + cb_L = int(fr["chl"]["codebook"]) + + sfc_dec_L = aac_decode_huff(sfc_bits_L, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False) + if frame_type == "ESH": + sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 8, order="F") + sfc_L = np.zeros((NB, 8), dtype=np.int64) + Gv = np.asarray(G_L, dtype=np.float64).reshape(1, 8) + sfc_L[0, :] = Gv[0, :].astype(np.int64) + sfc_L[1:, :] = sfc_dpcm_L + else: + sfc_dpcm_L = sfc_dec_L.reshape(NB - 1, 1, order="F") + sfc_L = np.zeros((NB, 1), dtype=np.int64) + sfc_L[0, 0] = int(float(G_L)) + sfc_L[1:, :] = sfc_dpcm_L + + # MDCT symbols: codebook 0 means "all-zero section" + if cb_L == 0: + S_dec_L = np.zeros((1024,), dtype=np.int64) + else: + S_tmp_L = aac_decode_huff(mdct_bits_L, cb_L, huff_LUT_list).astype(np.int64, copy=False) + + # Tuple coding may produce extra trailing symbols; caller knows the true length (1024). + # Also guard against short outputs by zero-padding. 
+ if S_tmp_L.size < 1024: + S_dec_L = np.zeros((1024,), dtype=np.int64) + S_dec_L[: S_tmp_L.size] = S_tmp_L + else: + S_dec_L = S_tmp_L[:1024] + + S_L = S_dec_L.reshape(1024, 1) + + Xq_L = aac_i_quantizer(S_L, sfc_L, G_L, frame_type) + X_L = aac_i_tns(Xq_L, frame_type, tns_L) + + # ------------------------- + # Right channel + # ------------------------- + tns_R = np.asarray(fr["chr"]["tns_coeffs"], dtype=np.float64) + G_R = fr["chr"]["G"] + sfc_bits_R = fr["chr"]["sfc"] + mdct_bits_R = fr["chr"]["stream"] + cb_R = int(fr["chr"]["codebook"]) + + sfc_dec_R = aac_decode_huff(sfc_bits_R, 11, huff_LUT_list)[:sfc_len].astype(np.int64, copy=False) + if frame_type == "ESH": + sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 8, order="F") + sfc_R = np.zeros((NB, 8), dtype=np.int64) + Gv = np.asarray(G_R, dtype=np.float64).reshape(1, 8) + sfc_R[0, :] = Gv[0, :].astype(np.int64) + sfc_R[1:, :] = sfc_dpcm_R + else: + sfc_dpcm_R = sfc_dec_R.reshape(NB - 1, 1, order="F") + sfc_R = np.zeros((NB, 1), dtype=np.int64) + sfc_R[0, 0] = int(float(G_R)) + sfc_R[1:, :] = sfc_dpcm_R + + if cb_R == 0: + S_dec_R = np.zeros((1024,), dtype=np.int64) + else: + S_tmp_R = aac_decode_huff(mdct_bits_R, cb_R, huff_LUT_list).astype(np.int64, copy=False) + + if S_tmp_R.size < 1024: + S_dec_R = np.zeros((1024,), dtype=np.int64) + S_dec_R[: S_tmp_R.size] = S_tmp_R + else: + S_dec_R = S_tmp_R[:1024] + + S_R = S_dec_R.reshape(1024, 1) + + Xq_R = aac_i_quantizer(S_R, sfc_R, G_R, frame_type) + X_R = aac_i_tns(Xq_R, frame_type, tns_R) + + # Re-pack to stereo container and inverse filterbank + frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R)) + frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) + + start = i * hop + y_pad[start : start + win, :] += frame_t_hat + + if verbose and (i % (K//20)) == 0: + print(".", end="", flush=True) + + y = aac_remove_padding(y_pad, hop=hop) + if verbose: + print(" done") + + sf.write(str(filename_out), y, 48000) + return y + 
diff --git a/source/level_3/core/aac_filterbank.py b/source/level_3/core/aac_filterbank.py new file mode 100644 index 0000000..6bd6e76 --- /dev/null +++ b/source/level_3/core/aac_filterbank.py @@ -0,0 +1,387 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Filterbank module +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Filterbank stage (MDCT/IMDCT), windowing, ESH packing/unpacking +# +# ------------------------------------------------------------ +from __future__ import annotations + +from core.aac_utils import mdct, imdct +from core.aac_types import * + +from scipy.signal.windows import kaiser + +# Private helpers for Filterbank +# ------------------------------------------------------------ + +def _sin_window(N: int) -> Window: + """ + Build a sinusoidal (SIN) window of length N. + + The AAC sinusoid window is: + w[n] = sin(pi/N * (n + 0.5)), for 0 <= n < N + + Parameters + ---------- + N : int + Window length in samples. + + Returns + ------- + Window + 1-D array of shape (N, ) with dtype float64. + """ + n = np.arange(N, dtype=np.float64) + return np.sin((np.pi / N) * (n + 0.5)) + + +def _kbd_window(N: int, alpha: float) -> Window: + """ + Build a Kaiser-Bessel-Derived (KBD) window of length N. + + This follows the standard KBD construction used in AAC: + 1) Build a Kaiser kernel of length (N/2 + 1). + 2) Form the left half by cumulative summation, normalization, and sqrt. + 3) Mirror the left half to form the right half (symmetric full-length window). + + Notes + ----- + - N must be even (AAC uses N=2048 for long and N=256 for short). + - The assignment specifies alpha=6 for long windows and alpha=4 for short windows. + - The Kaiser beta parameter is commonly taken as beta = pi * alpha for this context. + + Parameters + ---------- + N : int + Window length in samples (must be even). 
+ alpha : float + KBD alpha parameter. + + Returns + ------- + Window + 1-D array of shape (N,) with dtype float64. + """ + half = N // 2 + + # Kaiser kernel length: half + 1 samples (0 .. half) + # beta = pi * alpha per the usual correspondence with the ISO definition + kernel = kaiser(half + 1, beta=np.pi * alpha).astype(np.float64) + + csum = np.cumsum(kernel) + denom = csum[-1] + + w_left = np.sqrt(csum[:-1] / denom) # length half, n = 0 .. half-1 + w_right = w_left[::-1] # mirror for second half + + return np.concatenate([w_left, w_right]) + + +def _long_window(win_type: WinType) -> Window: + """ + Return the long AAC window (length 2048) for the selected window family. + + Parameters + ---------- + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (2048,) with dtype float64. + """ + if win_type == "SIN": + return _sin_window(2048) + if win_type == "KBD": + # Assignment-specific alpha values + return _kbd_window(2048, alpha=6.0) + raise ValueError(f"Invalid win_type: {win_type!r}") + + +def _short_window(win_type: WinType) -> Window: + """ + Return the short AAC window (length 256) for the selected window family. + + Parameters + ---------- + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (256,) with dtype float64. + """ + if win_type == "SIN": + return _sin_window(256) + if win_type == "KBD": + # Assignment-specific alpha values + return _kbd_window(256, alpha=4.0) + raise ValueError(f"Invalid win_type: {win_type!r}") + + +def _window_sequence(frame_type: FrameType, win_type: WinType) -> Window: + """ + Build the 2048-sample analysis/synthesis window for OLS/LSS/LPS. + + In this assignment we assume a single window family is used globally + (no mixed KBD/SIN halves). Therefore, both the long and short windows + are drawn from the same family. + + For frame_type: + - "OLS": return the long window Wl (2048). 
+ - "LSS": construct [Wl_left(1024), ones(448), Ws_right(128), zeros(448)]. + - "LPS": construct [zeros(448), Ws_left(128), ones(448), Wl_right(1024)]. + + Parameters + ---------- + frame_type : FrameType + One of "OLS", "LSS", "LPS". + win_type : WinType + Either "SIN" or "KBD". + + Returns + ------- + Window + 1-D array of shape (2048,) with dtype float64. + """ + wL = _long_window(win_type) # length 2048 + wS = _short_window(win_type) # length 256 + + if frame_type == "OLS": + return wL + + if frame_type == "LSS": + # 0..1023: left half of long window + # 1024..1471: ones (448 samples) + # 1472..1599: right half of short window (128 samples) + # 1600..2047: zeros (448 samples) + out = np.zeros(2048, dtype=np.float64) + out[0:1024] = wL[0:1024] + out[1024:1472] = 1.0 + out[1472:1600] = wS[128:256] + out[1600:2048] = 0.0 + return out + + if frame_type == "LPS": + # 0..447: zeros (448) + # 448..575: left half of short window (128) + # 576..1023: ones (448) + # 1024..2047: right half of long window (1024) + out = np.zeros(2048, dtype=np.float64) + out[0:448] = 0.0 + out[448:576] = wS[0:128] + out[576:1024] = 1.0 + out[1024:2048] = wL[1024:2048] + return out + + raise ValueError(f"Invalid frame_type for long window sequence: {frame_type!r}") + + +def _filter_bank_esh_channel(x_ch: FrameChannelT, win_type: WinType) -> FrameChannelF: + """ + ESH analysis for one channel. + + Parameters + ---------- + x_ch : FrameChannelT + Time-domain channel frame (expected shape: (2048,)). + win_type : WinType + Window family ("KBD" or "SIN"). + + Returns + ------- + FrameChannelF + Array of shape (128, 8). Column j contains the 128 MDCT coefficients + of the j-th short window. 
+ """ + wS = _short_window(win_type) # (256,) + X_esh = np.empty((128, 8), dtype=np.float64) + + # ESH subwindows are taken from the central region: + # start positions: 448 + 128*j, j = 0..7 + for j in range(8): + start = 448 + 128 * j + seg = x_ch[start:start + 256] * wS # (256,) + X_esh[:, j] = mdct(seg) # (128,) + + return X_esh + + +def _unpack_esh(frame_F: FrameF) -> tuple[FrameChannelF, FrameChannelF]: + """ + Unpack ESH spectrum from shape (128, 16) into per-channel arrays (128, 8). + + Parameters + ---------- + frame_F : FrameF + Packed ESH spectrum (expected shape: (128, 16)). + + Returns + ------- + left : FrameChannelF + Left channel spectrum, shape (128, 8). + right : FrameChannelF + Right channel spectrum, shape (128, 8). + + Notes + ----- + Inverse mapping of the packing used in aac_filter_bank(): + packed[:, 2*j] = left[:, j] + packed[:, 2*j+1] = right[:, j] + """ + if frame_F.shape != (128, 16): + raise ValueError("ESH frame_F must have shape (128, 16).") + + left = np.empty((128, 8), dtype=np.float64) + right = np.empty((128, 8), dtype=np.float64) + for j in range(8): + left[:, j] = frame_F[:, 2 * j + 0] + right[:, j] = frame_F[:, 2 * j + 1] + return left, right + + +def _i_filter_bank_esh_channel(X_esh: FrameChannelF, win_type: WinType) -> FrameChannelT: + """ + ESH synthesis for one channel. + + Parameters + ---------- + X_esh : FrameChannelF + MDCT coefficients for 8 short windows (expected shape: (128, 8)). + win_type : WinType + Window family ("KBD" or "SIN"). + + Returns + ------- + FrameChannelT + Time-domain channel contribution, shape (2048,). + This is already overlap-added internally for the 8 short blocks and + ready for OLA at the caller level. + """ + if X_esh.shape != (128, 8): + raise ValueError("X_esh must have shape (128, 8).") + + wS = _short_window(win_type) # (256,) + out = np.zeros(2048, dtype=np.float64) + + # Each short IMDCT returns 256 samples. 
Place them at: + # start = 448 + 128*j, j=0..7 (50% overlap) + for j in range(8): + seg = imdct(X_esh[:, j]) * wS # (256,) + start = 448 + 128 * j + out[start:start + 256] += seg + + return out + + +# ----------------------------------------------------------------------------- +# Public Function prototypes +# ----------------------------------------------------------------------------- + +def aac_filter_bank(frame_T: FrameT, frame_type: FrameType, win_type: WinType) -> FrameF: + """ + Filterbank stage (MDCT analysis). + + Parameters + ---------- + frame_T : FrameT + Time-domain frame, stereo, shape (2048, 2). + frame_type : FrameType + Type of the frame under encoding ("OLS"|"LSS"|"ESH"|"LPS"). + win_type : WinType + Window type ("KBD" or "SIN") used for the current frame. + + Returns + ------- + frame_F : FrameF + Frequency-domain MDCT coefficients: + - If frame_type in {"OLS","LSS","LPS"}: array shape (1024, 2) + containing MDCT coefficients for both channels. + - If frame_type == "ESH": contains 8 subframes, each subframe has shape (128,2), + placed in columns according to subframe order, i.e. overall shape (128, 16). 
+ """ + if frame_T.shape != (2048, 2): + raise ValueError("frame_T must have shape (2048, 2).") + + xL :FrameChannelT = frame_T[:, 0].astype(np.float64, copy=False) + xR :FrameChannelT = frame_T[:, 1].astype(np.float64, copy=False) + + if frame_type in ("OLS", "LSS", "LPS"): + w = _window_sequence(frame_type, win_type) # length 2048 + XL = mdct(xL * w) # length 1024 + XR = mdct(xR * w) # length 1024 + out = np.empty((1024, 2), dtype=np.float64) + out[:, 0] = XL + out[:, 1] = XR + return out + + if frame_type == "ESH": + Xl = _filter_bank_esh_channel(xL, win_type) # (128, 8) + Xr = _filter_bank_esh_channel(xR, win_type) # (128, 8) + + # Pack into (128, 16): each subframe as (128,2) placed in columns + out = np.empty((128, 16), dtype=np.float64) + for j in range(8): + out[:, 2 * j + 0] = Xl[:, j] + out[:, 2 * j + 1] = Xr[:, j] + return out + + raise ValueError(f"Invalid frame_type: {frame_type!r}") + + +def aac_i_filter_bank(frame_F: FrameF, frame_type: FrameType, win_type: WinType) -> FrameT: + """ + Inverse filterbank (IMDCT synthesis). + + Parameters + ---------- + frame_F : FrameF + Frequency-domain MDCT coefficients as produced by filter_bank(). + frame_type : FrameType + Frame type ("OLS"|"LSS"|"ESH"|"LPS"). + win_type : WinType + Window type ("KBD" or "SIN"). + + Returns + ------- + frame_T : FrameT + Reconstructed time-domain frame, stereo, shape (2048, 2). 
+ """ + if frame_type in ("OLS", "LSS", "LPS"): + if frame_F.shape != (1024, 2): + raise ValueError("For OLS/LSS/LPS, frame_F must have shape (1024, 2).") + + w = _window_sequence(frame_type, win_type) + + xL = imdct(frame_F[:, 0]) * w + xR = imdct(frame_F[:, 1]) * w + + out = np.empty((2048, 2), dtype=np.float64) + out[:, 0] = xL + out[:, 1] = xR + return out + + if frame_type == "ESH": + if frame_F.shape != (128, 16): + raise ValueError("For ESH, frame_F must have shape (128, 16).") + + Xl, Xr = _unpack_esh(frame_F) + xL = _i_filter_bank_esh_channel(Xl, win_type) + xR = _i_filter_bank_esh_channel(Xr, win_type) + + out = np.empty((2048, 2), dtype=np.float64) + out[:, 0] = xL + out[:, 1] = xR + return out + + raise ValueError(f"Invalid frame_type: {frame_type!r}") diff --git a/source/level_3/core/aac_huffman.py b/source/level_3/core/aac_huffman.py new file mode 100644 index 0000000..3d2eeca --- /dev/null +++ b/source/level_3/core/aac_huffman.py @@ -0,0 +1,112 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Huffman wrappers (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Thin wrappers around the provided Huffman utilities (material/huff_utils.py) +# so that the API matches the assignment text. +# +# Exposed API (assignment): +# huff_sec, huff_codebook = aac_encode_huff(coeff_sec, huff_LUT_list, force_codebook) +# dec_coeffs = aac_decode_huff(huff_sec, huff_codebook, huff_LUT_list) +# +# Notes: +# - Huffman coding operates on tuples. Therefore, decode(encode(x)) may return +# extra trailing symbols due to tuple padding. The AAC decoder knows the +# true section length from side information (band limits) and truncates. 
+# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Any +import numpy as np + +from material.huff_utils import encode_huff, decode_huff + + +def aac_encode_huff( + coeff_sec: np.ndarray, + huff_LUT_list: list[dict[str, Any]], + force_codebook: int | None = None, +) -> tuple[str, int]: + """ + Huffman-encode a section of coefficients (MDCT symbols or scalefactors). + + Parameters + ---------- + coeff_sec : np.ndarray + Coefficient section to be encoded. Any shape is accepted; the input + is flattened and treated as a 1-D sequence of int64 symbols. + huff_LUT_list : list[dict[str, Any]] + List of Huffman Look-Up Tables (LUTs) as returned by material.load_LUT(). + Index corresponds to codebook id (typically 1..11, with 0 reserved). + force_codebook : int | None + If provided, forces the use of this Huffman codebook. In the assignment, + scalefactors are encoded with codebook 11. For MDCT coefficients, this + argument is usually omitted (auto-selection). + + Returns + ------- + tuple[str, int] + (huff_sec, huff_codebook) + - huff_sec: bitstream as a string of '0'/'1' + - huff_codebook: codebook id used by the encoder + """ + coeff_sec_arr = np.asarray(coeff_sec, dtype=np.int64).reshape(-1) + + if force_codebook is None: + # Provided utility returns (bitstream, codebook) in the auto-selection case. + huff_sec, huff_codebook = encode_huff(coeff_sec_arr, huff_LUT_list) + return str(huff_sec), int(huff_codebook) + + # Provided utility returns ONLY the bitstream when force_codebook is set. + cb = int(force_codebook) + huff_sec = encode_huff(coeff_sec_arr, huff_LUT_list, force_codebook=cb) + return str(huff_sec), cb + + +def aac_decode_huff( + huff_sec: str | np.ndarray, + huff_codebook: int, + huff_LUT: list[dict[str, Any]], +) -> np.ndarray: + """ + Huffman-decode a bitstream using the specified codebook. + + Parameters + ---------- + huff_sec : str | np.ndarray + Huffman bitstream. 
Typically a string of '0'/'1'. If an array is provided, + it is passed through to the provided decoder. + huff_codebook : int + Codebook id that was returned by aac_encode_huff. + Codebook 0 represents an all-zero section. + huff_LUT : list[dict[str, Any]] + Huffman LUT list as returned by material.load_LUT(). + + Returns + ------- + np.ndarray + Decoded coefficients as a 1-D np.int64 array. + + Note: Due to tuple coding, the decoded array may contain extra trailing + padding symbols. The caller must truncate to the known section length. + """ + cb = int(huff_codebook) + + if cb == 0: + # Codebook 0 represents an all-zero section. The decoded length is not + # recoverable from the bitstream alone; the caller must expand/truncate. + return np.zeros((0,), dtype=np.int64) + + if cb < 0 or cb >= len(huff_LUT): + raise ValueError(f"Invalid Huffman codebook index: {cb}") + + lut = huff_LUT[cb] + dec = decode_huff(huff_sec, lut) + return np.asarray(dec, dtype=np.int64).reshape(-1) diff --git a/source/level_3/core/aac_psycho.py b/source/level_3/core/aac_psycho.py new file mode 100644 index 0000000..4ecaafc --- /dev/null +++ b/source/level_3/core/aac_psycho.py @@ -0,0 +1,441 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Psychoacoustic Model +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Psychoacoustic model for ONE channel, based on the assignment notes (Section 2.4). 
+# +# Public API: +# SMR = aac_psycho(frame_T, frame_type, frame_T_prev_1, frame_T_prev_2) +# +# Output: +# - For long frames ("OLS", "LSS", "LPS"): SMR has shape (69,) +# - For short frames ("ESH"): SMR has shape (42, 8) (one column per subframe) +# +# Notes: +# - Uses Bark band tables from material/TableB219.mat: +# * B219a for long windows (69 bands, N=2048 FFT, N/2=1024 bins) +# * B219b for short windows (42 bands, N=256 FFT, N/2=128 bins) +# - Applies a Hann window in time domain before FFT magnitude/phase extraction. +# - Implements: +# spreading function -> band spreading -> tonality index -> masking thresholds -> SMR. +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np + +from core.aac_utils import band_limits, get_table +from core.aac_configuration import NMT_DB, TMN_DB +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Spreading function +# ----------------------------------------------------------------------------- + +def _spreading_matrix(bval: BandValueArray) -> FloatArray: + """ + Compute the spreading function matrix between psychoacoustic bands. + + The spreading function describes how energy in one critical band masks + nearby bands. The formula follows the assignment pseudo-code. + + Parameters + ---------- + bval : BandValueArray + Bark value per band, shape (B,). + + Returns + ------- + FloatArray + Spreading matrix S of shape (B, B), where: + S[bb, b] quantifies the contribution of band bb masking band b. 
+ """ + bval = np.asarray(bval, dtype=np.float64).reshape(-1) + B = int(bval.shape[0]) + + spread = np.zeros((B, B), dtype=np.float64) + + for b in range(B): + for bb in range(B): + # tmpx depends on direction (asymmetric spreading) + if bb >= b: + tmpx = 3.0 * (bval[bb] - bval[b]) + else: + tmpx = 1.5 * (bval[bb] - bval[b]) + + # tmpz uses the "min(..., 0)" nonlinearity exactly as in the notes + tmpz = 8.0 * min((tmpx - 0.5) ** 2 - 2.0 * (tmpx - 0.5), 0.0) + tmpy = 15.811389 + 7.5 * (tmpx + 0.474) - 17.5 * np.sqrt(1.0 + (tmpx + 0.474) ** 2) + + # Clamp very small values (below -100 dB) to 0 contribution + if tmpy < -100.0: + spread[bb, b] = 0.0 + else: + spread[bb, b] = 10.0 ** ((tmpz + tmpy) / 10.0) + + return spread + + +# ----------------------------------------------------------------------------- +# Windowing + FFT feature extraction +# ----------------------------------------------------------------------------- + +def _hann_window(N: int) -> FloatArray: + """ + Hann window as specified in the notes: + w[n] = 0.5 - 0.5*cos(2*pi*(n + 0.5)/N) + + Parameters + ---------- + N : int + Window length. + + Returns + ------- + FloatArray + 1-D array of shape (N,), dtype float64. + """ + n = np.arange(N, dtype=np.float64) + return 0.5 - 0.5 * np.cos((2.0 * np.pi / N) * (n + 0.5)) + + +def _r_phi_from_time(x: FrameChannelT, N: int) -> tuple[FloatArray, FloatArray]: + """ + Compute FFT magnitude r(w) and phase phi(w) for bins w = 0 .. N/2-1. + + Processing: + 1) Apply Hann window in time domain. + 2) Compute N-point FFT. + 3) Keep only the positive-frequency bins [0 .. N/2-1]. + + Parameters + ---------- + x : FrameChannelT + Time-domain samples, shape (N,). + N : int + FFT size (2048 or 256). + + Returns + ------- + r : FloatArray + Magnitude spectrum for bins 0 .. N/2-1, shape (N/2,). + phi : FloatArray + Phase spectrum for bins 0 .. N/2-1, shape (N/2,). 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + if x.shape[0] != N: + raise ValueError(f"Expected time vector of length {N}, got {x.shape[0]}.") + + w = _hann_window(N) + X = np.fft.fft(x * w, n=N) + + Xp = X[: N // 2] + r = np.abs(Xp).astype(np.float64, copy=False) + phi = np.angle(Xp).astype(np.float64, copy=False) + return r, phi + + +def _predictability( + r: FloatArray, + phi: FloatArray, + r_m1: FloatArray, + phi_m1: FloatArray, + r_m2: FloatArray, + phi_m2: FloatArray, +) -> FloatArray: + """ + Compute predictability c(w) per spectral bin. + + The notes define: + r_pred(w) = 2*r_{-1}(w) - r_{-2}(w) + phi_pred(w) = 2*phi_{-1}(w) - phi_{-2}(w) + + c(w) = |X(w) - X_pred(w)| / (r(w) + |r_pred(w)|) + + where X(w) is represented in polar form using r(w), phi(w). + + Parameters + ---------- + r, phi : FloatArray + Current magnitude and phase, shape (N/2,). + r_m1, phi_m1 : FloatArray + Previous magnitude and phase, shape (N/2,). + r_m2, phi_m2 : FloatArray + Pre-previous magnitude and phase, shape (N/2,). + + Returns + ------- + FloatArray + Predictability c(w), shape (N/2,). + """ + r_pred = 2.0 * r_m1 - r_m2 + phi_pred = 2.0 * phi_m1 - phi_m2 + + num = np.sqrt( + (r * np.cos(phi) - r_pred * np.cos(phi_pred)) ** 2 + + (r * np.sin(phi) - r_pred * np.sin(phi_pred)) ** 2 + ) + den = r + np.abs(r_pred) + 1e-12 # avoid division-by-zero without altering behavior + return (num / den).astype(np.float64, copy=False) + + +# ----------------------------------------------------------------------------- +# Band-domain aggregation +# ----------------------------------------------------------------------------- + +def _band_energy_and_weighted_predictability( + r: FloatArray, + c: FloatArray, + wlow: BandIndexArray, + whigh: BandIndexArray, +) -> tuple[FloatArray, FloatArray]: + """ + Aggregate spectral bin quantities into psychoacoustic bands. 
+ + Definitions (notes): + e(b) = sum_{w=wlow(b)..whigh(b)} r(w)^2 + c_num(b) = sum_{w=wlow(b)..whigh(b)} c(w) * r(w)^2 + + The band predictability c(b) is later computed after spreading as: + cb(b) = ct(b) / ecb(b) + + Parameters + ---------- + r : FloatArray + Magnitude spectrum, shape (N/2,). + c : FloatArray + Predictability per bin, shape (N/2,). + wlow, whigh : BandIndexArray + Band limits (inclusive indices), shape (B,). + + Returns + ------- + e_b : FloatArray + Band energies e(b), shape (B,). + c_num_b : FloatArray + Weighted predictability numerators c_num(b), shape (B,). + """ + r2 = (r * r).astype(np.float64, copy=False) + + B = int(wlow.shape[0]) + e_b = np.zeros(B, dtype=np.float64) + c_num_b = np.zeros(B, dtype=np.float64) + + for b in range(B): + a = int(wlow[b]) + z = int(whigh[b]) + + seg_r2 = r2[a : z + 1] + e_b[b] = float(np.sum(seg_r2)) + c_num_b[b] = float(np.sum(c[a : z + 1] * seg_r2)) + + return e_b, c_num_b + + +def _psycho_one_window( + time_x: FrameChannelT, + prev1_x: FrameChannelT, + prev2_x: FrameChannelT, + *, + N: int, + table: BarkTable, +) -> FloatArray: + """ + Compute SMR for one FFT analysis window (N=2048 for long, N=256 for short). + + This implements the pipeline described in the notes: + - FFT magnitude/phase + - predictability per bin + - band energies and predictability + - band spreading + - tonality index tb(b) + - masking threshold (noise + threshold in quiet) + - SMR(b) = e(b) / np(b) + + Parameters + ---------- + time_x : FrameChannelT + Current time-domain samples, shape (N,). + prev1_x : FrameChannelT + Previous time-domain samples, shape (N,). + prev2_x : FrameChannelT + Pre-previous time-domain samples, shape (N,). + N : int + FFT size. + table : BarkTable + Psychoacoustic band table (B219a or B219b). + + Returns + ------- + FloatArray + SMR per band, shape (B,). 
+ """ + wlow, whigh, bval, qthr_db = band_limits(table) + spread = _spreading_matrix(bval) + + # FFT features for current and history windows + r, phi = _r_phi_from_time(time_x, N) + r_m1, phi_m1 = _r_phi_from_time(prev1_x, N) + r_m2, phi_m2 = _r_phi_from_time(prev2_x, N) + + # Predictability per bin + c_w = _predictability(r, phi, r_m1, phi_m1, r_m2, phi_m2) + + # Aggregate into psycho bands + e_b, c_num_b = _band_energy_and_weighted_predictability(r, c_w, wlow, whigh) + + # Spread energies and predictability across bands: + # ecb(b) = sum_bb e(bb) * S(bb, b) + # ct(b) = sum_bb c_num(bb) * S(bb, b) + ecb = spread.T @ e_b + ct = spread.T @ c_num_b + + # Band predictability after spreading: cb(b) = ct(b) / ecb(b) + cb = ct / (ecb + 1e-12) + + # Normalized energy term: + # en(b) = ecb(b) / sum_bb S(bb, b) + spread_colsum = np.sum(spread, axis=0) + en = ecb / (spread_colsum + 1e-12) + + # Tonality index (clamped to [0, 1]) + tb = -0.299 - 0.43 * np.log(np.maximum(cb, 1e-12)) + tb = np.clip(tb, 0.0, 1.0) + + # Required SNR per band (dB): interpolate between TMN and NMT + snr_b = tb * TMN_DB + (1.0 - tb) * NMT_DB + bc = 10.0 ** (-snr_b / 10.0) + + # Noise masking threshold estimate (power domain) + nb = en * bc + + # Threshold in quiet (convert from dB to power domain): + # qthr_power = eps * (N/2) * 10^(qthr_db/10) + qthr_power = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0)) + + # Final masking threshold per band: + # np(b) = max(nb(b), qthr(b)) + npart = np.maximum(nb, qthr_power) + + # Signal-to-mask ratio: + # SMR(b) = e(b) / np(b) + smr = e_b / (npart + 1e-12) + return smr.astype(np.float64, copy=False) + + +# ----------------------------------------------------------------------------- +# ESH window slicing (match filterbank conventions) +# ----------------------------------------------------------------------------- + +def _esh_subframes_256(x_2048: FrameChannelT) -> list[FrameChannelT]: + """ + Extract the 8 overlapping 256-sample short windows 
used by AAC ESH. + + The project convention (matching the filterbank) is: + start_j = 448 + 128*j, for j = 0..7 + subframe_j = x[start_j : start_j + 256] + + This selects the central 1152-sample region [448, 1600) and produces + 8 windows with 50% overlap. + + Parameters + ---------- + x_2048 : FrameChannelT + Time-domain channel frame, shape (2048,). + + Returns + ------- + list[FrameChannelT] + List of 8 subframes, each of shape (256,). + """ + x_2048 = np.asarray(x_2048, dtype=np.float64).reshape(-1) + if x_2048.shape[0] != 2048: + raise ValueError("ESH requires 2048-sample input frames.") + + subs: list[FrameChannelT] = [] + for j in range(8): + start = 448 + 128 * j + subs.append(x_2048[start : start + 256]) + return subs + + +# ----------------------------------------------------------------------------- +# Public API +# ----------------------------------------------------------------------------- + +def aac_psycho( + frame_T: FrameChannelT, + frame_type: FrameType, + frame_T_prev_1: FrameChannelT, + frame_T_prev_2: FrameChannelT, +) -> FloatArray: + """ + Psychoacoustic model for ONE channel. + + Parameters + ---------- + frame_T : FrameChannelT + Current time-domain channel frame, shape (2048,). + For "ESH", the 8 short windows are derived internally. + frame_type : FrameType + AAC frame type ("OLS", "LSS", "ESH", "LPS"). + frame_T_prev_1 : FrameChannelT + Previous time-domain channel frame, shape (2048,). + frame_T_prev_2 : FrameChannelT + Pre-previous time-domain channel frame, shape (2048,). + + Returns + ------- + FloatArray + Signal-to-Mask Ratio (SMR), per psychoacoustic band. 
+ - If frame_type == "ESH": shape (42, 8) + - Else: shape (69,) + """ + frame_T = np.asarray(frame_T, dtype=np.float64).reshape(-1) + frame_T_prev_1 = np.asarray(frame_T_prev_1, dtype=np.float64).reshape(-1) + frame_T_prev_2 = np.asarray(frame_T_prev_2, dtype=np.float64).reshape(-1) + + if frame_T.shape[0] != 2048 or frame_T_prev_1.shape[0] != 2048 or frame_T_prev_2.shape[0] != 2048: + raise ValueError("aac_psycho expects 2048-sample frames for current/prev1/prev2.") + + table, N = get_table(frame_type) + + # Long frame types: compute one SMR vector (69 bands) + if frame_type != "ESH": + return _psycho_one_window(frame_T, frame_T_prev_1, frame_T_prev_2, N=N, table=table) + + # ESH: compute 8 SMR vectors (42 bands each), one per short subframe. + # + # The notes use short-window history for predictability: + # - For j=0: use previous frame's subframes (7, 6) + # - For j=1: use current subframe 0 and previous frame's subframe 7 + # - For j>=2: use current subframes (j-1, j-2) + # + # This matches the "within-frame history" convention commonly used in + # simplified psycho models for ESH. 
+ cur_subs = _esh_subframes_256(frame_T) + prev1_subs = _esh_subframes_256(frame_T_prev_1) + + B = int(table.shape[0]) # expected 42 + smr_out = np.zeros((B, 8), dtype=np.float64) + + for j in range(8): + if j == 0: + x_m1 = prev1_subs[7] + x_m2 = prev1_subs[6] + elif j == 1: + x_m1 = cur_subs[0] + x_m2 = prev1_subs[7] + else: + x_m1 = cur_subs[j - 1] + x_m2 = cur_subs[j - 2] + + smr_out[:, j] = _psycho_one_window(cur_subs[j], x_m1, x_m2, N=256, table=table) + + return smr_out diff --git a/source/level_3/core/aac_quantizer.py b/source/level_3/core/aac_quantizer.py new file mode 100644 index 0000000..addcfe8 --- /dev/null +++ b/source/level_3/core/aac_quantizer.py @@ -0,0 +1,604 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Quantizer / iQuantizer (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Implements AAC quantizer and inverse quantizer for one channel. +# Based on assignment section 2.6 (Eq. 12-15). +# +# Notes: +# - Bit reservoir is not implemented (assignment simplification). +# - Scalefactor bands are assumed equal to psychoacoustic bands +# (Table B.2.1.9a / B.2.1.9b from TableB219.mat). 
+# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np + +from core.aac_utils import get_table, band_limits +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Constants (assignment) +# ----------------------------------------------------------------------------- +MAGIC_NUMBER: float = 0.4054 +MQ: int = 8191 + +EPS: float = 1e-12 +MAX_SFC_DIFF: int = 60 + +# Safeguard: prevents infinite loops in pathological cases +MAX_ALPHA_ITERS: int = 2000 + + +# ----------------------------------------------------------------------------- +# Helpers: ESH packing/unpacking (128x8 <-> 1024x1) +# ----------------------------------------------------------------------------- +def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray: + """ + Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1). + + Packing order: + Columns are concatenated in subframe order (0..7), column-major. + + Parameters + ---------- + x_128x8 : FloatArray + ESH coefficients, shape (128, 8). + + Returns + ------- + FloatArray + Packed coefficients, shape (1024, 1). + """ + x_128x8 = np.asarray(x_128x8, dtype=np.float64) + if x_128x8.shape != (128, 8): + raise ValueError("ESH pack expects shape (128, 8).") + return x_128x8.reshape(1024, 1, order="F") + + +def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray: + """ + Unpack a packed ESH vector (1024 elements) back to shape (128, 8). + + Parameters + ---------- + x_1024x1 : FloatArray + Packed ESH vector, shape (1024,) or (1024, 1) after flattening. + + Returns + ------- + FloatArray + Unpacked ESH coefficients, shape (128, 8). 
+ """ + x_1024x1 = np.asarray(x_1024x1, dtype=np.float64).reshape(-1) + if x_1024x1.shape[0] != 1024: + raise ValueError("ESH unpack expects 1024 elements.") + return x_1024x1.reshape(128, 8, order="F") + + +# ----------------------------------------------------------------------------- +# Core quantizer formulas (Eq. 12, Eq. 13) +# ----------------------------------------------------------------------------- +def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols: + """ + Quantize MDCT coefficients to integer symbols S(k). + + Implements Eq. (12): + S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER ) + + Parameters + ---------- + x : FloatArray + MDCT coefficients for a contiguous set of spectral lines. + Shape: (N,) + alpha : float + Scalefactor gain for the corresponding scalefactor band. + + Returns + ------- + QuantizedSymbols + Quantized symbols S(k) as int64, shape (N,). + """ + x = np.asarray(x, dtype=np.float64) + + scale = 2.0 ** (-0.25 * float(alpha)) + ax = np.abs(x) * scale + + y = np.power(ax, 0.75, dtype=np.float64) + + # "int" in the assignment corresponds to truncation. + q = np.floor(y + MAGIC_NUMBER).astype(np.int64) + return (np.sign(x).astype(np.int64) * q).astype(np.int64) + + +def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray: + """ + Inverse quantizer (dequantization of symbols). + + Implements Eq. (13): + Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4) + + Parameters + ---------- + S : QuantizedSymbols + Quantized symbols S(k), int64, shape (N,). + alpha : float + Scalefactor gain for the corresponding scalefactor band. + + Returns + ------- + FloatArray + Reconstructed MDCT coefficients Xhat(k), float64, shape (N,). 
+ """ + S = np.asarray(S, dtype=np.int64) + + scale = 2.0 ** (0.25 * float(alpha)) + aS = np.abs(S).astype(np.float64) + y = np.power(aS, 4.0 / 3.0, dtype=np.float64) + + return (np.sign(S).astype(np.float64) * y * scale).astype(np.float64) + + +# ----------------------------------------------------------------------------- +# Alpha initialization (Eq. 14) +# ----------------------------------------------------------------------------- +def _alpha_initial_from_frame_max(x_all: FloatArray) -> int: + """ + Compute the initial scalefactor gain alpha_hat for a frame. + + Implements Eq. (14): + alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ ) + + The same initial value is used for all bands before the per-band refinement. + + Parameters + ---------- + x_all : FloatArray + All MDCT coefficients of a frame (one channel), flattened. + + Returns + ------- + int + Initial alpha value (integer). + """ + x_all = np.asarray(x_all, dtype=np.float64).reshape(-1) + if x_all.size == 0: + return 0 + + max_abs = float(np.max(np.abs(x_all))) + if max_abs <= 0.0: + return 0 + + val = (max_abs ** 0.75) / float(MQ) + if val <= 0.0: + return 0 + + alpha_hat = (16.0 / 3.0) * np.log2(val) + return int(np.floor(alpha_hat)) + + +# ----------------------------------------------------------------------------- +# Band utilities +# ----------------------------------------------------------------------------- +def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]: + """ + Return scalefactor band ranges [wlow, whigh] (inclusive) for the given frame type. + + These are derived from the psychoacoustic tables (TableB219), + and map directly to MDCT indices: + - long: 0..1023 + - short (ESH subframe): 0..127 + + Parameters + ---------- + frame_type : FrameType + Frame type ("OLS", "LSS", "ESH", "LPS"). + + Returns + ------- + list[tuple[int, int]] + List of (lo, hi) inclusive index pairs for each band. 
+ """ + table, _Nfft = get_table(frame_type) + wlow, whigh, _bval, _qthr_db = band_limits(table) + + bands: list[tuple[int, int]] = [] + for lo, hi in zip(wlow, whigh): + bands.append((int(lo), int(hi))) + return bands + + +def _band_energy(x: FloatArray, lo: int, hi: int) -> float: + """ + Compute energy of a spectral segment x[lo:hi+1]. + + Parameters + ---------- + x : FloatArray + MDCT coefficient vector. + lo, hi : int + Inclusive index range. + + Returns + ------- + float + Sum of squares (energy) within the band. + """ + sec = x[lo : hi + 1] + return float(np.sum(sec * sec)) + + +def _threshold_T_from_SMR( + X: FloatArray, + SMR_col: FloatArray, + bands: list[tuple[int, int]], +) -> FloatArray: + """ + Compute psychoacoustic thresholds T(b) per band. + + Uses: + P(b) = sum_{k in band} X(k)^2 + T(b) = P(b) / SMR(b) + + Parameters + ---------- + X : FloatArray + MDCT coefficients for a frame (long) or one ESH subframe (short). + SMR_col : FloatArray + SMR values for this frame/subframe, shape (NB,). + bands : list[tuple[int, int]] + Band index ranges. + + Returns + ------- + FloatArray + Threshold vector T(b), shape (NB,). + """ + nb = len(bands) + T = np.zeros((nb,), dtype=np.float64) + + for b, (lo, hi) in enumerate(bands): + P = _band_energy(X, lo, hi) + smr = float(SMR_col[b]) + if smr <= EPS: + T[b] = 0.0 + else: + T[b] = P / smr + + return T + + +# ----------------------------------------------------------------------------- +# Alpha selection per band + neighbor-difference constraint +# ----------------------------------------------------------------------------- +def _best_alpha_for_band( + X: FloatArray, + lo: int, + hi: int, + T_b: float, + alpha0: int, + alpha_min: int, + alpha_max: int, +) -> int: + """ + Determine the band-wise scalefactor alpha(b) by iteratively increasing alpha. 
+ + The algorithm increases alpha until the quantization error power exceeds + the band threshold: + + P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2 + select the largest alpha such that P_e(b) <= T(b) + + Parameters + ---------- + X : FloatArray + Full MDCT vector (one frame or one subframe), shape (N,). + lo, hi : int + Inclusive MDCT index range defining the band. + T_b : float + Psychoacoustic threshold for this band. + alpha0 : int + Initial alpha value for the band. + alpha_min, alpha_max : int + Bounds for alpha, used as a safeguard. + + Returns + ------- + int + Selected integer alpha(b). + """ + if T_b <= 0.0: + return int(alpha0) + + Xsec = X[lo : hi + 1] + + alpha = int(alpha0) + alpha = max(alpha_min, min(alpha, alpha_max)) + + # Evaluate at current alpha + Ssec = _quantize_symbol(Xsec, alpha) + Xhat = _dequantize_symbol(Ssec, alpha) + Pe = float(np.sum((Xsec - Xhat) ** 2)) + + if Pe > T_b: + return alpha + + iters = 0 + while iters < MAX_ALPHA_ITERS: + iters += 1 + alpha_next = alpha + 1 + if alpha_next > alpha_max: + break + + Ssec = _quantize_symbol(Xsec, alpha_next) + Xhat = _dequantize_symbol(Ssec, alpha_next) + Pe_next = float(np.sum((Xsec - Xhat) ** 2)) + + if Pe_next > T_b: + break + + alpha = alpha_next + Pe = Pe_next + + return alpha + + +def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray: + """ + Enforce neighbor constraint |alpha[b] - alpha[b-1]| <= max_diff by clamping. + + Uses a forward pass and a backward pass to reduce drift. + + Parameters + ---------- + alpha : np.ndarray + Alpha vector, shape (NB,). + max_diff : int + Maximum allowed absolute difference between adjacent bands. + + Returns + ------- + np.ndarray + Clamped alpha vector, int64, shape (NB,). 
+ """ + a = np.asarray(alpha, dtype=np.int64).copy() + nb = a.shape[0] + + for b in range(1, nb): + lo = int(a[b - 1] - max_diff) + hi = int(a[b - 1] + max_diff) + if a[b] < lo: + a[b] = lo + elif a[b] > hi: + a[b] = hi + + for b in range(nb - 2, -1, -1): + lo = int(a[b + 1] - max_diff) + hi = int(a[b + 1] + max_diff) + if a[b] < lo: + a[b] = lo + elif a[b] > hi: + a[b] = hi + + return a + + +# ----------------------------------------------------------------------------- +# Public API +# ----------------------------------------------------------------------------- +def aac_quantizer( + frame_F: FrameChannelF, + frame_type: FrameType, + SMR: FloatArray, +) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]: + """ + AAC quantizer for one channel (Level 3). + + Quantizes MDCT coefficients using band-wise scalefactors derived from SMR. + + Parameters + ---------- + frame_F : FrameChannelF + MDCT coefficients after TNS, one channel. + Shapes: + - Long frames: (1024,) or (1024, 1) + - ESH: (128, 8) + frame_type : FrameType + AAC frame type ("OLS", "LSS", "ESH", "LPS"). + SMR : FloatArray + Signal-to-Mask Ratio per band. + Shapes: + - Long: (NB,) or (NB, 1) + - ESH: (NB, 8) + + Returns + ------- + S : QuantizedSymbols + Quantized symbols S(k), shape (1024, 1) for all frame types. + sfc : ScaleFactors + DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1). + Shapes: + - Long: (NB, 1) + - ESH: (NB, 8) + G : GlobalGain + Global gain G = alpha(0). 
+ - Long: scalar float + - ESH: array shape (1, 8) + """ + bands = _band_slices(frame_type) + NB = len(bands) + + X = np.asarray(frame_F, dtype=np.float64) + SMR = np.asarray(SMR, dtype=np.float64) + + if frame_type == "ESH": + if X.shape != (128, 8): + raise ValueError("For ESH, frame_F must have shape (128, 8).") + if SMR.shape != (NB, 8): + raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).") + + S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64) + sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64) + G_arr = np.zeros((1, 8), dtype=np.int64) + + for j in range(8): + Xj = X[:, j].reshape(128) + SMRj = SMR[:, j].reshape(NB) + + T = _threshold_T_from_SMR(Xj, SMRj, bands) + + alpha0 = _alpha_initial_from_frame_max(Xj) + alpha = np.full((NB,), alpha0, dtype=np.int64) + + for b, (lo, hi) in enumerate(bands): + alpha[b] = _best_alpha_for_band( + X=Xj, lo=lo, hi=hi, T_b=float(T[b]), + alpha0=int(alpha[b]), + alpha_min=-4096, + alpha_max=4096, + ) + + alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF) + + G_arr[0, j] = int(alpha[0]) + sfc[0, j] = int(alpha[0]) + for b in range(1, NB): + sfc[b, j] = int(alpha[b] - alpha[b - 1]) + + Sj = np.zeros((128,), dtype=np.int64) + for b, (lo, hi) in enumerate(bands): + Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b])) + + # Place this subframe in the packed output (column-major subframe layout) + S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj + + return S_out, sfc, G_arr.astype(np.float64) + + # Long frames + if X.shape == (1024,): + Xv = X + elif X.shape == (1024, 1): + Xv = X[:, 0] + else: + raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).") + + if SMR.shape == (NB,): + SMRv = SMR + elif SMR.shape == (NB, 1): + SMRv = SMR[:, 0] + else: + raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).") + + T = _threshold_T_from_SMR(Xv, SMRv, bands) + + alpha0 = _alpha_initial_from_frame_max(Xv) + alpha = np.full((NB,), alpha0, dtype=np.int64) + + for b, 
(lo, hi) in enumerate(bands): + alpha[b] = _best_alpha_for_band( + X=Xv, lo=lo, hi=hi, T_b=float(T[b]), + alpha0=int(alpha[b]), + alpha_min=-4096, + alpha_max=4096, + ) + + alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF) + + sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64) + sfc[0, 0] = int(alpha[0]) + for b in range(1, NB): + sfc[b, 0] = int(alpha[b] - alpha[b - 1]) + + G: float = float(alpha[0]) + + S_vec = np.zeros((1024,), dtype=np.int64) + for b, (lo, hi) in enumerate(bands): + S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b])) + + return S_vec.reshape(1024, 1), sfc, G + + +def aac_i_quantizer( + S: QuantizedSymbols, + sfc: ScaleFactors, + G: GlobalGain, + frame_type: FrameType, +) -> FrameChannelF: + """ + Inverse quantizer (iQuantizer) for one channel. + + Reconstructs MDCT coefficients from quantized symbols and DPCM scalefactors. + + Parameters + ---------- + S : QuantizedSymbols + Quantized symbols, shape (1024, 1) (or any array with 1024 elements). + sfc : ScaleFactors + DPCM-coded scalefactors. + Shapes: + - Long: (NB, 1) + - ESH: (NB, 8) + G : GlobalGain + Global gain (not strictly required if sfc includes sfc(0)=alpha(0)). + Present for API compatibility with the assignment. + frame_type : FrameType + AAC frame type. 
+ + Returns + ------- + FrameChannelF + Reconstructed MDCT coefficients: + - ESH: (128, 8) + - Long: (1024, 1) + """ + bands = _band_slices(frame_type) + NB = len(bands) + + S_flat = np.asarray(S, dtype=np.int64).reshape(-1) + if S_flat.shape[0] != 1024: + raise ValueError("S must contain 1024 symbols.") + + if frame_type == "ESH": + sfc = np.asarray(sfc, dtype=np.int64) + if sfc.shape != (NB, 8): + raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).") + + S_128x8 = _esh_unpack_from_1024(S_flat) + + Xrec = np.zeros((128, 8), dtype=np.float64) + + for j in range(8): + alpha = np.zeros((NB,), dtype=np.int64) + alpha[0] = int(sfc[0, j]) + for b in range(1, NB): + alpha[b] = int(alpha[b - 1] + sfc[b, j]) + + Xj = np.zeros((128,), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + Xj[lo : hi + 1] = _dequantize_symbol(S_128x8[lo : hi + 1, j].astype(np.int64), float(alpha[b])) + + Xrec[:, j] = Xj + + return Xrec + + sfc = np.asarray(sfc, dtype=np.int64) + if sfc.shape != (NB, 1): + raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).") + + alpha = np.zeros((NB,), dtype=np.int64) + alpha[0] = int(sfc[0, 0]) + for b in range(1, NB): + alpha[b] = int(alpha[b - 1] + sfc[b, 0]) + + Xrec = np.zeros((1024,), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + Xrec[lo : hi + 1] = _dequantize_symbol(S_flat[lo : hi + 1], float(alpha[b])) + + return Xrec.reshape(1024, 1) diff --git a/source/level_3/core/aac_ssc.py b/source/level_3/core/aac_ssc.py new file mode 100644 index 0000000..323e9e3 --- /dev/null +++ b/source/level_3/core/aac_ssc.py @@ -0,0 +1,217 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Sequence Segmentation Control module +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Sequence Segmentation Control module (SSC). 
+# Selects and returns the frame type based on input parameters. +# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Dict, Tuple +from core.aac_types import FrameType, FrameT, FrameChannelT + +import numpy as np + +# ----------------------------------------------------------------------------- +# Private helpers for SSC +# ----------------------------------------------------------------------------- + +# See Table 1 in mm-2025-hw-v0.1.pdf +STEREO_MERGE_TABLE: Dict[Tuple[FrameType, FrameType], FrameType] = { + ("OLS", "OLS"): "OLS", + ("OLS", "LSS"): "LSS", + ("OLS", "ESH"): "ESH", + ("OLS", "LPS"): "LPS", + ("LSS", "OLS"): "LSS", + ("LSS", "LSS"): "LSS", + ("LSS", "ESH"): "ESH", + ("LSS", "LPS"): "ESH", + ("ESH", "OLS"): "ESH", + ("ESH", "LSS"): "ESH", + ("ESH", "ESH"): "ESH", + ("ESH", "LPS"): "ESH", + ("LPS", "OLS"): "LPS", + ("LPS", "LSS"): "ESH", + ("LPS", "ESH"): "ESH", + ("LPS", "LPS"): "LPS", +} + + +def _detect_attack(next_frame_channel: FrameChannelT) -> bool: + """ + Detect whether the *next* frame (single channel) implies an attack, i.e. ESH + according to the assignment's criterion. + + Parameters + ---------- + next_frame_channel : FrameChannelT + One channel of next_frame_T (expected shape: (2048,)). + + Returns + ------- + bool + True if an attack is detected (=> next frame predicted ESH), else False. + + Notes + ----- + The criterion is implemented as described in the spec: + + 1) Apply the high-pass filter: + H(z) = (1 - z^-1) / (1 - 0.5 z^-1) + implemented in the time domain as: + y[n] = x[n] - x[n-1] + 0.5*y[n-1] + + 2) Split y into 16 segments of length 128 and compute segment energies s[l]. + + 3) Compute the ratio: + ds[l] = s[l] / s[l-1] + + 4) An attack exists if there exists l in {1..7} such that: + s[l] > 1e-3 and ds[l] > 10 + """ + # Local alias; expected to be a 1-D array of length 2048. 
+ x = next_frame_channel + + # High-pass filter reference implementation (scalar recurrence). + y = np.zeros_like(x) + prev_x = 0.0 + prev_y = 0.0 + for n in range(x.shape[0]): + xn = float(x[n]) + yn = (xn - prev_x) + 0.5 * prev_y + y[n] = yn + prev_x = xn + prev_y = yn + + # Segment energies over 16 blocks of 128 samples. + s = np.empty(16, dtype=np.float64) + for l in range(16): + a = l * 128 + b = (l + 1) * 128 + seg = y[a:b] + s[l] = float(np.sum(seg * seg)) + + # ds[l] for l>=1. For l=0 not defined, keep 0. + ds = np.zeros(16, dtype=np.float64) + eps = 1e-12 # Avoid division by zero without materially changing the logic. + for l in range(1, 16): + ds[l] = s[l] / max(s[l - 1], eps) + + # Spec: check l in {1..7}. + for l in range(1, 8): + if (s[l] > 1e-3) and (ds[l] > 10.0): + return True + + return False + + +def _decide_frame_type(prev_frame_type: FrameType, attack: bool) -> FrameType: + """ + Decide the current frame type for a single channel based on the previous + frame type and whether the next frame is predicted to be ESH. + + Rules (spec): + + - If prev is "LSS" => current is "ESH" + - If prev is "LPS" => current is "OLS" + - If prev is "OLS" => current is "LSS" if attack else "OLS" + - If prev is "ESH" => current is "ESH" if attack else "LPS" + + Parameters + ---------- + prev_frame_type : FrameType + Previous frame type (one of "OLS", "LSS", "ESH", "LPS"). + attack : bool + True if the next frame is predicted ESH for this channel. + + Returns + ------- + FrameType + The per-channel decision for the current frame. 
+ + """ + if prev_frame_type == "LSS": + return "ESH" + if prev_frame_type == "LPS": + return "OLS" + if prev_frame_type == "OLS": + return "LSS" if attack else "OLS" + if prev_frame_type == "ESH": + return "ESH" if attack else "LPS" + + raise ValueError(f"Invalid prev_frame_type: {prev_frame_type!r}") + + +def _stereo_merge(ft_l: FrameType, ft_r: FrameType) -> FrameType: + """ + Merge per-channel frame type decisions into one common frame type using + the stereo merge table from the spec. + + Parameters + ---------- + ft_l : FrameType + Frame type decision for the left channel. + ft_r : FrameType + Frame type decision for the right channel. + + Returns + ------- + FrameType + The merged common frame type. + """ + try: + return STEREO_MERGE_TABLE[(ft_l, ft_r)] + except KeyError as e: + raise ValueError(f"Invalid stereo merge pair: {(ft_l, ft_r)}") from e + + +# ----------------------------------------------------------------------------- +# Public Function prototypes +# ----------------------------------------------------------------------------- + +def aac_ssc(frame_T: FrameT, next_frame_T: FrameT, prev_frame_type: FrameType) -> FrameType: + """ + Sequence Segmentation Control (SSC). + + Select and return the frame type for the current frame (i) based on: + - the current time-domain frame (stereo), + - the next time-domain frame (stereo), used for attack detection, + - the previous frame type. + + Parameters + ---------- + frame_T : FrameT + Current time-domain frame i (expected shape: (2048, 2)). + next_frame_T : FrameT + Next time-domain frame (i+1), used to decide transitions to/from ESH + (expected shape: (2048, 2)). + prev_frame_type : FrameType + Frame type chosen for the previous frame (i-1). + + Returns + ------- + FrameType + One of: "OLS", "LSS", "ESH", "LPS". 
+ """ + if frame_T.shape != (2048, 2): + raise ValueError("frame_T must have shape (2048, 2).") + if next_frame_T.shape != (2048, 2): + raise ValueError("next_frame_T must have shape (2048, 2).") + + # Detect attack independently per channel on the next frame. + attack_l = _detect_attack(next_frame_T[:, 0]) + attack_r = _detect_attack(next_frame_T[:, 1]) + + # Decide per-channel type based on shared prev_frame_type. + ft_l = _decide_frame_type(prev_frame_type, attack_l) + ft_r = _decide_frame_type(prev_frame_type, attack_r) + + # Stereo merge as per the spec table. + return _stereo_merge(ft_l, ft_r) diff --git a/source/level_3/core/aac_tns.py b/source/level_3/core/aac_tns.py new file mode 100644 index 0000000..06227b1 --- /dev/null +++ b/source/level_3/core/aac_tns.py @@ -0,0 +1,514 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Temporal Noise Shaping (TNS) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Temporal Noise Shaping (TNS) module (Level 2). +# +# Public API: +# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type) +# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs) +# +# Notes (per assignment): +# - TNS is applied per channel (not stereo). +# - For ESH, TNS is applied independently to each of the 8 short subframes. +# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short) +# provided in TableB219.mat. +# - Predictor order is fixed to p = 4. +# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1. +# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p +# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients. 
+# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Tuple + +from core.aac_utils import load_b219_tables +from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Private helpers +# ----------------------------------------------------------------------------- + + +def _band_ranges_for_kcount(k_count: int) -> BandRanges: + """ + Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count. + + Parameters + ---------- + k_count : int + Number of MDCT lines: + - 1024 for long frames + - 128 for short subframes (ESH) + + Returns + ------- + BandRanges (list[tuple[int, int]]) + Each tuple is (start_k, end_k) inclusive. + """ + tables = load_b219_tables() + if k_count == 1024: + tbl = tables["B219a"] + elif k_count == 128: + tbl = tables["B219b"] + else: + raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).") + + start = tbl[:, 1].astype(int) + end = tbl[:, 2].astype(int) + + ranges: list[tuple[int, int]] = [(int(s), int(e)) for s, e in zip(start, end)] + + for s, e in ranges: + if s < 0 or e < s or e >= k_count: + raise ValueError("Invalid band table ranges for given k_count.") + return ranges + + +# ----------------------------------------------------------------------------- +# Core DSP helpers +# ----------------------------------------------------------------------------- + +def _smooth_sw_inplace(sw: MdctCoeffs) -> None: + """ + Smooth Sw(k) to reduce discontinuities between adjacent Bark bands. + + The assignment applies two passes: + - Backward: Sw(k) = (Sw(k) + Sw(k+1))/2 + - Forward: Sw(k) = (Sw(k) + Sw(k-1))/2 + + Parameters + ---------- + sw : MdctCoeffs + 1-D array of length K (float64). Modified in-place. 
+ """ + k_count = int(sw.shape[0]) + + for k in range(k_count - 2, -1, -1): + sw[k] = 0.5 * (sw[k] + sw[k + 1]) + + for k in range(1, k_count): + sw[k] = 0.5 * (sw[k] + sw[k - 1]) + + +def _compute_sw(x: MdctCoeffs) -> MdctCoeffs: + """ + Compute Sw(k) from band energies P(j) and apply boundary smoothing. + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT line array, length K. + + Returns + ------- + MdctCoeffs + Sw(k), 1-D array of length K, float64. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + k_count = int(x.shape[0]) + + bands = _band_ranges_for_kcount(k_count) + sw = np.zeros(k_count, dtype=np.float64) + + for s, e in bands: + seg = x[s : e + 1] + p_j = float(np.sum(seg * seg)) + sw_val = float(np.sqrt(p_j)) + sw[s : e + 1] = sw_val + + _smooth_sw_inplace(sw) + return sw + + +def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs: + """ + Autocorrelation r(m) for m=0..p. + + Parameters + ---------- + x : MdctCoeffs + 1-D signal. + p : int + Maximum lag. + + Returns + ------- + MdctCoeffs + r, shape (p+1,), float64. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + n = int(x.shape[0]) + + r = np.zeros(p + 1, dtype=np.float64) + for m in range(p + 1): + r[m] = float(np.dot(x[m:], x[: n - m])) + return r + + +def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs: + """ + Solve Yule-Walker normal equations for LPC coefficients of order p. + + Parameters + ---------- + xw : MdctCoeffs + 1-D normalized sequence Xw(k). + p : int + Predictor order. + + Returns + ------- + MdctCoeffs + LPC coefficients a[0..p-1], shape (p,), float64. + """ + r = _autocorr(xw, p) + + R = np.empty((p, p), dtype=np.float64) + for i in range(p): + for j in range(p): + R[i, j] = r[abs(i - j)] + + rhs = r[1 : p + 1].reshape(p) + + reg = 1e-12 + R_reg = R + reg * np.eye(p, dtype=np.float64) + + a = np.linalg.solve(R_reg, rhs) + return a + + +def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs: + """ + Quantize LPC coefficients with uniform symmetric quantizer and clamp. 
+ + Parameters + ---------- + a : MdctCoeffs + LPC coefficient array, shape (p,). + + Returns + ------- + MdctCoeffs + Quantized coefficients, shape (p,), float64. + """ + a = np.asarray(a, dtype=np.float64).reshape(-1) + q = np.round(a / QUANT_STEP) * QUANT_STEP + q = np.clip(q, -QUANT_MAX, QUANT_MAX) + return q.astype(np.float64, copy=False) + + +def _is_inverse_stable(a_q: MdctCoeffs) -> bool: + """ + Check stability of the inverse TNS filter H_TNS^{-1}. + + Forward filter: + H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p + + Inverse filter poles are roots of: + A(z) = 1 - a1 z^-1 - ... - ap z^-p + Multiply by z^p: + z^p - a1 z^{p-1} - ... - ap = 0 + + Stability condition: + all roots satisfy |z| < 1. + + Parameters + ---------- + a_q : MdctCoeffs + Quantized predictor coefficients, shape (p,). + + Returns + ------- + bool + True if stable, else False. + """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + + # Polynomial in z: z^p - a1 z^{p-1} - ... - ap + poly = np.empty(p + 1, dtype=np.float64) + poly[0] = 1.0 + poly[1:] = -a_q + + roots = np.roots(poly) + + # Strictly inside unit circle for stability. Add tiny margin for numeric safety. + margin = 1e-12 + return bool(np.all(np.abs(roots) < (1.0 - margin))) + + +def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs: + """ + Make quantized predictor coefficients stable for inverse filtering. + + Policy: + - If already stable: return as-is. + - Else: iteratively shrink coefficients by gamma and re-quantize to the 0.1 grid. + - If still unstable after attempts: fall back to all-zero coefficients (disable TNS). + + Parameters + ---------- + a_q : MdctCoeffs + Quantized predictor coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Stable quantized coefficients, shape (p,). + """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + + if _is_inverse_stable(a_q): + return a_q + + # Try a few shrinking factors. Re-quantize after shrinking to keep coefficients on-grid. 
+ gammas = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1) + + for g in gammas: + cand = _quantize_coeffs(g * a_q) + if _is_inverse_stable(cand): + return cand + + # Last resort: disable TNS for this vector + return np.zeros_like(a_q, dtype=np.float64) + + +def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: + """ + Apply forward TNS FIR filter: + y[k] = x[k] - sum_{l=1..p} a_l * x[k-l] + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT lines, length K. + a_q : MdctCoeffs + Quantized LPC coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Filtered MDCT lines y, length K. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + k_count = int(x.shape[0]) + + y = np.zeros(k_count, dtype=np.float64) + for k in range(k_count): + acc = x[k] + for l in range(1, p + 1): + if k - l >= 0: + acc -= a_q[l - 1] * x[k - l] + y[k] = acc + return y + + +def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: + """ + Apply inverse TNS IIR filter: + x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l] + + Parameters + ---------- + y : MdctCoeffs + 1-D MDCT lines after TNS, length K. + a_q : MdctCoeffs + Quantized LPC coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Reconstructed MDCT lines x_hat, length K. + """ + y = np.asarray(y, dtype=np.float64).reshape(-1) + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + k_count = int(y.shape[0]) + + x_hat = np.zeros(k_count, dtype=np.float64) + for k in range(k_count): + acc = y[k] + for l in range(1, p + 1): + if k - l >= 0: + acc += a_q[l - 1] * x_hat[k - l] + x_hat[k] = acc + return x_hat + + +def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: + """ + TNS for a single MDCT vector (one long frame or one short subframe). + + Steps: + 1) Compute Sw(k) from Bark band energies and smooth it. + 2) Normalize: Xw(k) = X(k) / Sw(k) (safe when Sw=0). 
+ 3) Compute LPC coefficients (order p=PRED_ORDER) on Xw. + 4) Quantize coefficients (4-bit symmetric, step QUANT_STEP). + 5) Apply FIR filter on original X(k) using quantized coefficients. + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT vector. + + Returns + ------- + y : MdctCoeffs + TNS-processed MDCT vector (same length). + a_q : MdctCoeffs + Quantized LPC coefficients, shape (PRED_ORDER,). + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + sw = _compute_sw(x) + + eps = 1e-12 + xw = np.zeros_like(x, dtype=np.float64) + mask = sw > eps + np.divide(x, sw, out=xw, where=mask) + + a = _lpc_coeffs(xw, PRED_ORDER) + a_q = _quantize_coeffs(a) + + # Ensure inverse stability (assignment requirement) + a_q = _stabilize_quantized_coeffs(a_q) + + y = _apply_tns_fir(x, a_q) + return y, a_q + + + +# ----------------------------------------------------------------------------- +# Public Functions +# ----------------------------------------------------------------------------- + +def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]: + """ + Temporal Noise Shaping (TNS) for ONE channel. + + Parameters + ---------- + frame_F_in : FrameChannelF + Per-channel MDCT coefficients. + Expected (typical) shapes: + - If frame_type == "ESH": (128, 8) + - Else: (1024, 1) or (1024,) + + frame_type : FrameType + Frame type code ("OLS", "LSS", "ESH", "LPS"). + + Returns + ------- + frame_F_out : FrameChannelF + Per-channel MDCT coefficients after applying TNS. + Same shape convention as input. + + tns_coeffs : TnsCoeffs + Quantized TNS predictor coefficients. 
+ Expected shapes: + - If frame_type == "ESH": (PRED_ORDER, 8) + - Else: (PRED_ORDER, 1) + """ + x = np.asarray(frame_F_in, dtype=np.float64) + + if frame_type == "ESH": + if x.shape != (128, 8): + raise ValueError("For ESH, frame_F_in must have shape (128, 8).") + + y = np.empty_like(x, dtype=np.float64) + a_out = np.empty((PRED_ORDER, 8), dtype=np.float64) + + for j in range(8): + y[:, j], a_out[:, j] = _tns_one_vector(x[:, j]) + + return y, a_out + + if x.shape == (1024,): + x_vec = x + out_shape = (1024,) + elif x.shape == (1024, 1): + x_vec = x[:, 0] + out_shape = (1024, 1) + else: + raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') + + y_vec, a_q = _tns_one_vector(x_vec) + + if out_shape == (1024,): + y_out = y_vec + else: + y_out = y_vec.reshape(1024, 1) + + a_out = a_q.reshape(PRED_ORDER, 1) + return y_out, a_out + + +def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF: + """ + Inverse Temporal Noise Shaping (iTNS) for ONE channel. + + Parameters + ---------- + frame_F_in : FrameChannelF + Per-channel MDCT coefficients after TNS. + Expected (typical) shapes: + - If frame_type == "ESH": (128, 8) + - Else: (1024, 1) or (1024,) + + frame_type : FrameType + Frame type code ("OLS", "LSS", "ESH", "LPS"). + + tns_coeffs : TnsCoeffs + Quantized TNS predictor coefficients. + Expected shapes: + - If frame_type == "ESH": (PRED_ORDER, 8) + - Else: (PRED_ORDER, 1) + + Returns + ------- + FrameChannelF + Per-channel MDCT coefficients after inverse TNS. + Same shape convention as input frame_F_in. 
+ """ + x = np.asarray(frame_F_in, dtype=np.float64) + a = np.asarray(tns_coeffs, dtype=np.float64) + + if frame_type == "ESH": + if x.shape != (128, 8): + raise ValueError("For ESH, frame_F_in must have shape (128, 8).") + if a.shape != (PRED_ORDER, 8): + raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).") + + y = np.empty_like(x, dtype=np.float64) + for j in range(8): + y[:, j] = _apply_itns_iir(x[:, j], a[:, j]) + return y + + if a.shape != (PRED_ORDER, 1): + raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).") + + if x.shape == (1024,): + x_vec = x + out_shape = (1024,) + elif x.shape == (1024, 1): + x_vec = x[:, 0] + out_shape = (1024, 1) + else: + raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') + + y_vec = _apply_itns_iir(x_vec, a[:, 0]) + + if out_shape == (1024,): + return y_vec + return y_vec.reshape(1024, 1) diff --git a/source/level_3/core/aac_types.py b/source/level_3/core/aac_types.py new file mode 100644 index 0000000..00508c6 --- /dev/null +++ b/source/level_3/core/aac_types.py @@ -0,0 +1,411 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Public Type Aliases +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module implements Public Type aliases +# ------------------------------------------------------------ +from __future__ import annotations + +from typing import List, Literal, TypeAlias, TypedDict +import numpy as np +from numpy.typing import NDArray + +# ----------------------------------------------------------------------------- +# Code enums (for readability; not intended to enforce shapes/lengths) +# ----------------------------------------------------------------------------- + +FrameType: TypeAlias = Literal["OLS", "LSS", "ESH", "LPS"] +""" +Frame type codes (AAC): +- "OLS": ONLY_LONG_SEQUENCE +- 
"LSS": LONG_START_SEQUENCE +- "ESH": EIGHT_SHORT_SEQUENCE +- "LPS": LONG_STOP_SEQUENCE +""" + +WinType: TypeAlias = Literal["KBD", "SIN"] +""" +Window type codes (AAC): +- "KBD": Kaiser-Bessel-Derived +- "SIN": sinusoid +""" + +ChannelKey: TypeAlias = Literal["chl", "chr"] +"""Channel dictionary keys used in Level payloads.""" + + +# ----------------------------------------------------------------------------- +# Array “semantic” aliases +# +# Goal: communicate meaning (time/frequency/window, stereo/channel) without +# forcing strict shapes in the type system. +# ----------------------------------------------------------------------------- + +FloatArray: TypeAlias = NDArray[np.float64] +""" +Generic float64 NumPy array. + +Note: +- We standardize internal numeric computations to float64 for stability and + reproducibility. External I/O can still be float32, but we convert at the + boundaries. +""" + +Window: TypeAlias = FloatArray +""" +Time-domain window (weighting sequence), 1-D. + +Typical lengths in this assignment: +- Long: 2048 +- Short: 256 +- Window sequences for LSS/LPS are also 2048 + +Expected shape: (N,) +dtype: float64 +""" + +TimeSignal: TypeAlias = FloatArray +""" +Time-domain signal samples, typically 1-D. + +Examples: +- Windowed MDCT input: shape (N,) +- IMDCT output: shape (N,) + +dtype: float64 +""" + +StereoSignal: TypeAlias = FloatArray +""" +Time-domain stereo signal stream. + +Expected (typical) shape: (N, 2) +- axis 0: time samples +- axis 1: channels [L, R] + +dtype: float64 +""" + +MdctCoeffs: TypeAlias = FloatArray +""" +MDCT coefficient vector, typically 1-D. + +Examples: +- Long: shape (1024,) +- Short: shape (128,) + +dtype: float64 +""" + +MdctFrameChannel: TypeAlias = FloatArray +""" +Per-channel MDCT container used in Level-1/2 sequences. 
+ +Typical shapes: +- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,) +- If frame_type == "ESH": (128, 8) (8 short subframes for one channel) + +dtype: float64 + +Notes +----- +Some parts of the assignment store long-frame coefficients as a column vector +(1024, 1) to match MATLAB conventions. Internally you may also use (1024,) +when convenient, but the semantic meaning is identical. +""" + +TnsCoeffs: TypeAlias = FloatArray +""" +Quantized TNS predictor coefficients (one channel). + +Typical shapes (Level 2): +- If frame_type == "ESH": (4, 8) (order p=4 for each of the 8 short subframes) +- Else: (4, 1) (order p=4 for the long frame) + +dtype: float64 + +Notes +----- +The assignment uses a 4-bit uniform symmetric quantizer with step size 0.1. +We store the quantized coefficient values as float64 (typically multiples of 0.1) +to keep the pipeline simple and readable. +""" + + +FrameT: TypeAlias = FloatArray +""" +Time-domain frame (stereo), as used by the filterbank input/output. + +Expected (typical) shape for stereo: (2048, 2) +- axis 0: time samples +- axis 1: channels [L, R] + +dtype: float64 +""" + +FrameChannelT: TypeAlias = FloatArray +""" +Time-domain single-channel frame. + +Expected (typical) shape: (2048,) + +dtype: float64 +""" + +FrameF: TypeAlias = FloatArray +""" +Frequency-domain frame (MDCT coefficients), stereo container. + +Typical shapes (Level 1): +- If frame_type in {"OLS","LSS","LPS"}: (1024, 2) +- If frame_type == "ESH": (128, 16) + +Rationale for ESH (128, 16): +- 8 short subframes per channel => 8 * 2 = 16 columns total +- Each short subframe per stereo is (128, 2), flattened into columns + in subframe order: [sf0_L, sf0_R, sf1_L, sf1_R, ..., sf7_L, sf7_R] + +dtype: float64 +""" + +FrameChannelF: TypeAlias = MdctFrameChannel +""" +Frequency-domain single-channel MDCT coefficients. 
+ +Typical shapes (Level 1/2): +- If frame_type in {"OLS","LSS","LPS"}: (1024, 1) or (1024,) +- If frame_type == "ESH": (128, 8) + +dtype: float64 +""" + +BandRanges: TypeAlias = list[tuple[int, int]] +""" +Bark-band index ranges [start, end] (inclusive) for MDCT lines. + +Used by TNS to map MDCT indices k to Bark bands. +""" + +BarkTable: TypeAlias = FloatArray +""" +Psychoacoustic Bark band table loaded from TableB219.mat. + +Typical shapes: +- Long: (69, 6) +- Short: (42, 6) +""" + +BandIndexArray: TypeAlias = NDArray[np.int_] +""" +Array of FFT bin indices per psychoacoustic band. +""" + +BandValueArray: TypeAlias = FloatArray +""" +Per-band psychoacoustic values (e.g. Bark position, thresholds). +""" + + +# Quantizer-related semantic aliases + +QuantizedSymbols: TypeAlias = NDArray[np.generic] +""" +Quantized MDCT symbols S(k). + +Shapes: +- Always (1024, 1) at the quantizer output (ESH packed to 1024 symbols). +""" + +ScaleFactors: TypeAlias = NDArray[np.generic] +""" +DPCM-coded scalefactors sfc(b) = alpha(b) - alpha(b-1). + +Shapes: +- Long frames: (NB, 1) +- ESH frames: (NB, 8) +""" + +GlobalGain: TypeAlias = float | NDArray[np.generic] +""" +Global gain G = alpha(0). + +- Long frames: scalar float +- ESH frames: array shape (1, 8) +""" + +# Huffman semantic aliases + +HuffmanBitstream: TypeAlias = str +"""Huffman-coded bitstream stored as a string of '0'/'1'.""" + +HuffmanCodebook: TypeAlias = int +"""Huffman codebook id (e.g., 0..11).""" + +# ----------------------------------------------------------------------------- +# Level 1 AAC sequence payload types +# ----------------------------------------------------------------------------- + +class AACChannelFrameF(TypedDict): + """ + Per-channel payload for aac_seq_1[i]["chl"] or ["chr"] (Level 1). + + Keys + ---- + frame_F: + The MDCT coefficients for ONE channel. 
+ Typical shapes: + - ESH: (128, 8) (8 short subframes) + - else: (1024, 1) or (1024,) + """ + frame_F: FrameChannelF + + +class AACSeq1Frame(TypedDict): + """ + One frame dictionary element of aac_seq_1 (Level 1). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF + chr: AACChannelFrameF + + +AACSeq1: TypeAlias = List[AACSeq1Frame] +""" +AAC sequence for Level 1: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" +""" + + +# ----------------------------------------------------------------------------- +# Level 2 AAC sequence payload types (TNS) +# ----------------------------------------------------------------------------- + +class AACChannelFrameF2(TypedDict): + """ + Per-channel payload for aac_seq_2[i]["chl"] or ["chr"] (Level 2). + + Keys + ---- + frame_F: + The TNS-processed MDCT coefficients for ONE channel. + Typical shapes: + - ESH: (128, 8) + - else: (1024, 1) or (1024,) + tns_coeffs: + Quantized TNS predictor coefficients for ONE channel. + Typical shapes: + - ESH: (PRED_ORDER, 8) + - else: (PRED_ORDER, 1) + """ + frame_F: FrameChannelF + tns_coeffs: TnsCoeffs + + +class AACSeq2Frame(TypedDict): + """ + One frame dictionary element of aac_seq_2 (Level 2). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF2 + chr: AACChannelFrameF2 + + +AACSeq2: TypeAlias = List[AACSeq2Frame] +""" +AAC sequence for Level 2: +List of length K (K = number of frames). + +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" + +Level 2 adds: +- per-channel "tns_coeffs" +and stores: +- per-channel "frame_F" after applying TNS. 
+""" + +# ----------------------------------------------------------------------------- +# Level 3 AAC sequence payload types (Quantizer + Huffman) +# ----------------------------------------------------------------------------- + +class AACChannelFrameF3(TypedDict): + """ + Per-channel payload for aac_seq_3[i]["chl"] or ["chr"] (Level 3). + + Keys + ---- + tns_coeffs: + Quantized TNS predictor coefficients for ONE channel. + Shapes: + - ESH: (PRED_ORDER, 8) + - else: (PRED_ORDER, 1) + + T: + Psychoacoustic thresholds per band. + Shapes: + - ESH: (NB, 8) + - else: (NB, 1) + Note: Stored for completeness / debugging; not entropy-coded. + + G: + Quantized global gains. + Shapes: + - ESH: (1, 8) (one per short subframe) + - else: scalar (or compatible np scalar) + + sfc: + Huffman-coded scalefactor differences (DPCM sequence). + + stream: + Huffman-coded MDCT quantized symbols S(k) (packed to 1024 symbols). + + codebook: + Huffman codebook id used for MDCT symbols (stream). + (Scalefactors typically use fixed codebook 11 and do not need to store it.) + """ + tns_coeffs: TnsCoeffs + T: FloatArray + G: FloatArray | float + sfc: HuffmanBitstream + stream: HuffmanBitstream + codebook: HuffmanCodebook + + +class AACSeq3Frame(TypedDict): + """ + One frame dictionary element of aac_seq_3 (Level 3). + """ + frame_type: FrameType + win_type: WinType + chl: AACChannelFrameF3 + chr: AACChannelFrameF3 + + +AACSeq3: TypeAlias = List[AACSeq3Frame] +""" +AAC sequence for Level 3: +List of length K (K = number of frames). 
+ +Each element is a dict with keys: +- "frame_type", "win_type", "chl", "chr" + +Level 3 adds (per channel): +- "tns_coeffs" +- "T" thresholds (not entropy-coded) +- "G" global gain(s) +- "sfc" Huffman-coded scalefactor differences +- "stream" Huffman-coded MDCT quantized symbols +- "codebook" Huffman codebook for MDCT symbols +""" diff --git a/source/level_3/core/aac_utils.py b/source/level_3/core/aac_utils.py new file mode 100644 index 0000000..0e18ed1 --- /dev/null +++ b/source/level_3/core/aac_utils.py @@ -0,0 +1,270 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - AAC Utilities +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Shared utility functions used across AAC encoder/decoder levels. +# +# This module currently provides: +# - MDCT / IMDCT conversions +# - Signal-to-Noise Ratio (SNR) computation in dB +# - Loading and access helpers for psychoacoustic band tables +# (TableB219.mat, Tables B.2.1.9a / B.2.1.9b of the AAC specification) +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np +from pathlib import Path + +from scipy.io import loadmat +from core.aac_types import * + + +# ----------------------------------------------------------------------------- +# Global cached data +# ----------------------------------------------------------------------------- +# Cached contents of TableB219.mat to avoid repeated disk I/O. 
+# Keys: +# - "B219a": long-window psychoacoustic bands (69 bands, FFT size 2048) +# - "B219b": short-window psychoacoustic bands (42 bands, FFT size 256) +B219_CACHE: dict[str, BarkTable] | None = None + + +# ----------------------------------------------------------------------------- +# MDCT / IMDCT +# ----------------------------------------------------------------------------- +def mdct(s: TimeSignal) -> MdctCoeffs: + """ + MDCT (direct form) as specified in the assignment. + + Parameters + ---------- + s : TimeSignal + Windowed time samples, 1-D array of length N (N = 2048 or 256). + + Returns + ------- + MdctCoeffs + MDCT coefficients, 1-D array of length N/2. + + Definition + ---------- + X[k] = 2 * sum_{n=0..N-1} s[n] * cos((2*pi/N) * (n + n0) * (k + 1/2)), + where n0 = (N/2 + 1)/2. + """ + s = np.asarray(s, dtype=np.float64).reshape(-1) + N = int(s.shape[0]) + if N not in (2048, 256): + raise ValueError("MDCT input length must be 2048 or 256.") + + n0 = (N / 2.0 + 1.0) / 2.0 + n = np.arange(N, dtype=np.float64) + n0 + k = np.arange(N // 2, dtype=np.float64) + 0.5 + + C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, N/2) + X = 2.0 * (s @ C) # (N/2,) + return X + + +def imdct(X: MdctCoeffs) -> TimeSignal: + """ + IMDCT (direct form) as specified in the assignment. + + Parameters + ---------- + X : MdctCoeffs + MDCT coefficients, 1-D array of length K (K = 1024 or 128). + + Returns + ------- + TimeSignal + Reconstructed time samples, 1-D array of length N = 2K. + + Definition + ---------- + s[n] = (2/N) * sum_{k=0..N/2-1} X[k] * cos((2*pi/N) * (n + n0) * (k + 1/2)), + where n0 = (N/2 + 1)/2. 
+ """ + X = np.asarray(X, dtype=np.float64).reshape(-1) + K = int(X.shape[0]) + if K not in (1024, 128): + raise ValueError("IMDCT input length must be 1024 or 128.") + + N = 2 * K + n0 = (N / 2.0 + 1.0) / 2.0 + + n = np.arange(N, dtype=np.float64) + n0 + k = np.arange(K, dtype=np.float64) + 0.5 + + C = np.cos((2.0 * np.pi / N) * np.outer(n, k)) # (N, K) + s = (2.0 / N) * (C @ X) # (N,) + return s + + +# ----------------------------------------------------------------------------- +# Signal quality metrics +# ----------------------------------------------------------------------------- + +def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Compute the overall Signal-to-Noise Ratio (SNR) in dB. + + The SNR is computed over all available samples and channels, + after conservatively aligning the two signals to their common + length and channel count. + + Parameters + ---------- + x_ref : StereoSignal + Reference (original) signal. + Typical shape: (N, 2) for stereo. + x_hat : StereoSignal + Reconstructed or processed signal. + Typical shape: (M, 2) for stereo. + + Returns + ------- + float + SNR in dB. + - +inf if the noise power is zero (perfect reconstruction). + - -inf if the reference signal power is zero. 
+ """ + x_ref = np.asarray(x_ref, dtype=np.float64) + x_hat = np.asarray(x_hat, dtype=np.float64) + + # Ensure 2-D shape: (samples, channels) + if x_ref.ndim == 1: + x_ref = x_ref.reshape(-1, 1) + if x_hat.ndim == 1: + x_hat = x_hat.reshape(-1, 1) + + # Align lengths and channel count conservatively + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) + + x_ref = x_ref[:n, :c] + x_hat = x_hat[:n, :c] + + err = x_ref - x_hat + ps = float(np.sum(x_ref * x_ref)) # signal power + pn = float(np.sum(err * err)) # noise power + + if pn <= 0.0: + return float("inf") + if ps <= 0.0: + return float("-inf") + + return float(10.0 * np.log10(ps / pn)) + + +# ----------------------------------------------------------------------------- +# Psychoacoustic band tables (TableB219.mat) +# ----------------------------------------------------------------------------- + +def load_b219_tables() -> dict[str, BarkTable]: + """ + Load and cache psychoacoustic band tables from TableB219.mat. + + The assignment/project layout assumes that a 'material' directory + is available in the current working directory when running: + - tests + - level_1 / level_2 / level_3 entrypoints + + This function loads the tables once and caches them for subsequent calls. + + Returns + ------- + dict[str, BarkTable] + Dictionary with the following entries: + - "B219a": long-window psychoacoustic table + (69 bands, FFT size 2048 / 1024 spectral lines) + - "B219b": short-window psychoacoustic table + (42 bands, FFT size 256 / 128 spectral lines) + """ + global B219_CACHE + if B219_CACHE is not None: + return B219_CACHE + + mat_path = Path("material") / "TableB219.mat" + if not mat_path.exists(): + raise FileNotFoundError( + "Could not locate material/TableB219.mat in the current working directory." + ) + + data = loadmat(str(mat_path)) + if "B219a" not in data or "B219b" not in data: + raise ValueError( + "TableB219.mat missing required variables 'B219a' and/or 'B219b'." 
+ ) + + B219_CACHE = { + "B219a": np.asarray(data["B219a"], dtype=np.float64), + "B219b": np.asarray(data["B219b"], dtype=np.float64), + } + return B219_CACHE + + +def get_table(frame_type: FrameType) -> tuple[BarkTable, int]: + """ + Select the appropriate psychoacoustic band table and FFT size + based on the AAC frame type. + + Parameters + ---------- + frame_type : FrameType + AAC frame type ("OLS", "LSS", "ESH", "LPS"). + + Returns + ------- + table : BarkTable + Psychoacoustic band table: + - B219a for long frames + - B219b for ESH short subframes + N : int + FFT size corresponding to the table: + - 2048 for long frames + - 256 for short frames (ESH) + """ + tables = load_b219_tables() + if frame_type == "ESH": + return tables["B219b"], 256 + return tables["B219a"], 2048 + + +def band_limits( + table: BarkTable, +) -> tuple[BandIndexArray, BandIndexArray, BandValueArray, BandValueArray]: + """ + Extract per-band metadata from a TableB2.1.9 psychoacoustic table. + + The column layout follows the provided TableB219.mat file and the + AAC specification tables B.2.1.9a / B.2.1.9b. + + Parameters + ---------- + table : BarkTable + Psychoacoustic band table (B219a or B219b). + + Returns + ------- + wlow : BandIndexArray + Lower FFT bin index (inclusive) for each band. + whigh : BandIndexArray + Upper FFT bin index (inclusive) for each band. + bval : BandValueArray + Bark-scale (or equivalent) band position values. + Used in the spreading function. + qthr_db : BandValueArray + Threshold in quiet for each band, in dB. 
+ """ + wlow = table[:, 1].astype(int) + whigh = table[:, 2].astype(int) + bval = table[:, 4].astype(np.float64) + qthr_db = table[:, 5].astype(np.float64) + return wlow, whigh, bval, qthr_db diff --git a/source/level_3/level_3.py b/source/level_3/level_3.py new file mode 100644 index 0000000..2f2105b --- /dev/null +++ b/source/level_3/level_3.py @@ -0,0 +1,237 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Level 3 Wrappers + Demo +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Level 3 wrapper module. +# +# This file provides: +# - Thin wrappers for Level 3 API functions (encode/decode) that delegate +# to the corresponding core implementations. +# - A demo function that runs end-to-end and computes: +# * SNR +# * bitrate (coded) +# * compression ratio +# - A small CLI entrypoint for convenience. +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Optional, Tuple, Union + +import os +import soundfile as sf + +from core.aac_types import AACSeq3, StereoSignal +from core.aac_coder import aac_coder_3 as core_aac_coder_3 +from core.aac_coder import aac_read_wav_stereo_48k +from core.aac_decoder import aac_decoder_3 as core_aac_decoder_3 +from core.aac_utils import snr_db + + +# ----------------------------------------------------------------------------- +# Helpers (Level 3 metrics) +# ----------------------------------------------------------------------------- + +def _wav_duration_seconds(wav_path: Path) -> float: + """Return WAV duration in seconds using soundfile metadata.""" + info = sf.info(str(wav_path)) + if info.samplerate <= 0: + raise ValueError("Invalid samplerate in WAV header.") + if info.frames < 0: + raise ValueError("Invalid frame count in WAV header.") + return float(info.frames) / 
float(info.samplerate) + + +def _bitrate_before_from_file(wav_path: Path) -> float: + """ + Compute input bitrate (bits/s) from file size and duration. + + Note: + This is a file-based bitrate estimate (includes WAV header), which is + acceptable for a simple compression ratio metric. + """ + duration = _wav_duration_seconds(wav_path) + if duration <= 0.0: + raise ValueError("Non-positive WAV duration.") + nbits = float(os.path.getsize(wav_path)) * 8.0 + return nbits / duration + + +def _bitrate_after_from_aacseq(aac_seq_3: AACSeq3, duration_sec: float) -> float: + """ + Compute coded bitrate (bits/s) from Huffman streams stored in AACSeq3. + + We count bits from: + - scalefactor Huffman bitstream ("sfc") + - MDCT symbols Huffman bitstream ("stream") + for both channels and all frames. + + Note: + We intentionally ignore side-info overhead (frame_type, G, T, TNS coeffs, + codebook ids, etc.). This matches a common simplified metric in demos. + """ + if duration_sec <= 0.0: + raise ValueError("Non-positive duration for bitrate computation.") + + total_bits = 0 + for fr in aac_seq_3: + total_bits += len(fr["chl"]["sfc"]) + total_bits += len(fr["chl"]["stream"]) + total_bits += len(fr["chr"]["sfc"]) + total_bits += len(fr["chr"]["stream"]) + + return float(total_bits) / float(duration_sec) + + +# ----------------------------------------------------------------------------- +# Public Level 3 API (wrappers) +# ----------------------------------------------------------------------------- + +def aac_coder_3( + filename_in: Union[str, Path], + filename_aac_coded: Optional[Union[str, Path]] = None, +) -> AACSeq3: + """ + Level-3 AAC encoder (wrapper). + + Delegates to core implementation. + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename. + Assumption: stereo audio, sampling rate 48 kHz. + filename_aac_coded : Optional[Union[str, Path]] + Optional filename to store the encoded AAC sequence (e.g., .mat). 
+ + Returns + ------- + AACSeq3 + List of encoded frames (Level 3 schema). + """ + return core_aac_coder_3(filename_in, filename_aac_coded, verbose=True) + + +def i_aac_coder_3( + aac_seq_3: AACSeq3, + filename_out: Union[str, Path], +) -> StereoSignal: + """ + Level-3 AAC decoder (wrapper). + + Delegates to core implementation. + + Parameters + ---------- + aac_seq_3 : AACSeq3 + Encoded sequence as produced by aac_coder_3(). + filename_out : Union[str, Path] + Output WAV filename. Assumption: 48 kHz, stereo. + + Returns + ------- + StereoSignal + Decoded audio samples (time-domain), stereo, shape (N, 2), dtype float64. + """ + return core_aac_decoder_3(aac_seq_3, filename_out, verbose=True) + + +# ----------------------------------------------------------------------------- +# Demo (Level 3) +# ----------------------------------------------------------------------------- + +def demo_aac_3( + filename_in: Union[str, Path], + filename_out: Union[str, Path], + filename_aac_coded: Optional[Union[str, Path]] = None, +) -> Tuple[float, float, float]: + """ + Demonstration for the Level-3 codec. + + Runs: + - aac_coder_3(filename_in, filename_aac_coded) + - i_aac_coder_3(aac_seq_3, filename_out) + and computes: + - total SNR between original and decoded audio + - coded bitrate (bits/s) based on Huffman streams + - compression ratio (bitrate_before / bitrate_after) + + Parameters + ---------- + filename_in : Union[str, Path] + Input WAV filename (stereo, 48 kHz). + filename_out : Union[str, Path] + Output WAV filename (stereo, 48 kHz). + filename_aac_coded : Optional[Union[str, Path]] + Optional filename to store the encoded AAC sequence (e.g., .mat). 
+
+    Returns
+    -------
+    Tuple[float, float, float]
+        (SNR_dB, bitrate_after_bits_per_s, compression_ratio)
+    """
+    filename_in = Path(filename_in)
+    filename_out = Path(filename_out)
+    filename_aac_coded = Path(filename_aac_coded) if filename_aac_coded else None
+
+    # Read original audio (reference) with the same validation as the codec.
+    x_ref, fs_ref = aac_read_wav_stereo_48k(filename_in)
+    if int(fs_ref) != 48000:
+        raise ValueError("Input sampling rate must be 48 kHz.")
+
+    # Encode / decode
+    aac_seq_3 = aac_coder_3(filename_in, filename_aac_coded)
+    x_hat = i_aac_coder_3(aac_seq_3, filename_out)
+
+    # Optional sanity: ensure output file exists and is readable
+    _, fs_hat = sf.read(str(filename_out), always_2d=True)
+    if int(fs_hat) != 48000:
+        raise ValueError("Decoded output sampling rate must be 48 kHz.")
+
+    # Metrics
+    s = snr_db(x_ref, x_hat)
+
+    duration = _wav_duration_seconds(filename_in)
+    bitrate_before = _bitrate_before_from_file(filename_in)
+    bitrate_after = _bitrate_after_from_aacseq(aac_seq_3, duration)
+    compression = float("inf") if bitrate_after <= 0.0 else (bitrate_before / bitrate_after)
+
+    return float(s), float(bitrate_after), float(compression)
+
+
+# -----------------------------------------------------------------------------
+# CLI
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    # Example:
+    #   cd level_3
+    #   python -m level_3 input.wav output.wav
+    #   for example:
+    #   python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav
+    #   or
+    #   python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav aac_seq_3.mat
+    import sys
+
+    if len(sys.argv) not in (3, 4):
+        raise SystemExit("Usage: python -m level_3 input.wav output.wav [aac_seq_3.mat]")
+
+    in_wav = Path(sys.argv[1])
+    out_wav = Path(sys.argv[2])
+    aac_mat = Path(sys.argv[3]) if len(sys.argv) == 4 else None
+
+    print(f"Encoding/Decoding {in_wav} to {out_wav}")
+    if aac_mat is not None:
+        
print(f"Storing coded sequence to {aac_mat}") + + snr, bitrate, compression = demo_aac_3(in_wav, out_wav, aac_mat) + print(f"SNR = {snr:.3f} dB") + print(f"Bitrate (coded) = {bitrate:.2f} bits/s") + print(f"Compression ratio = {compression:.4f}") diff --git a/source/level_3/material/LicorDeCalandraca.wav b/source/level_3/material/LicorDeCalandraca.wav new file mode 100644 index 0000000..a527e8c Binary files /dev/null and b/source/level_3/material/LicorDeCalandraca.wav differ diff --git a/source/level_3/material/LicorDeCalandraca_out2.wav b/source/level_3/material/LicorDeCalandraca_out2.wav new file mode 100644 index 0000000..5d2fe28 Binary files /dev/null and b/source/level_3/material/LicorDeCalandraca_out2.wav differ diff --git a/source/level_3/material/TableB219.mat b/source/level_3/material/TableB219.mat new file mode 100644 index 0000000..7b2e403 Binary files /dev/null and b/source/level_3/material/TableB219.mat differ diff --git a/source/level_3/material/aac_seq_3.mat b/source/level_3/material/aac_seq_3.mat new file mode 100644 index 0000000..a859043 Binary files /dev/null and b/source/level_3/material/aac_seq_3.mat differ diff --git a/source/level_3/material/huffCodebooks.mat b/source/level_3/material/huffCodebooks.mat new file mode 100644 index 0000000..a208b3d Binary files /dev/null and b/source/level_3/material/huffCodebooks.mat differ diff --git a/source/level_3/material/huff_utils.py b/source/level_3/material/huff_utils.py new file mode 100644 index 0000000..9f996ee --- /dev/null +++ b/source/level_3/material/huff_utils.py @@ -0,0 +1,400 @@ +import numpy as np +import scipy.io as sio +import os + +# ------------------ LOAD LUT ------------------ + +def load_LUT(mat_filename=None): + """ + Loads the list of Huffman Codebooks (LUTs) + + Returns: + huffLUT : list (index 1..11 used, index 0 unused) + """ + if mat_filename is None: + current_dir = os.path.dirname(os.path.abspath(__file__)) + mat_filename = os.path.join(current_dir, "huffCodebooks.mat") + + 
mat = sio.loadmat(mat_filename) + + + huffCodebooks_raw = mat['huffCodebooks'].squeeze() + + huffCodebooks = [] + for i in range(11): + huffCodebooks.append(np.array(huffCodebooks_raw[i])) + + # Build inverse VLC tables + invTable = [None] * 11 + + for i in range(11): + h = huffCodebooks[i][:, 2].astype(int) # column 3 + hlength = huffCodebooks[i][:, 1].astype(int) # column 2 + + hbin = [] + for j in range(len(h)): + hbin.append(format(h[j], f'0{hlength[j]}b')) + + invTable[i] = vlc_table(hbin) + + # Build Huffman LUT dicts + huffLUT = [None] * 12 # index 0 unused + params = [ + (4, 1, True), + (4, 1, True), + (4, 2, False), + (4, 2, False), + (2, 4, True), + (2, 4, True), + (2, 7, False), + (2, 7, False), + (2, 12, False), + (2, 12, False), + (2, 16, False), + ] + + for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1): + huffLUT[i] = { + 'LUT': huffCodebooks[i-1], + 'invTable': invTable[i-1], + 'codebook': i, + 'nTupleSize': nTupleSize, + 'maxAbsCodeVal': maxAbs, + 'signedValues': signed + } + + return huffLUT + +def vlc_table(code_array): + """ + codeArray: list of strings, each string is a Huffman codeword (e.g. '0101') + returns: + h : NumPy array of shape (num_nodes, 3) + columns: + [ next_if_0 , next_if_1 , symbol_index ] + """ + h = np.zeros((1, 3), dtype=int) + + for code_index, code in enumerate(code_array, start=1): + word = [int(bit) for bit in code] + h_index = 0 + + for bit in word: + k = bit + next_node = h[h_index, k] + if next_node == 0: + h = np.vstack([h, [0, 0, 0]]) + new_index = h.shape[0] - 1 + h[h_index, k] = new_index + h_index = new_index + else: + h_index = next_node + + h[h_index, 2] = code_index + + return h + +# ------------------ ENCODE ------------------ + +def encode_huff(coeff_sec, huff_LUT_list, force_codebook = None): + """ + Huffman-encode a sequence of quantized coefficients. 
+
+    This function selects the appropriate Huffman codebook based on the
+    maximum absolute value of the input coefficients, encodes the coefficients
+    into a binary Huffman bitstream, and returns both the bitstream and the
+    selected codebook index.
+
+    This is the Python equivalent of the MATLAB `encodeHuff.m` function used
+    in audio/image coding (e.g., scale factor band encoding). The input
+    coefficient sequence is grouped into fixed-size tuples as defined by
+    the chosen Huffman LUT. Zero-padding may be applied internally.
+
+    Parameters
+    ----------
+    coeff_sec : array_like of int
+        1-D array of quantized integer coefficients to encode.
+        Typically corresponds to a "section" or scale-factor band.
+
+    huff_LUT_list : list
+        List of Huffman lookup-table dictionaries as returned by `load_LUT()`.
+        Index 1..11 correspond to valid Huffman codebooks.
+        Index 0 is unused.
+
+    Returns
+    -------
+    huffSec : str
+        Huffman-encoded bitstream represented as a string of '0' and '1'
+        characters.
+
+    huffCodebook : int
+        Index (1..11) of the Huffman codebook used for encoding.
+        A value of 0 indicates a special all-zero section. 
+ """ + if force_codebook is not None: + return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec) + + maxAbsVal = np.max(np.abs(coeff_sec)) + + if maxAbsVal == 0: + huffCodebook = 0 + huffSec = huff_LUT_code_0() + + elif maxAbsVal == 1: + candidates = [1, 2] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal == 2: + candidates = [3, 4] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (3, 4): + candidates = [5, 6] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (5, 6, 7): + candidates = [7, 8] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (8, 9, 10, 11, 12): + candidates = [9, 10] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (13, 14, 15): + huffCodebook = 11 + huffSec = 
huff_LUT_code_1(huff_LUT_list[huffCodebook], coeff_sec) + + else: + huffCodebook = 11 + huffSec = huff_LUT_code_ESC(huff_LUT_list[huffCodebook], coeff_sec) + + return huffSec, huffCodebook + +def huff_LUT_code_1(huff_LUT, coeff_sec): + LUT = huff_LUT['LUT'] + nTupleSize = huff_LUT['nTupleSize'] + maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] + signedValues = huff_LUT['signedValues'] + + numTuples = int(np.ceil(len(coeff_sec) / nTupleSize)) + + if signedValues: + coeff = coeff_sec + maxAbsCodeVal + base = 2 * maxAbsCodeVal + 1 + else: + coeff = coeff_sec + base = maxAbsCodeVal + 1 + + coeffPad = np.zeros(numTuples * nTupleSize, dtype=int) + coeffPad[:len(coeff)] = coeff + + huffSec = [] + + powers = base ** np.arange(nTupleSize - 1, -1, -1) + + for i in range(numTuples): + nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize] + huffIndex = int(np.abs(nTuple) @ powers) + + hexVal = LUT[huffIndex, 2] + huffLen = LUT[huffIndex, 1] + + bits = format(int(hexVal), f'0{int(huffLen)}b') + + if signedValues: + huffSec.append(bits) + else: + signBits = ''.join('1' if v < 0 else '0' for v in nTuple) + huffSec.append(bits + signBits) + + return ''.join(huffSec) + +def huff_LUT_code_0(): + return '' + +def huff_LUT_code_ESC(huff_LUT, coeff_sec): + LUT = huff_LUT['LUT'] + nTupleSize = huff_LUT['nTupleSize'] + maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] + + numTuples = int(np.ceil(len(coeff_sec) / nTupleSize)) + base = maxAbsCodeVal + 1 + + coeffPad = np.zeros(numTuples * nTupleSize, dtype=int) + coeffPad[:len(coeff_sec)] = coeff_sec + + huffSec = [] + powers = base ** np.arange(nTupleSize - 1, -1, -1) + + for i in range(numTuples): + nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize] + + lnTuple = nTuple.astype(float) + lnTuple[lnTuple == 0] = np.finfo(float).eps + + N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int)) + N = np.maximum(0, N4 - 4) + esc = np.abs(nTuple) > 15 + + nTupleESC = nTuple.copy() + nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16 + + huffIndex = 
int(np.abs(nTupleESC) @ powers) + + hexVal = LUT[huffIndex, 2] + huffLen = LUT[huffIndex, 1] + + bits = format(int(hexVal), f'0{int(huffLen)}b') + + escSeq = '' + for k in range(nTupleSize): + if esc[k]: + escSeq += '1' * N[k] + escSeq += '0' + escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b') + + signBits = ''.join('1' if v < 0 else '0' for v in nTuple) + huffSec.append(bits + signBits + escSeq) + + return ''.join(huffSec) + +# ------------------ DECODE ------------------ + +def decode_huff(huff_sec, huff_LUT): + """ + Decode a Huffman-encoded stream. + + Parameters + ---------- + huff_sec : array-like of int or str + Huffman encoded stream as a sequence of 0 and 1 (string or list/array). + huff_LUT : dict + Huffman lookup table with keys: + - 'invTable': inverse table (numpy array) + - 'codebook': codebook number + - 'nTupleSize': tuple size + - 'maxAbsCodeVal': maximum absolute code value + - 'signedValues': True/False + + Returns + ------- + decCoeffs : list of int + Decoded quantized coefficients. 
+ """ + + h = huff_LUT['invTable'] + huffCodebook = huff_LUT['codebook'] + nTupleSize = huff_LUT['nTupleSize'] + maxAbsCodeVal = huff_LUT['maxAbsCodeVal'] + signedValues = huff_LUT['signedValues'] + + # Convert string to array of ints + if isinstance(huff_sec, str): + huff_sec = np.array([int(b) for b in huff_sec]) + + eos = False + decCoeffs = [] + streamIndex = 0 + + while not eos: + wordbit = 0 + r = 0 # start at root + + # Decode Huffman word using inverse table + while True: + b = huff_sec[streamIndex + wordbit] + wordbit += 1 + rOld = r + r = h[rOld, b] + if h[r, 0] == 0 and h[r, 1] == 0: + symbolIndex = h[r, 2] - 1 # zero-based + streamIndex += wordbit + break + + # Decode n-tuple magnitudes + if signedValues: + base = 2 * maxAbsCodeVal + 1 + nTupleDec = [] + tmp = symbolIndex + for p in reversed(range(nTupleSize)): + val = tmp // (base ** p) + nTupleDec.append(val - maxAbsCodeVal) + tmp = tmp % (base ** p) + nTupleDec = np.array(nTupleDec) + else: + base = maxAbsCodeVal + 1 + nTupleDec = [] + tmp = symbolIndex + for p in reversed(range(nTupleSize)): + val = tmp // (base ** p) + nTupleDec.append(val) + tmp = tmp % (base ** p) + nTupleDec = np.array(nTupleDec) + + # Apply sign bits + nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize] + nTupleSign = -(np.sign(nTupleSignBits - 0.5)) + streamIndex += nTupleSize + nTupleDec = nTupleDec * nTupleSign + + # Handle escape sequences + escIndex = np.where(np.abs(nTupleDec) == 16)[0] + if huffCodebook == 11 and escIndex.size > 0: + for idx in escIndex: + N = 0 + b = huff_sec[streamIndex] + while b: + N += 1 + b = huff_sec[streamIndex + N] + streamIndex += N +1 + N4 = N + 4 + escape_word = huff_sec[streamIndex:streamIndex + N4] + escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2) + nTupleDec[idx] = escape_value + streamIndex += N4 + # Apply signs again + nTupleDec[escIndex] *= nTupleSign[escIndex] + + decCoeffs.extend(nTupleDec.tolist()) + + if streamIndex >= len(huff_sec): + eos = True + + 
return decCoeffs + +