Level 3: Fix alpha miss-calculation that affected SNR
This commit is contained in:
parent
781cb3047f
commit
6b1ebc3cfe
@ -485,46 +485,16 @@ def aac_coder_3(
|
|||||||
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
||||||
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
||||||
|
|
||||||
# Codebook 11:
|
# sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11)
|
||||||
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
# sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11)
|
||||||
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list)
|
||||||
# sf_cb = 11
|
sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list)
|
||||||
# sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
|
||||||
#
|
|
||||||
# sfc_L_dpcm = np.clip(
|
|
||||||
# sfc_L_dpcm,
|
|
||||||
# -sf_max_abs,
|
|
||||||
# sf_max_abs,
|
|
||||||
# ).astype(np.int64, copy=False)
|
|
||||||
#
|
|
||||||
# sfc_R_dpcm = np.clip(
|
|
||||||
# sfc_R_dpcm,
|
|
||||||
# -sf_max_abs,
|
|
||||||
# sf_max_abs,
|
|
||||||
# ).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_L_stream, cb_sfc_L = aac_encode_huff(
|
|
||||||
sfc_L_dpcm.reshape(-1, order="F"),
|
|
||||||
huff_LUT_list,
|
|
||||||
# force_codebook=11,
|
|
||||||
)
|
|
||||||
sfc_R_stream, cb_sfc_R = aac_encode_huff(
|
|
||||||
sfc_R_dpcm.reshape(-1, order="F"),
|
|
||||||
huff_LUT_list,
|
|
||||||
# force_codebook=11,
|
|
||||||
)
|
|
||||||
|
|
||||||
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
||||||
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
|
raise ValueError(f"Illegal codebook value for frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}.")
|
||||||
|
|
||||||
mdct_L_stream, cb_L = aac_encode_huff(
|
mdct_L_stream, cb_L = aac_encode_huff(np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list)
|
||||||
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
mdct_R_stream, cb_R = aac_encode_huff(np.asarray(S_R, dtype=np.int64).reshape(-1), huff_LUT_list)
|
||||||
huff_LUT_list,
|
|
||||||
)
|
|
||||||
mdct_R_stream, cb_R = aac_encode_huff(
|
|
||||||
np.asarray(S_R, dtype=np.int64).reshape(-1),
|
|
||||||
huff_LUT_list,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Typed dict construction helps static analyzers validate the schema.
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
frame_out: AACSeq3Frame = {
|
frame_out: AACSeq3Frame = {
|
||||||
|
|||||||
@ -28,13 +28,8 @@ from core.aac_types import *
|
|||||||
# Constants (assignment)
|
# Constants (assignment)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
MAGIC_NUMBER: float = 0.4054
|
MAGIC_NUMBER: float = 0.4054
|
||||||
MQ: int = 8191
|
|
||||||
|
|
||||||
EPS: float = 1e-12
|
EPS: float = 1e-12
|
||||||
MAX_SFC_DIFF: int = 60
|
MAX_SF_DELTA:int = 60
|
||||||
|
|
||||||
# Safeguard: prevents infinite loops in pathological cases
|
|
||||||
MAX_ALPHA_ITERS: int = 2000
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -149,39 +144,35 @@ def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray:
|
|||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Alpha initialization (Eq. 14)
|
# Alpha initialization (Eq. 14)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
def _alpha_initial_from_frame_max(x_all: FloatArray) -> int:
|
def _initial_alpha_hat(X: "FloatArray", MQ: int = 8191) -> int:
|
||||||
"""
|
"""
|
||||||
Compute the initial scalefactor gain alpha_hat for a frame.
|
Compute the initial scalefactor estimate alpha_hat for a frame.
|
||||||
|
|
||||||
Implements Eq. (14):
|
The assignment proposes the following first approximation (Equation 14):
|
||||||
alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ )
|
|
||||||
|
|
||||||
The same initial value is used for all bands before the per-band refinement.
|
alpha_hat = (16/3) * log2( max_k(|X(k)|)^(3/4) / MQ )
|
||||||
|
|
||||||
|
where max_k runs over all MDCT coefficients of the frame (not per band),
|
||||||
|
and MQ is the maximum quantization level parameter (2*MQ + 1 levels).
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
x_all : FloatArray
|
X : FloatArray
|
||||||
All MDCT coefficients of a frame (one channel), flattened.
|
MDCT coefficients of one frame (or one ESH subframe), shape (N,).
|
||||||
|
MQ : int
|
||||||
|
Quantizer parameter (default 8191, as per assignment).
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
int
|
int
|
||||||
Initial alpha value (integer).
|
Integer alpha_hat (rounded to nearest integer).
|
||||||
"""
|
"""
|
||||||
x_all = np.asarray(x_all, dtype=np.float64).reshape(-1)
|
x_max = float(np.max(np.abs(X)))
|
||||||
if x_all.size == 0:
|
if x_max <= 0.0:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
max_abs = float(np.max(np.abs(x_all)))
|
alpha_hat = (16.0 / 3.0) * np.log2((x_max ** (3.0 / 4.0)) / float(MQ))
|
||||||
if max_abs <= 0.0:
|
return int(np.round(alpha_hat))
|
||||||
return 0
|
|
||||||
|
|
||||||
val = (max_abs ** 0.75) / float(MQ)
|
|
||||||
if val <= 0.0:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
alpha_hat = (16.0 / 3.0) * np.log2(val)
|
|
||||||
return int(np.floor(alpha_hat))
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -279,35 +270,38 @@ def _threshold_T_from_SMR(
|
|||||||
# Alpha selection per band + neighbor-difference constraint
|
# Alpha selection per band + neighbor-difference constraint
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
def _best_alpha_for_band(
|
def _best_alpha_for_band(
|
||||||
X: FloatArray,
|
X: "FloatArray", lo: int, hi: int, T_b: float,
|
||||||
lo: int,
|
alpha_hat: int, alpha_prev: int, alpha_min: int, alpha_max: int,
|
||||||
hi: int,
|
|
||||||
T_b: float,
|
|
||||||
alpha0: int,
|
|
||||||
alpha_min: int,
|
|
||||||
alpha_max: int,
|
|
||||||
) -> int:
|
) -> int:
|
||||||
"""
|
"""
|
||||||
Determine the band-wise scalefactor alpha(b) by iteratively increasing alpha.
|
Determine the band-wise scalefactor alpha(b) following the assignment.
|
||||||
|
|
||||||
The algorithm increases alpha until the quantization error power exceeds
|
Procedure:
|
||||||
the band threshold:
|
- Start from a frame-wise initial estimate alpha_hat.
|
||||||
|
- Iteratively increase alpha(b) by 1 as long as the quantization error power
|
||||||
|
stays below the psychoacoustic threshold T(b): P_e(b) = sum_{k in band} ( X(k) - Xhat(k) )^2
|
||||||
|
|
||||||
P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2
|
- Stop increasing alpha(b) if the neighbor constraint would be violated: |alpha(b) - alpha(b-1)| <= 60
|
||||||
select the largest alpha such that P_e(b) <= T(b)
|
When processing bands sequentially (low -> high), this becomes: alpha(b) <= alpha_prev + 60
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- This function does not decrease alpha if the initial value already violates
|
||||||
|
the threshold; the assignment only specifies iterative increase.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
X : FloatArray
|
X : FloatArray
|
||||||
Full MDCT vector (one frame or one subframe), shape (N,).
|
Full MDCT vector of the current (sub)frame, shape (N,).
|
||||||
lo, hi : int
|
lo, hi : int
|
||||||
Inclusive MDCT index range defining the band.
|
Band index bounds (inclusive), defining the band slice.
|
||||||
T_b : float
|
T_b : float
|
||||||
Psychoacoustic threshold for this band.
|
Threshold T(b) for this band.
|
||||||
alpha0 : int
|
alpha_hat : int
|
||||||
Initial alpha value for the band.
|
Initial frame-wise estimate (Equation 14).
|
||||||
|
alpha_prev : int
|
||||||
|
Previously selected alpha for band b-1 (neighbor constraint reference).
|
||||||
alpha_min, alpha_max : int
|
alpha_min, alpha_max : int
|
||||||
Bounds for alpha, used as a safeguard.
|
Safeguard bounds for alpha.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -315,81 +309,41 @@ def _best_alpha_for_band(
|
|||||||
Selected integer alpha(b).
|
Selected integer alpha(b).
|
||||||
"""
|
"""
|
||||||
if T_b <= 0.0:
|
if T_b <= 0.0:
|
||||||
return int(alpha0)
|
return int(alpha_hat)
|
||||||
|
|
||||||
Xsec = X[lo : hi + 1]
|
Xsec = X[lo : hi + 1]
|
||||||
|
|
||||||
alpha = int(alpha0)
|
# Neighbor constraint (sequential processing): alpha(b) <= alpha_prev + 60
|
||||||
alpha = max(alpha_min, min(alpha, alpha_max))
|
alpha_limit = min(int(alpha_max), int(alpha_prev) + MAX_SF_DELTA)
|
||||||
|
|
||||||
|
# Start from alpha_hat, clamped to feasible range
|
||||||
|
alpha = int(alpha_hat)
|
||||||
|
alpha = max(int(alpha_min), min(alpha, int(alpha_limit)))
|
||||||
|
|
||||||
# Evaluate at current alpha
|
# Evaluate at current alpha
|
||||||
Ssec = _quantize_symbol(Xsec, alpha)
|
Ssec = _quantize_symbol(Xsec, alpha)
|
||||||
Xhat = _dequantize_symbol(Ssec, alpha)
|
Xhat = _dequantize_symbol(Ssec, alpha)
|
||||||
Pe = float(np.sum((Xsec - Xhat) ** 2))
|
Pe = float(np.sum((Xsec - Xhat) ** 2))
|
||||||
|
|
||||||
|
# If already above threshold, return current alpha (no decrease step specified)
|
||||||
if Pe > T_b:
|
if Pe > T_b:
|
||||||
return alpha
|
return alpha
|
||||||
|
|
||||||
iters = 0
|
# Increase alpha while still under threshold and within constraints
|
||||||
while iters < MAX_ALPHA_ITERS:
|
while True:
|
||||||
iters += 1
|
|
||||||
alpha_next = alpha + 1
|
alpha_next = alpha + 1
|
||||||
if alpha_next > alpha_max:
|
if alpha_next > alpha_limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
Ssec = _quantize_symbol(Xsec, alpha_next)
|
Ssec = _quantize_symbol(Xsec, alpha_next)
|
||||||
Xhat = _dequantize_symbol(Ssec, alpha_next)
|
Xhat = _dequantize_symbol(Ssec, alpha_next)
|
||||||
Pe_next = float(np.sum((Xsec - Xhat) ** 2))
|
Pe_next = float(np.sum((Xsec - Xhat) ** 2))
|
||||||
|
|
||||||
if Pe_next > T_b:
|
if Pe_next > T_b:
|
||||||
break
|
break
|
||||||
|
|
||||||
alpha = alpha_next
|
alpha = alpha_next
|
||||||
Pe = Pe_next
|
|
||||||
|
|
||||||
return alpha
|
return alpha
|
||||||
|
|
||||||
|
|
||||||
def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray:
|
|
||||||
"""
|
|
||||||
Enforce neighbor constraint |alpha[b] - alpha[b-1]| <= max_diff by clamping.
|
|
||||||
|
|
||||||
Uses a forward pass and a backward pass to reduce drift.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
alpha : np.ndarray
|
|
||||||
Alpha vector, shape (NB,).
|
|
||||||
max_diff : int
|
|
||||||
Maximum allowed absolute difference between adjacent bands.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
np.ndarray
|
|
||||||
Clamped alpha vector, int64, shape (NB,).
|
|
||||||
"""
|
|
||||||
a = np.asarray(alpha, dtype=np.int64).copy()
|
|
||||||
nb = a.shape[0]
|
|
||||||
|
|
||||||
for b in range(1, nb):
|
|
||||||
lo = int(a[b - 1] - max_diff)
|
|
||||||
hi = int(a[b - 1] + max_diff)
|
|
||||||
if a[b] < lo:
|
|
||||||
a[b] = lo
|
|
||||||
elif a[b] > hi:
|
|
||||||
a[b] = hi
|
|
||||||
|
|
||||||
for b in range(nb - 2, -1, -1):
|
|
||||||
lo = int(a[b + 1] - max_diff)
|
|
||||||
hi = int(a[b + 1] + max_diff)
|
|
||||||
if a[b] < lo:
|
|
||||||
a[b] = lo
|
|
||||||
elif a[b] > hi:
|
|
||||||
a[b] = hi
|
|
||||||
|
|
||||||
return a
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public API
|
# Public API
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -401,7 +355,16 @@ def aac_quantizer(
|
|||||||
"""
|
"""
|
||||||
AAC quantizer for one channel (Level 3).
|
AAC quantizer for one channel (Level 3).
|
||||||
|
|
||||||
Quantizes MDCT coefficients using band-wise scalefactors derived from SMR.
|
Quantizes MDCT coefficients (after TNS) using band-wise scalefactors derived
|
||||||
|
from psychoacoustic thresholds computed via SMR.
|
||||||
|
|
||||||
|
The implementation follows the assignment procedure:
|
||||||
|
- Compute an initial frame-wise alpha_hat using Equation (14), based on the
|
||||||
|
maximum MDCT coefficient magnitude of the (sub)frame.
|
||||||
|
- For each band b, increase alpha(b) by 1 while the quantization error power
|
||||||
|
P_e(b) stays below the threshold T(b).
|
||||||
|
- Enforce the neighbor constraint |alpha(b) - alpha(b-1)| <= 60 during the
|
||||||
|
band-by-band search (no post-processing needed).
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@ -421,16 +384,19 @@ def aac_quantizer(
|
|||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
S : QuantizedSymbols
|
S : QuantizedSymbols
|
||||||
Quantized symbols S(k), shape (1024, 1) for all frame types.
|
Quantized symbols S(k), packed as shape (1024, 1) for all frame types.
|
||||||
|
For ESH, the 8 subframes are packed in column-major subframe layout.
|
||||||
sfc : ScaleFactors
|
sfc : ScaleFactors
|
||||||
DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1).
|
DPCM-coded scalefactors:
|
||||||
|
sfc(0) = alpha(0) = G
|
||||||
|
sfc(b) = alpha(b) - alpha(b-1), for b > 0
|
||||||
Shapes:
|
Shapes:
|
||||||
- Long: (NB, 1)
|
- Long: (NB, 1)
|
||||||
- ESH: (NB, 8)
|
- ESH: (NB, 8)
|
||||||
G : GlobalGain
|
G : GlobalGain
|
||||||
Global gain G = alpha(0).
|
Global gain G = alpha(0).
|
||||||
- Long: scalar float
|
- Long: scalar float
|
||||||
- ESH: array shape (1, 8)
|
- ESH: array shape (1, 8), dtype float64
|
||||||
"""
|
"""
|
||||||
bands = _band_slices(frame_type)
|
bands = _band_slices(frame_type)
|
||||||
NB = len(bands)
|
NB = len(bands)
|
||||||
@ -438,6 +404,9 @@ def aac_quantizer(
|
|||||||
X = np.asarray(frame_F, dtype=np.float64)
|
X = np.asarray(frame_F, dtype=np.float64)
|
||||||
SMR = np.asarray(SMR, dtype=np.float64)
|
SMR = np.asarray(SMR, dtype=np.float64)
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# ESH: 8 short subframes, each of length 128
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
if frame_type == "ESH":
|
if frame_type == "ESH":
|
||||||
if X.shape != (128, 8):
|
if X.shape != (128, 8):
|
||||||
raise ValueError("For ESH, frame_F must have shape (128, 8).")
|
raise ValueError("For ESH, frame_F must have shape (128, 8).")
|
||||||
@ -446,42 +415,58 @@ def aac_quantizer(
|
|||||||
|
|
||||||
S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
|
S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
|
||||||
sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)
|
sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)
|
||||||
G_arr = np.zeros((1, 8), dtype=np.int64)
|
G_arr = np.zeros((1, 8), dtype=np.float64)
|
||||||
|
|
||||||
|
# Packed output view: (128, 8) with column-major layout
|
||||||
|
S_pack = S_out[:, 0].reshape(128, 8, order="F")
|
||||||
|
|
||||||
for j in range(8):
|
for j in range(8):
|
||||||
Xj = X[:, j].reshape(128)
|
Xj = X[:, j].reshape(128)
|
||||||
SMRj = SMR[:, j].reshape(NB)
|
SMRj = SMR[:, j].reshape(NB)
|
||||||
|
|
||||||
|
# Compute psychoacoustic threshold T(b) for this subframe
|
||||||
T = _threshold_T_from_SMR(Xj, SMRj, bands)
|
T = _threshold_T_from_SMR(Xj, SMRj, bands)
|
||||||
|
|
||||||
alpha0 = _alpha_initial_from_frame_max(Xj)
|
# Frame-wise initial estimate alpha_hat (Equation 14)
|
||||||
alpha = np.full((NB,), alpha0, dtype=np.int64)
|
alpha_hat = _initial_alpha_hat(Xj)
|
||||||
|
|
||||||
|
# Band-wise scalefactors alpha(b)
|
||||||
|
alpha = np.zeros((NB,), dtype=np.int64)
|
||||||
|
alpha_prev = int(alpha_hat)
|
||||||
|
|
||||||
for b, (lo, hi) in enumerate(bands):
|
for b, (lo, hi) in enumerate(bands):
|
||||||
alpha[b] = _best_alpha_for_band(
|
alpha_b = _best_alpha_for_band(
|
||||||
X=Xj, lo=lo, hi=hi, T_b=float(T[b]),
|
X=Xj,
|
||||||
alpha0=int(alpha[b]),
|
lo=lo,
|
||||||
|
hi=hi,
|
||||||
|
T_b=float(T[b]),
|
||||||
|
alpha_hat=int(alpha_hat),
|
||||||
|
alpha_prev=int(alpha_prev),
|
||||||
alpha_min=-4096,
|
alpha_min=-4096,
|
||||||
alpha_max=4096,
|
alpha_max=4096,
|
||||||
)
|
)
|
||||||
|
alpha[b] = int(alpha_b)
|
||||||
|
alpha_prev = int(alpha_b)
|
||||||
|
|
||||||
alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)
|
# DPCM-coded scalefactors
|
||||||
|
G_arr[0, j] = float(alpha[0])
|
||||||
G_arr[0, j] = int(alpha[0])
|
|
||||||
sfc[0, j] = int(alpha[0])
|
sfc[0, j] = int(alpha[0])
|
||||||
for b in range(1, NB):
|
for b in range(1, NB):
|
||||||
sfc[b, j] = int(alpha[b] - alpha[b - 1])
|
sfc[b, j] = int(alpha[b] - alpha[b - 1])
|
||||||
|
|
||||||
|
# Quantize MDCT coefficients band-by-band
|
||||||
Sj = np.zeros((128,), dtype=np.int64)
|
Sj = np.zeros((128,), dtype=np.int64)
|
||||||
for b, (lo, hi) in enumerate(bands):
|
for b, (lo, hi) in enumerate(bands):
|
||||||
Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b]))
|
Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b]))
|
||||||
|
|
||||||
# Place this subframe in the packed output (column-major subframe layout)
|
# Store subframe in packed output
|
||||||
S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj
|
S_pack[:, j] = Sj
|
||||||
|
|
||||||
return S_out, sfc, G_arr.astype(np.float64)
|
return S_out, sfc, G_arr
|
||||||
|
|
||||||
# Long frames
|
# -------------------------------------------------------------------------
|
||||||
|
# Long frames: OLS / LSS / LPS, length 1024
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
if X.shape == (1024,):
|
if X.shape == (1024,):
|
||||||
Xv = X
|
Xv = X
|
||||||
elif X.shape == (1024, 1):
|
elif X.shape == (1024, 1):
|
||||||
@ -496,33 +481,44 @@ def aac_quantizer(
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")
|
raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")
|
||||||
|
|
||||||
|
# Compute psychoacoustic threshold T(b) for the long frame
|
||||||
T = _threshold_T_from_SMR(Xv, SMRv, bands)
|
T = _threshold_T_from_SMR(Xv, SMRv, bands)
|
||||||
|
|
||||||
alpha0 = _alpha_initial_from_frame_max(Xv)
|
# Frame-wise initial estimate alpha_hat (Equation 14)
|
||||||
alpha = np.full((NB,), alpha0, dtype=np.int64)
|
alpha_hat = _initial_alpha_hat(Xv)
|
||||||
|
|
||||||
|
# Band-wise scalefactors alpha(b)
|
||||||
|
alpha = np.zeros((NB,), dtype=np.int64)
|
||||||
|
alpha_prev = int(alpha_hat)
|
||||||
|
|
||||||
for b, (lo, hi) in enumerate(bands):
|
for b, (lo, hi) in enumerate(bands):
|
||||||
alpha[b] = _best_alpha_for_band(
|
alpha_b = _best_alpha_for_band(
|
||||||
X=Xv, lo=lo, hi=hi, T_b=float(T[b]),
|
X=Xv,
|
||||||
alpha0=int(alpha[b]),
|
lo=lo,
|
||||||
|
hi=hi,
|
||||||
|
T_b=float(T[b]),
|
||||||
|
alpha_hat=int(alpha_hat),
|
||||||
|
alpha_prev=int(alpha_prev),
|
||||||
alpha_min=-4096,
|
alpha_min=-4096,
|
||||||
alpha_max=4096,
|
alpha_max=4096,
|
||||||
)
|
)
|
||||||
|
alpha[b] = int(alpha_b)
|
||||||
|
alpha_prev = int(alpha_b)
|
||||||
|
|
||||||
alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)
|
# DPCM-coded scalefactors
|
||||||
|
sfc_out: ScaleFactors = np.zeros((NB, 1), dtype=np.int64)
|
||||||
sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64)
|
sfc_out[0, 0] = int(alpha[0])
|
||||||
sfc[0, 0] = int(alpha[0])
|
|
||||||
for b in range(1, NB):
|
for b in range(1, NB):
|
||||||
sfc[b, 0] = int(alpha[b] - alpha[b - 1])
|
sfc_out[b, 0] = int(alpha[b] - alpha[b - 1])
|
||||||
|
|
||||||
G: float = float(alpha[0])
|
G: float = float(alpha[0])
|
||||||
|
|
||||||
|
# Quantize MDCT coefficients band-by-band
|
||||||
S_vec = np.zeros((1024,), dtype=np.int64)
|
S_vec = np.zeros((1024,), dtype=np.int64)
|
||||||
for b, (lo, hi) in enumerate(bands):
|
for b, (lo, hi) in enumerate(bands):
|
||||||
S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b]))
|
S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b]))
|
||||||
|
|
||||||
return S_vec.reshape(1024, 1), sfc, G
|
return S_vec.reshape(1024, 1), sfc_out, G
|
||||||
|
|
||||||
|
|
||||||
def aac_i_quantizer(
|
def aac_i_quantizer(
|
||||||
|
|||||||
@ -310,5 +310,5 @@ def test_end_to_end_level_3_high_snr(mk_actual_stereo_wav: Path, tmp_path: Path)
|
|||||||
|
|
||||||
n = min(x_ref.shape[0], y_hat.shape[0])
|
n = min(x_ref.shape[0], y_hat.shape[0])
|
||||||
s = snr_db(x_ref[:n, :], y_hat[:n, :])
|
s = snr_db(x_ref[:n, :], y_hat[:n, :])
|
||||||
print(f"SNR={s}")
|
# print(f" with SNR={s}")
|
||||||
assert s > 2.0
|
assert s > 7.0
|
||||||
|
|||||||
@ -485,46 +485,16 @@ def aac_coder_3(
|
|||||||
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
||||||
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
||||||
|
|
||||||
# Codebook 11:
|
# sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11)
|
||||||
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
# sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11)
|
||||||
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list)
|
||||||
# sf_cb = 11
|
sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list)
|
||||||
# sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
|
||||||
#
|
|
||||||
# sfc_L_dpcm = np.clip(
|
|
||||||
# sfc_L_dpcm,
|
|
||||||
# -sf_max_abs,
|
|
||||||
# sf_max_abs,
|
|
||||||
# ).astype(np.int64, copy=False)
|
|
||||||
#
|
|
||||||
# sfc_R_dpcm = np.clip(
|
|
||||||
# sfc_R_dpcm,
|
|
||||||
# -sf_max_abs,
|
|
||||||
# sf_max_abs,
|
|
||||||
# ).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_L_stream, cb_sfc_L = aac_encode_huff(
|
|
||||||
sfc_L_dpcm.reshape(-1, order="F"),
|
|
||||||
huff_LUT_list,
|
|
||||||
# force_codebook=11,
|
|
||||||
)
|
|
||||||
sfc_R_stream, cb_sfc_R = aac_encode_huff(
|
|
||||||
sfc_R_dpcm.reshape(-1, order="F"),
|
|
||||||
huff_LUT_list,
|
|
||||||
# force_codebook=11,
|
|
||||||
)
|
|
||||||
|
|
||||||
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
||||||
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
|
raise ValueError(f"Illegal codebook value for frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}.")
|
||||||
|
|
||||||
mdct_L_stream, cb_L = aac_encode_huff(
|
mdct_L_stream, cb_L = aac_encode_huff(np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list)
|
||||||
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
mdct_R_stream, cb_R = aac_encode_huff(np.asarray(S_R, dtype=np.int64).reshape(-1), huff_LUT_list)
|
||||||
huff_LUT_list,
|
|
||||||
)
|
|
||||||
mdct_R_stream, cb_R = aac_encode_huff(
|
|
||||||
np.asarray(S_R, dtype=np.int64).reshape(-1),
|
|
||||||
huff_LUT_list,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Typed dict construction helps static analyzers validate the schema.
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
frame_out: AACSeq3Frame = {
|
frame_out: AACSeq3Frame = {
|
||||||
|
|||||||
Binary file not shown.
@ -485,46 +485,16 @@ def aac_coder_3(
|
|||||||
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
||||||
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
||||||
|
|
||||||
# Codebook 11:
|
# sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11)
|
||||||
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
# sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11)
|
||||||
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list)
|
||||||
# sf_cb = 11
|
sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list)
|
||||||
# sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
|
||||||
#
|
|
||||||
# sfc_L_dpcm = np.clip(
|
|
||||||
# sfc_L_dpcm,
|
|
||||||
# -sf_max_abs,
|
|
||||||
# sf_max_abs,
|
|
||||||
# ).astype(np.int64, copy=False)
|
|
||||||
#
|
|
||||||
# sfc_R_dpcm = np.clip(
|
|
||||||
# sfc_R_dpcm,
|
|
||||||
# -sf_max_abs,
|
|
||||||
# sf_max_abs,
|
|
||||||
# ).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_L_stream, cb_sfc_L = aac_encode_huff(
|
|
||||||
sfc_L_dpcm.reshape(-1, order="F"),
|
|
||||||
huff_LUT_list,
|
|
||||||
# force_codebook=11,
|
|
||||||
)
|
|
||||||
sfc_R_stream, cb_sfc_R = aac_encode_huff(
|
|
||||||
sfc_R_dpcm.reshape(-1, order="F"),
|
|
||||||
huff_LUT_list,
|
|
||||||
# force_codebook=11,
|
|
||||||
)
|
|
||||||
|
|
||||||
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
||||||
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
|
raise ValueError(f"Illegal codebook value for frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}.")
|
||||||
|
|
||||||
mdct_L_stream, cb_L = aac_encode_huff(
|
mdct_L_stream, cb_L = aac_encode_huff(np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list)
|
||||||
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
mdct_R_stream, cb_R = aac_encode_huff(np.asarray(S_R, dtype=np.int64).reshape(-1), huff_LUT_list)
|
||||||
huff_LUT_list,
|
|
||||||
)
|
|
||||||
mdct_R_stream, cb_R = aac_encode_huff(
|
|
||||||
np.asarray(S_R, dtype=np.int64).reshape(-1),
|
|
||||||
huff_LUT_list,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Typed dict construction helps static analyzers validate the schema.
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
frame_out: AACSeq3Frame = {
|
frame_out: AACSeq3Frame = {
|
||||||
|
|||||||
Binary file not shown.
@ -485,46 +485,16 @@ def aac_coder_3(
|
|||||||
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...]
|
||||||
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
sfc_R_dpcm = np.asarray(sfc_R, dtype=np.int64)[1:, ...]
|
||||||
|
|
||||||
# Codebook 11:
|
# sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11)
|
||||||
# maxAbsCodeVal = 16 is RESERVED for ESCAPE.
|
# sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list, force_codebook=11)
|
||||||
# We must stay strictly within [-15, +15] to avoid escape decoding.
|
sfc_L_stream, cb_sfc_L = aac_encode_huff(sfc_L_dpcm.reshape(-1, order="F"), huff_LUT_list)
|
||||||
# sf_cb = 11
|
sfc_R_stream, cb_sfc_R = aac_encode_huff(sfc_R_dpcm.reshape(-1, order="F"), huff_LUT_list)
|
||||||
# sf_max_abs = int(huff_LUT_list[sf_cb]["maxAbsCodeVal"]) - 1 # -> 15
|
|
||||||
#
|
|
||||||
# sfc_L_dpcm = np.clip(
|
|
||||||
# sfc_L_dpcm,
|
|
||||||
# -sf_max_abs,
|
|
||||||
# sf_max_abs,
|
|
||||||
# ).astype(np.int64, copy=False)
|
|
||||||
#
|
|
||||||
# sfc_R_dpcm = np.clip(
|
|
||||||
# sfc_R_dpcm,
|
|
||||||
# -sf_max_abs,
|
|
||||||
# sf_max_abs,
|
|
||||||
# ).astype(np.int64, copy=False)
|
|
||||||
|
|
||||||
sfc_L_stream, cb_sfc_L = aac_encode_huff(
|
|
||||||
sfc_L_dpcm.reshape(-1, order="F"),
|
|
||||||
huff_LUT_list,
|
|
||||||
# force_codebook=11,
|
|
||||||
)
|
|
||||||
sfc_R_stream, cb_sfc_R = aac_encode_huff(
|
|
||||||
sfc_R_dpcm.reshape(-1, order="F"),
|
|
||||||
huff_LUT_list,
|
|
||||||
# force_codebook=11,
|
|
||||||
)
|
|
||||||
|
|
||||||
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
if cb_sfc_L != 11 or cb_sfc_R != 11:
|
||||||
print (f"frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}")
|
raise ValueError(f"Illegal codebook value for frame: {i}: cb_sfc_l={cb_sfc_L}, cb_sfc_r={cb_sfc_R}.")
|
||||||
|
|
||||||
mdct_L_stream, cb_L = aac_encode_huff(
|
mdct_L_stream, cb_L = aac_encode_huff(np.asarray(S_L, dtype=np.int64).reshape(-1), huff_LUT_list)
|
||||||
np.asarray(S_L, dtype=np.int64).reshape(-1),
|
mdct_R_stream, cb_R = aac_encode_huff(np.asarray(S_R, dtype=np.int64).reshape(-1), huff_LUT_list)
|
||||||
huff_LUT_list,
|
|
||||||
)
|
|
||||||
mdct_R_stream, cb_R = aac_encode_huff(
|
|
||||||
np.asarray(S_R, dtype=np.int64).reshape(-1),
|
|
||||||
huff_LUT_list,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Typed dict construction helps static analyzers validate the schema.
|
# Typed dict construction helps static analyzers validate the schema.
|
||||||
frame_out: AACSeq3Frame = {
|
frame_out: AACSeq3Frame = {
|
||||||
|
|||||||
@ -28,13 +28,8 @@ from core.aac_types import *
|
|||||||
# Constants (assignment)
|
# Constants (assignment)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
MAGIC_NUMBER: float = 0.4054
|
MAGIC_NUMBER: float = 0.4054
|
||||||
MQ: int = 8191
|
|
||||||
|
|
||||||
EPS: float = 1e-12
|
EPS: float = 1e-12
|
||||||
MAX_SFC_DIFF: int = 60
|
MAX_SF_DELTA:int = 60
|
||||||
|
|
||||||
# Safeguard: prevents infinite loops in pathological cases
|
|
||||||
MAX_ALPHA_ITERS: int = 2000
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -149,39 +144,35 @@ def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray:
|
|||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Alpha initialization (Eq. 14)
|
# Alpha initialization (Eq. 14)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
def _alpha_initial_from_frame_max(x_all: FloatArray) -> int:
|
def _initial_alpha_hat(X: "FloatArray", MQ: int = 8191) -> int:
|
||||||
"""
|
"""
|
||||||
Compute the initial scalefactor gain alpha_hat for a frame.
|
Compute the initial scalefactor estimate alpha_hat for a frame.
|
||||||
|
|
||||||
Implements Eq. (14):
|
The assignment proposes the following first approximation (Equation 14):
|
||||||
alpha_hat = (16/3) * log2( max_k(|X(k)|^(3/4)) / MQ )
|
|
||||||
|
|
||||||
The same initial value is used for all bands before the per-band refinement.
|
alpha_hat = (16/3) * log2( max_k(|X(k)|)^(3/4) / MQ )
|
||||||
|
|
||||||
|
where max_k runs over all MDCT coefficients of the frame (not per band),
|
||||||
|
and MQ is the maximum quantization level parameter (2*MQ + 1 levels).
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
x_all : FloatArray
|
X : FloatArray
|
||||||
All MDCT coefficients of a frame (one channel), flattened.
|
MDCT coefficients of one frame (or one ESH subframe), shape (N,).
|
||||||
|
MQ : int
|
||||||
|
Quantizer parameter (default 8191, as per assignment).
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
int
|
int
|
||||||
Initial alpha value (integer).
|
Integer alpha_hat (rounded to nearest integer).
|
||||||
"""
|
"""
|
||||||
x_all = np.asarray(x_all, dtype=np.float64).reshape(-1)
|
x_max = float(np.max(np.abs(X)))
|
||||||
if x_all.size == 0:
|
if x_max <= 0.0:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
max_abs = float(np.max(np.abs(x_all)))
|
alpha_hat = (16.0 / 3.0) * np.log2((x_max ** (3.0 / 4.0)) / float(MQ))
|
||||||
if max_abs <= 0.0:
|
return int(np.round(alpha_hat))
|
||||||
return 0
|
|
||||||
|
|
||||||
val = (max_abs ** 0.75) / float(MQ)
|
|
||||||
if val <= 0.0:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
alpha_hat = (16.0 / 3.0) * np.log2(val)
|
|
||||||
return int(np.floor(alpha_hat))
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -279,35 +270,38 @@ def _threshold_T_from_SMR(
|
|||||||
# Alpha selection per band + neighbor-difference constraint
|
# Alpha selection per band + neighbor-difference constraint
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
def _best_alpha_for_band(
|
def _best_alpha_for_band(
|
||||||
X: FloatArray,
|
X: "FloatArray", lo: int, hi: int, T_b: float,
|
||||||
lo: int,
|
alpha_hat: int, alpha_prev: int, alpha_min: int, alpha_max: int,
|
||||||
hi: int,
|
|
||||||
T_b: float,
|
|
||||||
alpha0: int,
|
|
||||||
alpha_min: int,
|
|
||||||
alpha_max: int,
|
|
||||||
) -> int:
|
) -> int:
|
||||||
"""
|
"""
|
||||||
Determine the band-wise scalefactor alpha(b) by iteratively increasing alpha.
|
Determine the band-wise scalefactor alpha(b) following the assignment.
|
||||||
|
|
||||||
The algorithm increases alpha until the quantization error power exceeds
|
Procedure:
|
||||||
the band threshold:
|
- Start from a frame-wise initial estimate alpha_hat.
|
||||||
|
- Iteratively increase alpha(b) by 1 as long as the quantization error power
|
||||||
|
stays below the psychoacoustic threshold T(b): P_e(b) = sum_{k in band} ( X(k) - Xhat(k) )^2
|
||||||
|
|
||||||
P_e(b) = sum_{k in band} (X(k) - Xhat(k))^2
|
- Stop increasing alpha(b) if the neighbor constraint would be violated: |alpha(b) - alpha(b-1)| <= 60
|
||||||
select the largest alpha such that P_e(b) <= T(b)
|
When processing bands sequentially (low -> high), this becomes: alpha(b) <= alpha_prev + 60
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- This function does not decrease alpha if the initial value already violates
|
||||||
|
the threshold; the assignment only specifies iterative increase.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
X : FloatArray
|
X : FloatArray
|
||||||
Full MDCT vector (one frame or one subframe), shape (N,).
|
Full MDCT vector of the current (sub)frame, shape (N,).
|
||||||
lo, hi : int
|
lo, hi : int
|
||||||
Inclusive MDCT index range defining the band.
|
Band index bounds (inclusive), defining the band slice.
|
||||||
T_b : float
|
T_b : float
|
||||||
Psychoacoustic threshold for this band.
|
Threshold T(b) for this band.
|
||||||
alpha0 : int
|
alpha_hat : int
|
||||||
Initial alpha value for the band.
|
Initial frame-wise estimate (Equation 14).
|
||||||
|
alpha_prev : int
|
||||||
|
Previously selected alpha for band b-1 (neighbor constraint reference).
|
||||||
alpha_min, alpha_max : int
|
alpha_min, alpha_max : int
|
||||||
Bounds for alpha, used as a safeguard.
|
Safeguard bounds for alpha.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -315,81 +309,41 @@ def _best_alpha_for_band(
|
|||||||
Selected integer alpha(b).
|
Selected integer alpha(b).
|
||||||
"""
|
"""
|
||||||
if T_b <= 0.0:
|
if T_b <= 0.0:
|
||||||
return int(alpha0)
|
return int(alpha_hat)
|
||||||
|
|
||||||
Xsec = X[lo : hi + 1]
|
Xsec = X[lo : hi + 1]
|
||||||
|
|
||||||
alpha = int(alpha0)
|
# Neighbor constraint (sequential processing): alpha(b) <= alpha_prev + 60
|
||||||
alpha = max(alpha_min, min(alpha, alpha_max))
|
alpha_limit = min(int(alpha_max), int(alpha_prev) + MAX_SF_DELTA)
|
||||||
|
|
||||||
|
# Start from alpha_hat, clamped to feasible range
|
||||||
|
alpha = int(alpha_hat)
|
||||||
|
alpha = max(int(alpha_min), min(alpha, int(alpha_limit)))
|
||||||
|
|
||||||
# Evaluate at current alpha
|
# Evaluate at current alpha
|
||||||
Ssec = _quantize_symbol(Xsec, alpha)
|
Ssec = _quantize_symbol(Xsec, alpha)
|
||||||
Xhat = _dequantize_symbol(Ssec, alpha)
|
Xhat = _dequantize_symbol(Ssec, alpha)
|
||||||
Pe = float(np.sum((Xsec - Xhat) ** 2))
|
Pe = float(np.sum((Xsec - Xhat) ** 2))
|
||||||
|
|
||||||
|
# If already above threshold, return current alpha (no decrease step specified)
|
||||||
if Pe > T_b:
|
if Pe > T_b:
|
||||||
return alpha
|
return alpha
|
||||||
|
|
||||||
iters = 0
|
# Increase alpha while still under threshold and within constraints
|
||||||
while iters < MAX_ALPHA_ITERS:
|
while True:
|
||||||
iters += 1
|
|
||||||
alpha_next = alpha + 1
|
alpha_next = alpha + 1
|
||||||
if alpha_next > alpha_max:
|
if alpha_next > alpha_limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
Ssec = _quantize_symbol(Xsec, alpha_next)
|
Ssec = _quantize_symbol(Xsec, alpha_next)
|
||||||
Xhat = _dequantize_symbol(Ssec, alpha_next)
|
Xhat = _dequantize_symbol(Ssec, alpha_next)
|
||||||
Pe_next = float(np.sum((Xsec - Xhat) ** 2))
|
Pe_next = float(np.sum((Xsec - Xhat) ** 2))
|
||||||
|
|
||||||
if Pe_next > T_b:
|
if Pe_next > T_b:
|
||||||
break
|
break
|
||||||
|
|
||||||
alpha = alpha_next
|
alpha = alpha_next
|
||||||
Pe = Pe_next
|
|
||||||
|
|
||||||
return alpha
|
return alpha
|
||||||
|
|
||||||
|
|
||||||
def _enforce_max_diff(alpha: np.ndarray, max_diff: int = MAX_SFC_DIFF) -> np.ndarray:
|
|
||||||
"""
|
|
||||||
Enforce neighbor constraint |alpha[b] - alpha[b-1]| <= max_diff by clamping.
|
|
||||||
|
|
||||||
Uses a forward pass and a backward pass to reduce drift.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
alpha : np.ndarray
|
|
||||||
Alpha vector, shape (NB,).
|
|
||||||
max_diff : int
|
|
||||||
Maximum allowed absolute difference between adjacent bands.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
np.ndarray
|
|
||||||
Clamped alpha vector, int64, shape (NB,).
|
|
||||||
"""
|
|
||||||
a = np.asarray(alpha, dtype=np.int64).copy()
|
|
||||||
nb = a.shape[0]
|
|
||||||
|
|
||||||
for b in range(1, nb):
|
|
||||||
lo = int(a[b - 1] - max_diff)
|
|
||||||
hi = int(a[b - 1] + max_diff)
|
|
||||||
if a[b] < lo:
|
|
||||||
a[b] = lo
|
|
||||||
elif a[b] > hi:
|
|
||||||
a[b] = hi
|
|
||||||
|
|
||||||
for b in range(nb - 2, -1, -1):
|
|
||||||
lo = int(a[b + 1] - max_diff)
|
|
||||||
hi = int(a[b + 1] + max_diff)
|
|
||||||
if a[b] < lo:
|
|
||||||
a[b] = lo
|
|
||||||
elif a[b] > hi:
|
|
||||||
a[b] = hi
|
|
||||||
|
|
||||||
return a
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Public API
|
# Public API
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -401,7 +355,16 @@ def aac_quantizer(
|
|||||||
"""
|
"""
|
||||||
AAC quantizer for one channel (Level 3).
|
AAC quantizer for one channel (Level 3).
|
||||||
|
|
||||||
Quantizes MDCT coefficients using band-wise scalefactors derived from SMR.
|
Quantizes MDCT coefficients (after TNS) using band-wise scalefactors derived
|
||||||
|
from psychoacoustic thresholds computed via SMR.
|
||||||
|
|
||||||
|
The implementation follows the assignment procedure:
|
||||||
|
- Compute an initial frame-wise alpha_hat using Equation (14), based on the
|
||||||
|
maximum MDCT coefficient magnitude of the (sub)frame.
|
||||||
|
- For each band b, increase alpha(b) by 1 while the quantization error power
|
||||||
|
P_e(b) stays below the threshold T(b).
|
||||||
|
- Enforce the neighbor constraint |alpha(b) - alpha(b-1)| <= 60 during the
|
||||||
|
band-by-band search (no post-processing needed).
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@ -421,16 +384,19 @@ def aac_quantizer(
|
|||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
S : QuantizedSymbols
|
S : QuantizedSymbols
|
||||||
Quantized symbols S(k), shape (1024, 1) for all frame types.
|
Quantized symbols S(k), packed as shape (1024, 1) for all frame types.
|
||||||
|
For ESH, the 8 subframes are packed in column-major subframe layout.
|
||||||
sfc : ScaleFactors
|
sfc : ScaleFactors
|
||||||
DPCM-coded scalefactor differences sfc(b) = alpha(b) - alpha(b-1).
|
DPCM-coded scalefactors:
|
||||||
|
sfc(0) = alpha(0) = G
|
||||||
|
sfc(b) = alpha(b) - alpha(b-1), for b > 0
|
||||||
Shapes:
|
Shapes:
|
||||||
- Long: (NB, 1)
|
- Long: (NB, 1)
|
||||||
- ESH: (NB, 8)
|
- ESH: (NB, 8)
|
||||||
G : GlobalGain
|
G : GlobalGain
|
||||||
Global gain G = alpha(0).
|
Global gain G = alpha(0).
|
||||||
- Long: scalar float
|
- Long: scalar float
|
||||||
- ESH: array shape (1, 8)
|
- ESH: array shape (1, 8), dtype float64
|
||||||
"""
|
"""
|
||||||
bands = _band_slices(frame_type)
|
bands = _band_slices(frame_type)
|
||||||
NB = len(bands)
|
NB = len(bands)
|
||||||
@ -438,6 +404,9 @@ def aac_quantizer(
|
|||||||
X = np.asarray(frame_F, dtype=np.float64)
|
X = np.asarray(frame_F, dtype=np.float64)
|
||||||
SMR = np.asarray(SMR, dtype=np.float64)
|
SMR = np.asarray(SMR, dtype=np.float64)
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# ESH: 8 short subframes, each of length 128
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
if frame_type == "ESH":
|
if frame_type == "ESH":
|
||||||
if X.shape != (128, 8):
|
if X.shape != (128, 8):
|
||||||
raise ValueError("For ESH, frame_F must have shape (128, 8).")
|
raise ValueError("For ESH, frame_F must have shape (128, 8).")
|
||||||
@ -446,42 +415,58 @@ def aac_quantizer(
|
|||||||
|
|
||||||
S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
|
S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
|
||||||
sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)
|
sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)
|
||||||
G_arr = np.zeros((1, 8), dtype=np.int64)
|
G_arr = np.zeros((1, 8), dtype=np.float64)
|
||||||
|
|
||||||
|
# Packed output view: (128, 8) with column-major layout
|
||||||
|
S_pack = S_out[:, 0].reshape(128, 8, order="F")
|
||||||
|
|
||||||
for j in range(8):
|
for j in range(8):
|
||||||
Xj = X[:, j].reshape(128)
|
Xj = X[:, j].reshape(128)
|
||||||
SMRj = SMR[:, j].reshape(NB)
|
SMRj = SMR[:, j].reshape(NB)
|
||||||
|
|
||||||
|
# Compute psychoacoustic threshold T(b) for this subframe
|
||||||
T = _threshold_T_from_SMR(Xj, SMRj, bands)
|
T = _threshold_T_from_SMR(Xj, SMRj, bands)
|
||||||
|
|
||||||
alpha0 = _alpha_initial_from_frame_max(Xj)
|
# Frame-wise initial estimate alpha_hat (Equation 14)
|
||||||
alpha = np.full((NB,), alpha0, dtype=np.int64)
|
alpha_hat = _initial_alpha_hat(Xj)
|
||||||
|
|
||||||
|
# Band-wise scalefactors alpha(b)
|
||||||
|
alpha = np.zeros((NB,), dtype=np.int64)
|
||||||
|
alpha_prev = int(alpha_hat)
|
||||||
|
|
||||||
for b, (lo, hi) in enumerate(bands):
|
for b, (lo, hi) in enumerate(bands):
|
||||||
alpha[b] = _best_alpha_for_band(
|
alpha_b = _best_alpha_for_band(
|
||||||
X=Xj, lo=lo, hi=hi, T_b=float(T[b]),
|
X=Xj,
|
||||||
alpha0=int(alpha[b]),
|
lo=lo,
|
||||||
|
hi=hi,
|
||||||
|
T_b=float(T[b]),
|
||||||
|
alpha_hat=int(alpha_hat),
|
||||||
|
alpha_prev=int(alpha_prev),
|
||||||
alpha_min=-4096,
|
alpha_min=-4096,
|
||||||
alpha_max=4096,
|
alpha_max=4096,
|
||||||
)
|
)
|
||||||
|
alpha[b] = int(alpha_b)
|
||||||
|
alpha_prev = int(alpha_b)
|
||||||
|
|
||||||
alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)
|
# DPCM-coded scalefactors
|
||||||
|
G_arr[0, j] = float(alpha[0])
|
||||||
G_arr[0, j] = int(alpha[0])
|
|
||||||
sfc[0, j] = int(alpha[0])
|
sfc[0, j] = int(alpha[0])
|
||||||
for b in range(1, NB):
|
for b in range(1, NB):
|
||||||
sfc[b, j] = int(alpha[b] - alpha[b - 1])
|
sfc[b, j] = int(alpha[b] - alpha[b - 1])
|
||||||
|
|
||||||
|
# Quantize MDCT coefficients band-by-band
|
||||||
Sj = np.zeros((128,), dtype=np.int64)
|
Sj = np.zeros((128,), dtype=np.int64)
|
||||||
for b, (lo, hi) in enumerate(bands):
|
for b, (lo, hi) in enumerate(bands):
|
||||||
Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b]))
|
Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b]))
|
||||||
|
|
||||||
# Place this subframe in the packed output (column-major subframe layout)
|
# Store subframe in packed output
|
||||||
S_out[:, 0].reshape(128, 8, order="F")[:, j] = Sj
|
S_pack[:, j] = Sj
|
||||||
|
|
||||||
return S_out, sfc, G_arr.astype(np.float64)
|
return S_out, sfc, G_arr
|
||||||
|
|
||||||
# Long frames
|
# -------------------------------------------------------------------------
|
||||||
|
# Long frames: OLS / LSS / LPS, length 1024
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
if X.shape == (1024,):
|
if X.shape == (1024,):
|
||||||
Xv = X
|
Xv = X
|
||||||
elif X.shape == (1024, 1):
|
elif X.shape == (1024, 1):
|
||||||
@ -496,33 +481,44 @@ def aac_quantizer(
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")
|
raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")
|
||||||
|
|
||||||
|
# Compute psychoacoustic threshold T(b) for the long frame
|
||||||
T = _threshold_T_from_SMR(Xv, SMRv, bands)
|
T = _threshold_T_from_SMR(Xv, SMRv, bands)
|
||||||
|
|
||||||
alpha0 = _alpha_initial_from_frame_max(Xv)
|
# Frame-wise initial estimate alpha_hat (Equation 14)
|
||||||
alpha = np.full((NB,), alpha0, dtype=np.int64)
|
alpha_hat = _initial_alpha_hat(Xv)
|
||||||
|
|
||||||
|
# Band-wise scalefactors alpha(b)
|
||||||
|
alpha = np.zeros((NB,), dtype=np.int64)
|
||||||
|
alpha_prev = int(alpha_hat)
|
||||||
|
|
||||||
for b, (lo, hi) in enumerate(bands):
|
for b, (lo, hi) in enumerate(bands):
|
||||||
alpha[b] = _best_alpha_for_band(
|
alpha_b = _best_alpha_for_band(
|
||||||
X=Xv, lo=lo, hi=hi, T_b=float(T[b]),
|
X=Xv,
|
||||||
alpha0=int(alpha[b]),
|
lo=lo,
|
||||||
|
hi=hi,
|
||||||
|
T_b=float(T[b]),
|
||||||
|
alpha_hat=int(alpha_hat),
|
||||||
|
alpha_prev=int(alpha_prev),
|
||||||
alpha_min=-4096,
|
alpha_min=-4096,
|
||||||
alpha_max=4096,
|
alpha_max=4096,
|
||||||
)
|
)
|
||||||
|
alpha[b] = int(alpha_b)
|
||||||
|
alpha_prev = int(alpha_b)
|
||||||
|
|
||||||
alpha = _enforce_max_diff(alpha, MAX_SFC_DIFF)
|
# DPCM-coded scalefactors
|
||||||
|
sfc_out: ScaleFactors = np.zeros((NB, 1), dtype=np.int64)
|
||||||
sfc: ScaleFactors = np.zeros((NB, 1), dtype=np.int64)
|
sfc_out[0, 0] = int(alpha[0])
|
||||||
sfc[0, 0] = int(alpha[0])
|
|
||||||
for b in range(1, NB):
|
for b in range(1, NB):
|
||||||
sfc[b, 0] = int(alpha[b] - alpha[b - 1])
|
sfc_out[b, 0] = int(alpha[b] - alpha[b - 1])
|
||||||
|
|
||||||
G: float = float(alpha[0])
|
G: float = float(alpha[0])
|
||||||
|
|
||||||
|
# Quantize MDCT coefficients band-by-band
|
||||||
S_vec = np.zeros((1024,), dtype=np.int64)
|
S_vec = np.zeros((1024,), dtype=np.int64)
|
||||||
for b, (lo, hi) in enumerate(bands):
|
for b, (lo, hi) in enumerate(bands):
|
||||||
S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b]))
|
S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b]))
|
||||||
|
|
||||||
return S_vec.reshape(1024, 1), sfc, G
|
return S_vec.reshape(1024, 1), sfc_out, G
|
||||||
|
|
||||||
|
|
||||||
def aac_i_quantizer(
|
def aac_i_quantizer(
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user