diff --git a/report/.gitignore b/report/.gitignore new file mode 100644 index 0000000..bd9a553 --- /dev/null +++ b/report/.gitignore @@ -0,0 +1,6 @@ +# Report related files +*.aux +*.out +*.log +*.synctex.gz +_minted-report/* diff --git a/report/figures/bitrate_per_frame.png b/report/figures/bitrate_per_frame.png new file mode 100644 index 0000000..e946c1f Binary files /dev/null and b/report/figures/bitrate_per_frame.png differ diff --git a/report/figures/compression_per_frame.png b/report/figures/compression_per_frame.png new file mode 100644 index 0000000..d346ae2 Binary files /dev/null and b/report/figures/compression_per_frame.png differ diff --git a/report/report.pdf b/report/report.pdf new file mode 100644 index 0000000..8fa5d3c Binary files /dev/null and b/report/report.pdf differ diff --git a/report/report.tex b/report/report.tex new file mode 100644 index 0000000..8be112f --- /dev/null +++ b/report/report.tex @@ -0,0 +1,585 @@ +% +% !TEX TS-program = xelatex +% !TEX encoding = UTF-8 Unicode +% !TEX spellcheck = el-GR +% +% Information Systems Security assignment report +% +% Requires compilation with XeLaTeX +% +% authors: +% Χρήστος Χουτουρίδης ΑΕΜ 8997 +% cchoutou@ece.auth.gr + + +% Options: +% +% 1) mainlang= +% Default: english +% Set the default language of the document which affects hyphenations, +% localization (section, dates, etc...) 
+% +% example: \documentclass[mainlang=greek]{AUThReport} +% +% 2) +% Add hyphenation and typesetting support for other languages +% Currently supports: english, greek, german, french +% +% example: \documentclass[english, greek]{AUThReport} +% +% 3) short: Requests a shorter title for the document +% Default: no short +% +% example: \documentclass[short]{AUThReport} +% +\documentclass[a4paper, 11pt, mainlang=greek, english]{AUThReport/AUThReport} + +\CurrentDate{\today} + +% Greek report document setup suggestions +%--------------------------------- +% Document configuration +\AuthorName{Χρήστος Χουτουρίδης} +\AuthorAEM{8997} +\AuthorMail{cchoutou@ece.auth.gr} + +%\CoAuthorName{CoAuthor Name} +%\CoAuthorAEM{AEM} +%\CoAuthorMail{CoAuthor Mail} + +% \WorkGroup{Ομάδα Χ} + +\DocTitle{Εργασία Εξαμήνου} +\DocSubTitle{Απλοποιημένη κωδικοποίηση-αποκωδικοποίηση AAC} + +\Department{Τμήμα ΗΜΜΥ. Τομέας Ηλεκτρονικής} +\ClassName{Συστήματα Πολυμέσων και Εικονική Πραγματικότητα} + +\InstructorName{Αναστάσιος Ντελόπουλος} +\InstructorMail{antelopo@ece.auth.gr} + +%\CoInstructorName{} +%\CoInstructorMail{} + + +% Local package requirements +%--------------------------------- +%\usepackage{tabularx} +%\usepackage{array} +%\usepackage{commath} + +\usepackage{amsmath, amssymb, amsfonts} +\usepackage{graphicx} +\usepackage{caption} +\usepackage{subcaption} +\usepackage{float} + +\captionsetup[figure]{name=Εικόνα} + +% Requires: -shell-escape compile argument +\usepackage{minted} +\usepackage{xcolor} % + +\setminted[python]{ + fontsize=\small, + breaklines, + autogobble, + baselinestretch=1.1, + tabsize=2, + numbersep=8pt, + startinline, + gobble=0 +} + +\setminted[bash]{ + fontsize=\small, + breaklines, + autogobble, + baselinestretch=1.1, + tabsize=2, + numbersep=8pt, + startinline, + gobble=0 +} + +\newcommand{\repo}{https://git.hoo2.net/hoo2/Multimedia_AAC_Project} + + +\begin{document} + +% Request a title page or header +\InsertTitle + +\section{Εισαγωγή} + +Η παρούσα εργασία 
αφορά την υλοποίηση ενός απλοποιημένου κωδικοποιητή και αποκωδικοποιητή ήχου κατά το πρότυπο \textbf{Advanced Audio Coding} (AAC). +Το AAC αποτελεί μία μέθοδο κωδικοποίησης μετασχηματισμού (transform coding) η οποία συνδυάζει ανάλυση στο πεδίο της συχνότητας, ψυχοακουστικό μοντέλο και κωδικοποίηση εντροπίας, με στόχο την αποδοτική συμπίεση ηχητικών σημάτων υψηλής ποιότητας. +Η βασική αρχή λειτουργίας του βασίζεται στη μείωση της πλεονάζουσας πληροφορίας μέσω του μετασχηματισμού \textbf{MDCT} και στην εκμετάλλευση των ιδιοτήτων της ανθρώπινης ακοής ώστε να επιτρέπεται ελεγχόμενη απώλεια πληροφορίας που δεν είναι αντιληπτή. + +Στο πλαίσιο της εργασίας υλοποιούμε μία \textit{απλοποιημένη εκδοχή} του προτύπου, όπως αυτή περιγράφεται στην εκφώνηση, παραλείποντας ορισμένες βαθμίδες όπως το Mid/Side stereo και το Bit Reservoir, διατηρώντας όμως τον βασικό κορμό της αλυσίδας κωδικοποίησης. +Συγκεκριμένα, αναπτύσσουμε διαδοχικά τις βαθμίδες Sequence Segmentation Control, Filterbank (MDCT/IMDCT), Temporal Noise Shaping, Psychoacoustic Model, Quantization και Huffman coding, καθώς και τις αντίστροφες διαδικασίες αποκωδικοποίησης. + +Στην παρούσα αναφορά περιγράφουμε τη θεωρητική βάση κάθε βαθμίδας, τον τρόπο υλοποίησής της και τα αποτελέσματα που προκύπτουν από την πειραματική αξιολόγηση του συστήματος. + +\subsection{Παραδοτέα} +Τα παραδοτέα της εργασίας αποτελούνται από: +\begin{itemize} + \item Την παρούσα αναφορά. + \item Τους καταλόγους \texttt{level\_i}, $i=1,2,3$ με τον κώδικα της εφαρμογής. + Ο καθένας περιέχει ένα αντίστοιχο script \texttt{level\_i.py} που καλεί την demonstration συνάρτηση του αντίστοιχου level. + \item Το \href{\repo}{σύνδεσμο} με το αποθετήριο που περιέχει όλο το project με τον κώδικα της εφαρμογής και της παρούσας αναφοράς. 
+\end{itemize} + +\section{Υλοποίηση} + +Η υλοποίηση οργανώνεται \textbf{αρθρωτά}, με σαφή διαχωρισμό των επιμέρους λειτουργικών βαθμίδων, ώστε κάθε στάδιο της κωδικοποίησης και της αποκωδικοποίησης να μπορεί να ελεγχθεί και να επαληθευτεί ανεξάρτητα. +Η δομή αυτή μας επιτρέπει να αναπτύξουμε σταδιακά τρία επίπεδα ολοκλήρωσης, από την πλήρως αναστρέψιμη μετασχηματιστική κωδικοποίηση έως την πλήρη απωλεστική κωδικοποίηση με ψυχοακουστικό έλεγχο και κωδικοποίηση εντροπίας. + +Οι επιμέρους βαθμίδες του κωδικοποιητή και του αποκωδικοποιητή υλοποιήθηκαν ως \textbf{ανεξάρτητα modules}, τα οποία επαναχρησιμοποιούνται σε κάθε κατάλογο \texttt{level\_}. +Αν και η δομή του παραδοτέου δίνει την εντύπωση ότι οι κατάλογοι των επιπέδων περιέχουν αντίγραφα των ίδιων αρχείων, στην πράξη αυτό δεν ισχύει. +Η αρχική μας προσέγγιση ήταν η ύπαρξη ενός κεντρικού καταλόγου με κοινή υλοποίηση για όλα τα επίπεδα, ωστόσο οι περιορισμοί της εκφώνησης δεν επέτρεπαν τέτοια οργάνωση. +Αντί της χρήσης υπο-αποθετηρίων, επιλέξαμε τη λύση των hard links, ώστε τα αρχεία του κοινού πυρήνα να εμφανίζονται σε κάθε κατάλογο επιπέδου χωρίς να υπάρχει πραγματικός διπλασιασμός του κώδικα. +Με τον τρόπο αυτό διατηρούμε \textbf{ένα ενιαίο σημείο που συντηρείται ο κώδικας} και αποφεύγουμε αποκλίσεις μεταξύ των επιπέδων. + +Κάθε module αναπτύχθηκε ανεξάρτητα, με εκτεταμένη χρήση αυτοματοποιημένων δοκιμών. +Η ανάπτυξη ακολούθησε προσέγγιση \textbf{Test-Driven Development}, όπου τα tests σχεδιάστηκαν με βάση το δημόσιο interface κάθε module και περιγράφουν τη λειτουργική του συμπεριφορά. +Τα tests έχουν λειτουργικό και συμβατικό χαρακτήρα (functional and contract-based) και ελέγχουν σχήματα δεδομένων, αριθμητική σταθερότητα, οριακές περιπτώσεις και ιδιότητες αναστρεψιμότητας. +Η πρακτική αυτή μας επέτρεψε να εκτελούμε εύκολα \textbf{regression tests} μετά από κάθε τροποποίηση, διασφαλίζοντας ότι δεν εισάγονται ανεπιθύμητες παρενέργειες σε άλλα μέρη του συστήματος. 
+ +Δημιουργήσαμε επίσης wrapper module για το δοθέν module κωδικοποίησης Huffman. +Κατά την ενσωμάτωσή του \textbf{εντοπίστηκε σφάλμα τύπου off-by-one} στον μηχανισμό αποκωδικοποίησης escape τιμών του codebook 11. +Το σφάλμα αφορούσε τον χειρισμό του bit διαχωρισμού (delimiter) στη μορφή κωδικοποίησης των escape τιμών. +Συγκεκριμένα, κατά την αποκωδικοποίηση μετρούνταν σωστά τα N διαδοχικά bits τιμής ‘1’, όμως το επόμενο bit ‘0’, που λειτουργεί ως διαχωριστής, δεν παραλειπόταν πριν την ανάγνωση των N+4 bits του payload. +Ως αποτέλεσμα, το bit διαχωρισμού συμπεριλαμβανόταν εσφαλμένα στα bits της escape τιμής, οδηγώντας σε μετατόπιση κατά ένα bit και σε λανθασμένη ανακατασκευή του μεγέθους. +Το σφάλμα αυτό δεν προκαλούσε εξαίρεση εκτέλεσης, αλλά αλλοίωνε τις διαφορές των scalefactors. +Η διόρθωση συνίσταται στη σωστή κατανάλωση του bit διαχωρισμού πριν την ανάγνωση των N+4 bits της escape τιμής. +Η προτεινόμενη αλλαγή κοινοποιήθηκε στους διδάσκοντες και έγινε αποδεκτή. + +Συνοπτικά, τα βασικά modules του συστήματος είναι τα ακόλουθα: + +\begin{itemize} + \item \textbf{\texttt{aac\_ssc}}: Υλοποίηση της βαθμίδας Sequence Segmentation Control. + \item \textbf{\texttt{aac\_filterbank}}: Υλοποίηση MDCT/IMDCT και διαχείριση παραθύρων. + \item \textbf{\texttt{aac\_tns}}: Υλοποίηση Temporal Noise Shaping. + \item \textbf{\texttt{aac\_psycho}}: Υλοποίηση ψυχοακουστικού μοντέλου και υπολογισμός SMR. + \item \textbf{\texttt{aac\_quantizer}}: Υλοποίηση μη ομοιόμορφου κβαντιστή και αντίστροφης κβάντισης. + \item \textbf{\texttt{aac\_huffman}}: Διαχείριση κωδικοποίησης και αποκωδικοποίησης Huffman. + \item \textbf{\texttt{aac\_coder} / \texttt{aac\_decoder}}: Σύνθεση της πλήρους ροής κωδικοποίησης και αποκωδικοποίησης. + \item \textbf{\texttt{aac\_utils}}: Βοηθητικές συναρτήσεις και μετρικές αξιολόγησης. +\end{itemize} + +Για την επιτυχή εκτέλεση του κώδικα απαιτείται η εγκατάσταση των εξαρτήσεων. 
+Για το λόγο αυτό από τον κεντρικό κατάλογο πρέπει να εκτελεστεί: +\begin{minted}{bash} + pip install -r requirements.txt +\end{minted} + +Τα αυτοματοποιημένα tests δεν απαιτούνται από την εκφώνηση και δεν εκτελούνται από τα scripts των επιπέδων. +Παρ' όλα αυτά, ο αναγνώστης μπορεί να τα εκτελέσει από τον κεντρικό κατάλογο του project: +\begin{minted}{bash} + pytest -v + # to run a specific module (for example filterbank) + pytest -v core/tests/test_filterbank.py +\end{minted} + + +\section{Level 1 -- SSC και MDCT} + +\subsection{Απαιτήσεις Level 1} + +Στο πρώτο επίπεδο ζητείται η υλοποίηση ενός πλήρως αναστρέψιμου συστήματος κωδικοποίησης--αποκωδικοποίησης βασισμένου στον μετασχηματισμό MDCT. +Το επίπεδο αυτό δεν περιλαμβάνει ψυχοακουστικό μοντέλο, κβάντιση ή κωδικοποίηση εντροπίας. +Στόχος είναι η σωστή υλοποίηση του Sequence Segmentation Control και του Filterbank, έτσι ώστε \textbf{η ανακατασκευή του σήματος να είναι αριθμητικά ταυτόσημη με το αρχικό}, μέχρι αριθμητική ακρίβεια κινητής υποδιαστολής. + +Το Level 1 λειτουργεί ως θεμέλιο για τα επόμενα επίπεδα, καθώς οποιοδήποτε σφάλμα στο μετασχηματιστικό στάδιο θα μεταφερόταν και θα ενισχυόταν στα επόμενα στάδια. + +\subsection{Modules που χρησιμοποιούνται} + +Για το Level 1 χρησιμοποιούνται τα εξής modules: + +\begin{itemize} + \item \textbf{\texttt{aac\_ssc}} για την επιλογή τύπου frame. + \item \textbf{\texttt{aac\_filterbank}} για MDCT/IMDCT και OLA. + \item \textbf{\texttt{aac\_coder\_1}} και \textbf{\texttt{aac\_decoder\_1}} για τη σύνθεση της ροής. + \item \textbf{\texttt{aac\_utils}} για βοηθητικές συναρτήσεις και υπολογισμό SNR. +\end{itemize} + +\subsection{Sequence Segmentation Control} + +Η βαθμίδα SSC υλοποιεί τον μηχανισμό επιλογής τύπου παραθύρου (OLS, LSS, ESH, LPS) με βάση την ανίχνευση spikes (attack detection). +Η ανίχνευση βασίζεται στον υπολογισμό της ενέργειας σε υπο-τμήματα μήκους 128 δειγμάτων και στον λόγο διαδοχικών ενεργειών. 
+ +Η υλοποίηση είναι πλήρως συμμετρική για τα δύο κανάλια και ο τελικός τύπος frame προκύπτει από συγχώνευση των δύο καναλιών σύμφωνα με τον πίνακα μετάβασης. +Επίσης, δόθηκε ιδιαίτερη προσοχή στην αριθμητική σταθερότητα, ώστε να αποφεύγονται διαιρέσεις με μηδενική ενέργεια. + +\subsection{Filterbank και MDCT} + +Το module \texttt{aac\_filterbank} υλοποιεί τον MDCT και τον αντίστροφό του IMDCT σε διακριτή μορφή. +Χρησιμοποιούνται παράθυρα τύπου SIN και KBD, σύμφωνα με τον τύπο frame. +Η υλοποίηση διαχειρίζεται σωστά τις μεταβάσεις μεταξύ παραθύρων, εφαρμόζοντας overlap-add (OLA) ώστε να επιτυγχάνεται perfect reconstruction σε steady state. + +Τόσο η ιδιότητα γραμμικότητας του μετασχηματισμού, όσο και η σχέση ενίσχυσης: +\[ +\text{MDCT}(\text{IMDCT}(X)) \approx 2X +\] +ελέγχθηκαν αριθμητικά στο πλαίσιο των tests. + +Ενδεικτικά να αναφέρουμε το παράδειγμα στον έλεγχο της σχέσης ενίσχυσης: +\begin{minted}{python} + X: MdctCoeffs = rng.normal(size=K) + x: TimeSignal = imdct(X) + X_hat: MdctCoeffs = mdct(x) + _assert_allclose(X_hat, 2.0 * X, rtol=tolerance, atol=tolerance) +\end{minted} + + +\subsection{Φιλοσοφία Ελέγχου Ορθότητας} + +Ως γνωστόν, στον προγραμματισμό και στο quality assurance, \textbf{δεν μπορεί να υπάρχει απόλυτη απόδειξη ορθότητας}. +Υπάρχει μόνο μείωση της πιθανότητας σφάλματος μέσω στοχευμένων ελέγχων. + +Για το Level 1 ελέγχθηκαν: + +\begin{itemize} + \item Ιδιότητες γραμμικότητας MDCT/IMDCT. + \item Απουσία NaN/Inf σε τυχαίες εισόδους. + \item Σωστή διαχείριση μεταβάσεων παραθύρων. + \item Ορθή συμπεριφορά SSC σε edge cases. + \item End-to-end ταυτότητα σήματος με υψηλό SNR. +\end{itemize} + +Τα tests μπορούν να εκτελεστούν με: + +\begin{minted}{bash} + pytest -v core/tests/test_filterbank.py + pytest -v core/tests/test_ssc.py + pytest -v core/tests/test_utils.py +\end{minted} + +Η επιτυχής εκτέλεση των παραπάνω tests διασφαλίζει ότι το Level 1 αποτελεί αριθμητικά σταθερό θεμέλιο για τα επόμενα επίπεδα. 
+ +\subsection{Demo και Αποτελέσματα} + +Η demonstration του Level 1 μπορεί να εκτελεστεί με: + +\begin{minted}{bash} + cd level_1 + python -m level_1 material/LicorDeCalandraca.wav material/LicorDeCalandraca_out_l1.wav +\end{minted} + +Η έξοδος που προέκυψε ήταν: + +\begin{minted}{text} + Encoding ...................... done + Decoding ...................... done + SNR = 257.437 dB +\end{minted} + +Η τιμή SNR άνω των 250 dB \textbf{υποδηλώνει πρακτικά αριθμητική ταυτότητα μεταξύ αρχικού και ανακατασκευασμένου σήματος}. +Η απόκλιση οφείλεται αποκλειστικά σε \textbf{αριθμητική ακρίβεια κινητής υποδιαστολής} και όχι σε δομικό σφάλμα της υλοποίησης. +Το αποτέλεσμα αυτό επιβεβαιώνει ότι η υλοποίηση του Filterbank και του SSC είναι συνεπής και πλήρως αναστρέψιμη. + + +\section{Level 2 – Temporal Noise Shaping} + +\subsection{Απαιτήσεις Level 2} + +Στο δεύτερο επίπεδο ζητείται η ενσωμάτωση της βαθμίδας \textbf{Temporal Noise Shaping (TNS)} στη ροή κωδικοποίησης. +Το TNS εφαρμόζεται στο πεδίο των συντελεστών MDCT και βασίζεται σε γραμμική πρόβλεψη. +Στόχος της βαθμίδας είναι η χρονική ανακατανομή του σφάλματος κβάντισης, ώστε το αντιληπτό σφάλμα να κατανέμεται χρονικά με πιο ευνοϊκό τρόπο. + +Στο Level 2 η διαδικασία παραμένει πλήρως αναστρέψιμη. +Η κβάντιση των συντελεστών πρόβλεψης γίνεται με συγκεκριμένο βήμα και σε περιορισμένο εύρος τιμών, ώστε να διασφαλίζεται η σταθερότητα του αντίστροφου φίλτρου. + +\subsection{Modules που χρησιμοποιούνται} + +Για το Level 2 χρησιμοποιούνται: + +\begin{itemize} + \item \textbf{\texttt{aac\_ssc}}. + \item \textbf{\texttt{aac\_filterbank}}. + \item \textbf{\texttt{aac\_tns}}. + \item \textbf{\texttt{aac\_coder\_2}} και \textbf{\texttt{aac\_decoder\_2}}. + \item \textbf{\texttt{aac\_utils}}. +\end{itemize} + +Το νέο στοιχείο σε σχέση με το Level 1 είναι το module \texttt{aac\_tns}, το οποίο εφαρμόζεται μετά τον MDCT και πριν από την αντίστροφη διαδικασία στον αποκωδικοποιητή. 
+ +\subsection{Υλοποίηση της βαθμίδας TNS} + +Για κάθε frame υπολογίζονται συντελεστές γραμμικής πρόβλεψης τάξης \texttt{PRED\_ORDER} (4). +Στην περίπτωση ESH, η διαδικασία εφαρμόζεται ανεξάρτητα σε κάθε ένα από τα 8 υπο-frames. +Οι συντελεστές πρόβλεψης κβαντίζονται με σταθερό βήμα \texttt{QUANT\_STEP} (0.1) και περιορίζονται στο διάστημα $[-\texttt{QUANT\_MAX}(-0.7), +\texttt{QUANT\_MAX}(+0.7)]$. + +Το φίλτρο που ορίζεται από τους συντελεστές $\{a_k\}$ έχει μορφή: + +\[ +H(z) = 1 - \sum_{k=1}^{p} a_k z^{-k}. +\] + +Στον κωδικοποιητή το φίλτρο εφαρμόζεται σε μορφή \textbf{FIR}. +Κάθε νέος συντελεστής MDCT προκύπτει αφαιρώντας γραμμικό συνδυασμό προηγούμενων συντελεστών. +Η επιλογή FIR μορφής εξασφαλίζει αριθμητική σταθερότητα, καθώς δεν υπάρχει ανατροφοδότηση. + +Στον αποκωδικοποιητή εφαρμόζεται το αντίστροφο φίλτρο: + +\[ +H^{-1}(z) = \frac{1}{1 - \sum_{k=1}^{p} a_k z^{-k}}, +\] + +το οποίο έχει μορφή \textbf{IIR}. +Η αντιστροφή υλοποιείται αναδρομικά, επαναφέροντας τους αρχικούς συντελεστές MDCT. +Η διαδικασία αυτή είναι πλήρως αναστρέψιμη υπό την προϋπόθεση ότι το IIR φίλτρο είναι σταθερό. + +Η σταθερότητα απαιτεί όλες οι ρίζες του πολυωνύμου: + +\[ +z^p - a_1 z^{p-1} - \dots - a_p = 0 +\] + +να βρίσκονται \textbf{εντός του μοναδιαίου κύκλου}. +Ακόμη και μικρή υπέρβαση της μοναδιαίας ακτίνας θα οδηγούσε σε εκθετική ενίσχυση κατά την αναδρομική εφαρμογή του φίλτρου και σε αριθμητική υπερχείλιση. + +\subsection{Φιλοσοφία Ελέγχου Ορθότητας} + +Όπως και στο Level 1, δεν υπάρχει απόλυτη μαθηματική απόδειξη ορθότητας. +Για τη βαθμίδα TNS ελέγχθηκαν: + +\begin{itemize} + \item Σωστά σχήματα εξόδου για long και ESH frames. + \item Κβάντιση πάνω στο επιθυμητό πλέγμα τιμών. + \item Περιορισμός συντελεστών στο επιτρεπτό εύρος. + \item Ρητός έλεγχος σταθερότητας μέσω υπολογισμού ριζών. + \item Ιδιότητα round-trip: $\text{iTNS}(\text{TNS}(X)) \approx X$. + \item Απουσία NaN/Inf σε τυχαίες και οριακές εισόδους. 
+\end{itemize} + +Ο έλεγχος σταθερότητας υλοποιείται αριθμητικά με την εξής λογική: +\begin{minted}{python} + poly = np.empty(p + 1) + poly[0] = 1.0 + poly[1:] = -a_q + roots = np.roots(poly) + margin = 1e-12 + assert np.all(np.abs(roots) < (1.0 - margin)) +\end{minted} + +Η ιδιότητα αντιστρεψιμότητας ελέγχεται μέσω round-trip ελέγχου: +\begin{minted}{python} + frame_F_tns, coeffs = aac_tns(frame_F_in, frame_type) + frame_F_hat = aac_i_tns(frame_F_tns, frame_type, coeffs) + np.testing.assert_allclose(frame_F_hat, frame_F_in) +\end{minted} + +Οι παραπάνω έλεγχοι δεν αποδεικνύουν μαθηματικά την ορθότητα, αλλά διασφαλίζουν ότι το ζεύγος FIR/IIR λειτουργεί ως ακριβές αντίστροφο εντός αριθμητικής ακρίβειας και ότι η κβάντιση δεν εισάγει αστάθεια. + +Τα ακριβή tests μπορούν να εκτελεστούν με: + +\begin{minted}{bash} + pytest -v core/tests/test_tns.py +\end{minted} + +\subsection{Demo και Αποτελέσματα} + +Η demonstration του Level 2 εκτελείται με: + +\begin{minted}{bash} + cd level_2 + python -m level_2 material/LicorDeCalandraca.wav material/LicorDeCalandraca_out_l2.wav +\end{minted} + +Η έξοδος που προέκυψε ήταν: + +\begin{minted}{text} + Encoding ...................... done + Decoding ...................... done + SNR = 257.437 dB +\end{minted} + +Η τιμή SNR είναι πρακτικά ταυτόσημη με εκείνη του Level 1. +Το αποτέλεσμα αυτό επιβεβαιώνει ότι η ενσωμάτωση του TNS \textbf{δεν εισάγει απώλεια πληροφορίας στο συγκεκριμένο στάδιο}. +Η υλοποίηση του FIR/IIR ζεύγους και ο έλεγχος σταθερότητας εξασφαλίζουν \textbf{πλήρη αναστρεψιμότητα} της διαδικασίας. + + + +\section{Level 3 – Πλήρης Απωλεστική Κωδικοποίηση} + +\subsection{Απαιτήσεις Level 3} + +Στο τρίτο επίπεδο ζητείται η υλοποίηση της \textbf{πλήρους απωλεστικής αλυσίδας κωδικοποίησης}. +Προστίθενται το ψυχοακουστικό μοντέλο, ο μη ομοιόμορφος κβαντιστής και η κωδικοποίηση Huffman. +Σε αντίθεση με τα προηγούμενα επίπεδα, το Level 3 \textbf{δεν είναι πλέον πλήρως αναστρέψιμο}. 
+Η απώλεια πληροφορίας είναι συνειδητή και ελεγχόμενη, με βάση τις αρχές της ψυχοακουστικής απόκρυψης. + +Στόχος είναι η μείωση του bitrate διατηρώντας αποδεκτή αντιληπτή ποιότητα. + +\subsection{Modules που χρησιμοποιούνται} + +Για το Level 3 χρησιμοποιούνται όλα τα modules του συστήματος: + +\begin{itemize} + \item \textbf{\texttt{aac\_ssc}}. + \item \textbf{\texttt{aac\_filterbank}}. + \item \textbf{\texttt{aac\_tns}}. + \item \textbf{\texttt{aac\_psycho}}. + \item \textbf{\texttt{aac\_quantizer}}. + \item \textbf{\texttt{aac\_huffman}}. + \item \textbf{\texttt{aac\_coder\_3}} και \textbf{\texttt{aac\_decoder\_3}}. + \item \textbf{\texttt{aac\_utils}}. +\end{itemize} + +Το Level 3 αποτελεί σύνθεση όλων των προηγούμενων βαθμίδων με προσθήκη κβάντισης και κωδικοποίησης εντροπίας. + +\subsection{Ψυχοακουστικό Μοντέλο} + +Το module \texttt{aac\_psycho} υπολογίζει το Signal-to-Mask Ratio (SMR) για κάθε Bark band. +Η διαδικασία βασίζεται στους πίνακες B219a και B219b για long και short frames αντίστοιχα. +Υπολογίζεται η ενέργεια ανά band, εφαρμόζεται spreading function και κατώφλι απόλυτης ακουστότητας. +Το SMR καθορίζει το επιτρεπτό σφάλμα κβάντισης σε κάθε band. + +Δόθηκε ιδιαίτερη προσοχή: + +\begin{itemize} + \item Στην αποφυγή διαίρεσης με μηδέν μέσω χρήσης μικρού όρου \texttt{EPS}. + \item Στον έλεγχο ότι όλες οι ενδιάμεσες ποσότητες παραμένουν πεπερασμένες. +\end{itemize} + +\subsection{Κβαντιστής και Scalefactors} + +Το module \texttt{aac\_quantizer} εφαρμόζει μη ομοιόμορφη κβάντιση των συντελεστών MDCT. +Για κάθε band επιλέγεται scalefactor που ικανοποιεί το αντίστοιχο SMR. +Οι scalefactors κωδικοποιούνται διαφορικά (DPCM). +Η ανακατασκευή γίνεται αθροιστικά και εφαρμόζεται εκθετικός παράγοντας της μορφής: + +\[ +\hat{X} = \text{sign}(S) |S|^{4/3} \times 2^{\alpha/4}. +\] + +Η συγκεκριμένη μορφή καθιστά το σύστημα \textbf{ιδιαίτερα ευαίσθητο σε αριθμητικά σφάλματα}. 
+Για τον λόγο αυτό χρησιμοποιήθηκαν: + +\begin{itemize} + \item Έλεγχοι πεπερασμένων τιμών (finite checks). + \item Έλεγχος αποφυγής overflow. + \item Προστασία με \texttt{EPS} σε παρονομαστές. +\end{itemize} + +\subsection{Κωδικοποίηση Huffman} + +Οι κβαντισμένοι συντελεστές και τα DPCM scalefactors κωδικοποιούνται με Huffman. +Το bitstream ανακατασκευάζεται πλήρως στον αποκωδικοποιητή. +Ιδιαίτερη προσοχή δόθηκε: + +\begin{itemize} + \item Στη σωστή \textbf{διαχείριση escape τιμών}. + \item Στην ορθή \textbf{κατανάλωση bitstreams}. + \item Στην \textbf{αποφυγή out-of-bounds} προσπελάσεων. +\end{itemize} + +\subsection{Φιλοσοφία Ελέγχου Ορθότητας} + +Το Level 3 \textbf{δεν μπορεί να ελεγχθεί με κριτήριο ταυτότητας}. +Ελέγχθηκε με κριτήρια λειτουργικής ορθότητας και αριθμητικής σταθερότητας. + +Συγκεκριμένα ελέγχθηκαν: + +\begin{itemize} + \item Ορθότητα DPCM ανακατασκευής scalefactors. + \item Συνεπής αντιστοίχιση ESH packing και unpacking. + \item Απουσία NaN/Inf σε όλη τη ροή. + \item Απουσία εκθετικής υπερχείλισης. + \item Έλεγχος μηδενικού lag μεταξύ αρχικού και ανακατασκευασμένου σήματος. + \item Έλεγχος μη ανταλλαγής καναλιών (L/R swap detection). + \item Έλεγχος απουσίας clipping. + \item Έλεγχος διατήρησης συνολικού gain. +\end{itemize} + +Ο \textbf{έλεγχος lag} υλοποιείται μέσω εκτίμησης χρονικής μετατόπισης. +Ο έλεγχος καναλιών διασφαλίζει ότι η \textbf{ενέργεια του αριστερού και δεξιού καναλιού δεν έχει ανταλλαγεί}. +Οι έλεγχοι αυτοί διασφαλίζουν ότι η απώλεια ποιότητας οφείλεται αποκλειστικά στην κβάντιση και όχι σε δομικό σφάλμα. 
+ +Τα tests μπορούν να εκτελεστούν με: + +\begin{minted}{bash} + pytest -v core/tests/test_psycho.py + pytest -v core/tests/test_quantizer.py + pytest -v core/tests/test_huffman.py + pytest -v core/tests/test_aac_coder_decoder.py +\end{minted} + +\subsection{Demo και Αποτελέσματα} + +Η demonstration του Level 3 εκτελείται με: + +\begin{minted}{bash} + cd level_3 + python -m level_3 material/LicorDeCalandraca.wav material/LicorDeCalandraca_out_l3.wav material/aac_seq_3.mat +\end{minted} + +Η έξοδος που προέκυψε ήταν: + +\begin{minted}{text} + Storing coded sequence to material/aac_seq_3.mat + Encoding ...................... done + Decoding ...................... done + SNR = 9.454 dB + Bitrate (coded) = 216710.22 bits/s + Compression ratio = 7.0881 +\end{minted} + +Παρατηρούμε πως η τιμή SNR είναι σημαντικά χαμηλότερη σε σχέση με τα προηγούμενα επίπεδα, γεγονός αναμενόμενο λόγω απωλεστικής κβάντισης. +Παρά τη μείωση του SNR, ελέγχθηκε ότι: + +\begin{itemize} + \item Δεν υπάρχει χρονική μετατόπιση (lag). + \item Δεν υπάρχει ανταλλαγή καναλιών. + \item Δεν εμφανίζεται clipping. + \item Δεν υπάρχει μη ελεγχόμενη ενίσχυση (gain drift). +\end{itemize} + +\begin{figure}[!ht] + \centering + \includegraphics[width=0.75\linewidth]{figures/bitrate_per_frame.png} + \caption{Κατανομή του bitrate ανά frame. + Παρατηρείται σημαντική διακύμανση που αντανακλά την προσαρμοστική φύση της κωδικοποίησης.} + \label{fig:bitrate_per_frame} +\end{figure} + +\begin{figure}[!ht] + \centering + \includegraphics[width=0.75\linewidth]{figures/compression_per_frame.png} + \caption{Λόγος συμπίεσης ανά frame. + Η μεταβολή του compression ratio επιβεβαιώνει τη δυναμική κατανομή bit ανάλογα με το περιεχόμενο του σήματος.} + \label{fig:compression_per_frame} +\end{figure} + +Η χαμηλή τιμή SNR δεν αντικατοπτρίζει πλήρως την αντιληπτή ποιότητα, καθώς το ψυχοακουστικό μοντέλο επιτρέπει μεγάλο ενεργειακό σφάλμα σε περιοχές όπου το σφάλμα καλύπτεται από το φαινόμενο απόκρυψης. 
+Η παρατηρούμενη απώλεια αφορά κυρίως λεπτομέρειες υψηλών συχνοτήτων, χωρίς δομική αλλοίωση του σήματος. +Το αποτέλεσμα επιβεβαιώνει ότι η υλοποίηση είναι αριθμητικά σταθερή και ότι η απώλεια ποιότητας είναι αποτέλεσμα ελεγχόμενης κβάντισης και όχι υλοποιητικού σφάλματος. + +Όπως φαίνεται στην Εικόνα~\ref{fig:bitrate_per_frame}, παρατηρείται σημαντική διακύμανση του bitrate μεταξύ διαδοχικών frames, με τιμές που κυμαίνονται περίπου \textbf{μεταξύ 160 kbps και 250 kbps} και σποραδικές κορυφές υψηλότερων τιμών. +Η συμπεριφορά αυτή είναι αναμενόμενη, καθώς το ψυχοακουστικό μοντέλο κατανέμει περισσότερα bits σε χρονικές περιοχές με αυξημένη ενεργειακή ή φασματική πολυπλοκότητα. +Αντίστοιχα, στην Εικόνα~\ref{fig:compression_per_frame} παρατηρούμε ότι ο λόγος συμπίεσης μεταβάλλεται δυναμικά, με \textbf{τυπικές τιμές μεταξύ 6:1 και 9:1}. + +Οι χαμηλότερες τιμές λόγου συμπίεσης αντιστοιχούν σε frames όπου απαιτούνται περισσότερα bits για την ικανοποίηση του SMR, ενώ οι υψηλότερες τιμές εμφανίζονται σε περιοχές μικρότερης φασματικής πυκνότητας. +Η δυναμική αυτή συμπεριφορά επιβεβαιώνει ότι \textbf{η κωδικοποίηση δεν είναι σταθερού ρυθμού} (CBR), αλλά προσαρμοστική ως προς το περιεχόμενο του σήματος. +Η συνολική μέση τιμή συμπίεσης περίπου \textbf{7:1} επιτυγχάνεται μέσω της συνδυασμένης λειτουργίας ψυχοακουστικής κβάντισης και κωδικοποίησης εντροπίας, χωρίς να παρατηρούνται δομικά σφάλματα όπως χρονική μετατόπιση, ανταλλαγή καναλιών ή ανεξέλεγκτη ενίσχυση. + + +\section{Συμπεράσματα} + +Στην παρούσα εργασία υλοποιήσαμε μία απλοποιημένη αλλά πλήρως λειτουργική αλυσίδα κωδικοποίησης και αποκωδικοποίησης τύπου AAC, ακολουθώντας σταδιακή προσέγγιση τριών επιπέδων. +Ξεκινήσαμε από μία αυστηρά αναστρέψιμη μετασχηματιστική κωδικοποίηση με MDCT και Sequence Segmentation Control, επεκτείναμε το σύστημα με Temporal Noise Shaping διατηρώντας την αριθμητική αντιστρεψιμότητα και ολοκληρώσαμε με την ενσωμάτωση ψυχοακουστικού μοντέλου, μη ομοιόμορφης κβάντισης και κωδικοποίησης εντροπίας. 
Η \textbf{αρθρωτή σχεδίαση} και η \textbf{εκτεταμένη χρήση αυτοματοποιημένων δοκιμών μας επέτρεψαν να ελέγχουμε ανεξάρτητα κάθε βαθμίδα και να περιορίζουμε συστηματικά την πιθανότητα σφαλμάτων}. + +Δώσαμε ιδιαίτερη έμφαση στην \textbf{αριθμητική σταθερότητα} του συστήματος. +Εφαρμόσαμε ελέγχους σταθερότητας φίλτρων, προστασίες τύπου \texttt{EPS} σε παρονομαστές, ελέγχους πεπερασμένων τιμών, καθώς και ελέγχους απουσίας χρονικής μετατόπισης, ανταλλαγής καναλιών και clipping. +Η φιλοσοφία αυτή αποδείχθηκε κρίσιμη στο Level 3, όπου μικρά αριθμητικά σφάλματα μπορούν να ενισχυθούν εκθετικά μέσω της απο-κβάντισης. +Η συστηματική προσέγγιση testing και οι round-trip έλεγχοι διασφάλισαν ότι η παρατηρούμενη απώλεια ποιότητας οφείλεται αποκλειστικά στη σχεδιασμένη κβάντιση και όχι σε υλοποιητικά σφάλματα. + +Τα αποτελέσματα έδειξαν ότι το σύστημα επιτυγχάνει \textbf{λόγο συμπίεσης περίπου 7:1}, με μέσο \textbf{bitrate περίπου 216 kbps}, διατηρώντας αποδεκτή αντιληπτή ποιότητα. +Η ανάλυση ανά frame κατέδειξε ότι η κατανομή bit είναι δυναμική και εξαρτάται από τη φασματική πολυπλοκότητα του σήματος, γεγονός που επιβεβαιώνει τη σωστή λειτουργία του ψυχοακουστικού μοντέλου. +Η σημαντική πτώση του SNR στο Level 3 είναι αναμενόμενη σε ενεργειακούς όρους και δεν αντανακλά πλήρως την αντιληπτή ποιότητα, καθώς το σφάλμα κατανέμεται σύμφωνα με τα φαινόμενα απόκρυψης. +Συνολικά, η υλοποίηση επιβεβαιώνει ότι ακόμη και ένα απλοποιημένο μοντέλο AAC μπορεί να επιτύχει ουσιαστική συμπίεση, εφόσον η σχεδίαση είναι προσεκτική και αριθμητικά συνεπής. 
+ +\end{document} \ No newline at end of file diff --git a/source/core/aac_coder.py b/source/core/aac_coder.py index 07034e6..5987712 100644 --- a/source/core/aac_coder.py +++ b/source/core/aac_coder.py @@ -111,21 +111,6 @@ def _thresholds_from_smr( return T -def _normalize_global_gain(G: GlobalGain) -> float | FloatArray: - """ - Normalize GlobalGain to match AACChannelFrameF3["G"] type: - - long: return float - - ESH: return float64 ndarray of shape (1, 8) - """ - if np.isscalar(G): - return float(G) - - G_arr = np.asarray(G) - if G_arr.size == 1: - return float(G_arr.reshape(-1)[0]) - - return np.asarray(G_arr, dtype=np.float64) - # ----------------------------------------------------------------------------- # Public helpers (useful for level_x demo wrappers) # ----------------------------------------------------------------------------- @@ -476,10 +461,6 @@ def aac_coder_3( S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L) S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R) - # Normalize G types for AACSeq3 schema (float | float64 ndarray). - G_Ln = _normalize_global_gain(G_L) - G_Rn = _normalize_global_gain(G_R) - # Huffman-code ONLY the DPCM differences for b>0. # sfc[0] corresponds to alpha(0)=G and is stored separately in the frame. sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] 
@@ -503,7 +484,7 @@ def aac_coder_3( "chl": { "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64), - "G": G_Ln, + "G": G_L, "sfc": sfc_L_stream, "stream": mdct_L_stream, "codebook": int(cb_L), @@ -511,7 +492,7 @@ def aac_coder_3( "chr": { "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64), "T": np.asarray(T_R, dtype=np.float64), - "G": G_Rn, + "G": G_R, "sfc": sfc_R_stream, "stream": mdct_R_stream, "codebook": int(cb_R), diff --git a/source/core/aac_decoder.py b/source/core/aac_decoder.py index be705df..378294e 100644 --- a/source/core/aac_decoder.py +++ b/source/core/aac_decoder.py @@ -42,7 +42,7 @@ def _nbands(frame_type: FrameType) -> int: # Public helpers # ----------------------------------------------------------------------------- -def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: +def aac_unpack_seq_channels(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: """ Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo FrameF container expected by aac_i_filter_bank(). 
@@ -167,7 +167,7 @@ def aac_decoder_1( chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) - frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f) + frame_f: FrameF = aac_unpack_seq_channels(frame_type, chl_f, chr_f) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) start = i * hop @@ -427,7 +427,7 @@ def aac_decoder_3( X_R = aac_i_tns(Xq_R, frame_type, tns_R) # Re-pack to stereo container and inverse filterbank - frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R)) + frame_f = aac_unpack_seq_channels(frame_type, np.asarray(X_L), np.asarray(X_R)) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) start = i * hop diff --git a/source/core/aac_psycho.py b/source/core/aac_psycho.py index 4ecaafc..8069e45 100644 --- a/source/core/aac_psycho.py +++ b/source/core/aac_psycho.py @@ -189,7 +189,7 @@ def _predictability( # Band-domain aggregation # ----------------------------------------------------------------------------- -def _band_energy_and_weighted_predictability( +def _band_energy_and_pred( r: FloatArray, c: FloatArray, wlow: BandIndexArray, @@ -238,7 +238,7 @@ def _band_energy_and_weighted_predictability( return e_b, c_num_b -def _psycho_one_window( +def _psycho_window( time_x: FrameChannelT, prev1_x: FrameChannelT, prev2_x: FrameChannelT, @@ -288,7 +288,7 @@ def _psycho_one_window( c_w = _predictability(r, phi, r_m1, phi_m1, r_m2, phi_m2) # Aggregate into psycho bands - e_b, c_num_b = _band_energy_and_weighted_predictability(r, c_w, wlow, whigh) + e_b, c_num_b = _band_energy_and_pred(r, c_w, wlow, whigh) # Spread energies and predictability across bands: # ecb(b) = sum_bb e(bb) * S(bb, b) @@ -333,7 +333,7 @@ def _psycho_one_window( # ESH window slicing (match filterbank conventions) # ----------------------------------------------------------------------------- -def 
_esh_subframes_256(x_2048: FrameChannelT) -> list[FrameChannelT]: +def _esh_subframes(x_2048: FrameChannelT) -> list[FrameChannelT]: """ Extract the 8 overlapping 256-sample short windows used by AAC ESH. @@ -408,7 +408,7 @@ def aac_psycho( # Long frame types: compute one SMR vector (69 bands) if frame_type != "ESH": - return _psycho_one_window(frame_T, frame_T_prev_1, frame_T_prev_2, N=N, table=table) + return _psycho_window(frame_T, frame_T_prev_1, frame_T_prev_2, N=N, table=table) # ESH: compute 8 SMR vectors (42 bands each), one per short subframe. # @@ -419,8 +419,8 @@ def aac_psycho( # # This matches the "within-frame history" convention commonly used in # simplified psycho models for ESH. - cur_subs = _esh_subframes_256(frame_T) - prev1_subs = _esh_subframes_256(frame_T_prev_1) + cur_subs = _esh_subframes(frame_T) + prev1_subs = _esh_subframes(frame_T_prev_1) B = int(table.shape[0]) # expected 42 smr_out = np.zeros((B, 8), dtype=np.float64) @@ -436,6 +436,6 @@ def aac_psycho( x_m1 = cur_subs[j - 1] x_m2 = cur_subs[j - 2] - smr_out[:, j] = _psycho_one_window(cur_subs[j], x_m1, x_m2, N=256, table=table) + smr_out[:, j] = _psycho_window(cur_subs[j], x_m1, x_m2, N=256, table=table) return smr_out diff --git a/source/core/aac_quantizer.py b/source/core/aac_quantizer.py index 5bc8526..f2004b0 100644 --- a/source/core/aac_quantizer.py +++ b/source/core/aac_quantizer.py @@ -35,7 +35,7 @@ MAX_SF_DELTA:int = 60 # ----------------------------------------------------------------------------- # Helpers: ESH packing/unpacking (128x8 <-> 1024x1) # ----------------------------------------------------------------------------- -def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray: +def _esh_pack(x_128x8: FloatArray) -> FloatArray: """ Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1). 
@@ -58,7 +58,7 @@ def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray: return x_128x8.reshape(1024, 1, order="F") -def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray: +def _esh_unpack(x_1024x1: FloatArray) -> FloatArray: """ Unpack a packed ESH vector (1024 elements) back to shape (128, 8). @@ -226,7 +226,7 @@ def _band_energy(x: FloatArray, lo: int, hi: int) -> float: return float(np.sum(sec * sec)) -def _threshold_T_from_SMR( +def _psychoacoustic_threshold( X: FloatArray, SMR_col: FloatArray, bands: list[tuple[int, int]], @@ -425,7 +425,7 @@ def aac_quantizer( SMRj = SMR[:, j].reshape(NB) # Compute psychoacoustic threshold T(b) for this subframe - T = _threshold_T_from_SMR(Xj, SMRj, bands) + T = _psychoacoustic_threshold(Xj, SMRj, bands) # Frame-wise initial estimate alpha_hat (Equation 14) alpha_hat = _initial_alpha_hat(Xj) @@ -482,7 +482,7 @@ def aac_quantizer( raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).") # Compute psychoacoustic threshold T(b) for the long frame - T = _threshold_T_from_SMR(Xv, SMRv, bands) + T = _psychoacoustic_threshold(Xv, SMRv, bands) # Frame-wise initial estimate alpha_hat (Equation 14) alpha_hat = _initial_alpha_hat(Xv) @@ -566,7 +566,7 @@ def aac_i_quantizer( if sfc.shape != (NB, 8): raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).") - S_128x8 = _esh_unpack_from_1024(S_flat) + S_128x8 = _esh_unpack(S_flat) Xrec = np.zeros((128, 8), dtype=np.float64) diff --git a/source/core/aac_tns.py b/source/core/aac_tns.py index 06227b1..4ba78ac 100644 --- a/source/core/aac_tns.py +++ b/source/core/aac_tns.py @@ -39,7 +39,7 @@ from core.aac_types import * # ----------------------------------------------------------------------------- -def _band_ranges_for_kcount(k_count: int) -> BandRanges: +def _band_ranges(k_count: int) -> BandRanges: """ Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count. 
@@ -66,7 +66,7 @@ def _band_ranges_for_kcount(k_count: int) -> BandRanges: start = tbl[:, 1].astype(int) end = tbl[:, 2].astype(int) - ranges: list[tuple[int, int]] = [(int(s), int(e)) for s, e in zip(start, end)] + ranges: BandRanges = [(int(s), int(e)) for s, e in zip(start, end)] for s, e in ranges: if s < 0 or e < s or e >= k_count: @@ -117,7 +117,7 @@ def _compute_sw(x: MdctCoeffs) -> MdctCoeffs: x = np.asarray(x, dtype=np.float64).reshape(-1) k_count = int(x.shape[0]) - bands = _band_ranges_for_kcount(k_count) + bands = _band_ranges(k_count) sw = np.zeros(k_count, dtype=np.float64) for s, e in bands: @@ -347,7 +347,7 @@ def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: return x_hat -def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: +def _tns_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: """ TNS for a single MDCT vector (one long frame or one short subframe). @@ -430,7 +430,7 @@ def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChan a_out = np.empty((PRED_ORDER, 8), dtype=np.float64) for j in range(8): - y[:, j], a_out[:, j] = _tns_one_vector(x[:, j]) + y[:, j], a_out[:, j] = _tns_vector(x[:, j]) return y, a_out @@ -443,7 +443,7 @@ def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChan else: raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') - y_vec, a_q = _tns_one_vector(x_vec) + y_vec, a_q = _tns_vector(x_vec) if out_shape == (1024,): y_out = y_vec diff --git a/source/core/aac_utils.py b/source/core/aac_utils.py index 0e18ed1..26cdbdf 100644 --- a/source/core/aac_utils.py +++ b/source/core/aac_utils.py @@ -163,6 +163,42 @@ def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: return float(10.0 * np.log10(ps / pn)) +def estimate_lag_mono(x_ref: TimeSignal, x_hat: TimeSignal, max_lag=4096): + """ + Estimate time lag between two mono signals. + Returns lag (positive means x_hat delayed). 
+ """ + n = min(len(x_ref), len(x_hat)) + x_ref = x_ref[:n] + x_hat = x_hat[:n] + + corr = np.correlate(x_ref, x_hat, mode='full') + lags = np.arange(-n + 1, n) + + center = n - 1 + lo = max(0, center - max_lag) + hi = min(len(corr), center + max_lag + 1) + + best = lo + int(np.argmax(corr[lo:hi])) + return int(lags[best]) + + +def match_gain(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Least-squares gain g that best maps x_hat -> x_ref. + """ + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) + + r = x_ref[:n, :c].reshape(-1).astype(np.float64) + h = x_hat[:n, :c].reshape(-1).astype(np.float64) + + denom = float(np.dot(h, h)) + if denom <= 0.0: + return 1.0 + return float(np.dot(r, h) / denom) + + # ----------------------------------------------------------------------------- # Psychoacoustic band tables (TableB219.mat) # ----------------------------------------------------------------------------- diff --git a/source/core/tests/test_aac_coder_decoder.py b/source/core/tests/test_aac_coder_decoder.py index 54207fc..18d54d7 100644 --- a/source/core/tests/test_aac_coder_decoder.py +++ b/source/core/tests/test_aac_coder_decoder.py @@ -21,7 +21,7 @@ import soundfile as sf from core.aac_coder import aac_coder_1, aac_coder_2, aac_coder_3, aac_read_wav_stereo_48k from core.aac_decoder import aac_decoder_1, aac_decoder_2, aac_decoder_3, aac_remove_padding -from core.aac_utils import snr_db +from core.aac_utils import snr_db, estimate_lag_mono, match_gain from core.aac_types import * @@ -151,6 +151,29 @@ def test_aac_coder_seq_schema_and_shapes(mk_random_stereo_wav: Path) -> None: assert chl_f.shape == (1024, 1) assert chr_f.shape == (1024, 1) +@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True) +def test_level_1_gain_close_to_one( + mk_actual_stereo_wav: Path, + tmp_path: Path, +) -> None: + """ + Guardrail: decoded signal should not have a large global gain mismatch. 
+ """ + x_ref, fs = aac_read_wav_stereo_48k(mk_actual_stereo_wav) + assert int(fs) == 48000 + + out_wav = tmp_path / "decoded_level3.wav" + aac_seq_1: AACSeq1 = aac_coder_1(mk_actual_stereo_wav) + y_hat: StereoSignal = aac_decoder_1(aac_seq_1, out_wav) + + n = min(x_ref.shape[0], y_hat.shape[0]) + x_ref = x_ref[:n, :] + y_hat = y_hat[:n, :] + + g = match_gain(x_ref, y_hat) + # print (f"g = {g}") + # Allow some slack but catch big scaling regressions + assert 0.75 <= g <= 1.25 @pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True) def test_end_to_end_aac_coder_decoder_high_snr(mk_random_stereo_wav: Path, tmp_path: Path) -> None: @@ -207,6 +230,30 @@ def test_aac_coder_2_seq_schema_and_shapes(mk_random_stereo_wav: Path) -> None: assert coeffs.shape == (4, 1) +@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True) +def test_level_2_gain_close_to_one( + mk_actual_stereo_wav: Path, + tmp_path: Path, +) -> None: + """ + Guardrail: decoded signal should not have a large global gain mismatch. 
+ """ + x_ref, fs = aac_read_wav_stereo_48k(mk_actual_stereo_wav) + assert int(fs) == 48000 + + out_wav = tmp_path / "decoded_level3.wav" + aac_seq_2: AACSeq2 = aac_coder_2(mk_actual_stereo_wav) + y_hat: StereoSignal = aac_decoder_2(aac_seq_2, out_wav) + + n = min(x_ref.shape[0], y_hat.shape[0]) + x_ref = x_ref[:n, :] + y_hat = y_hat[:n, :] + + g = match_gain(x_ref, y_hat) + # print (f"g = {g}") + # Allow some slack but catch big scaling regressions + assert 0.75 <= g <= 1.25 + @pytest.mark.parametrize("mk_random_stereo_wav", [0.5], indirect=True) def test_end_to_end_level_2_high_snr(mk_random_stereo_wav: Path, tmp_path: Path) -> None: x_ref, fs = sf.read(str(mk_random_stereo_wav), always_2d=True) @@ -294,6 +341,90 @@ def test_aac_coder_3_seq_schema_and_shapes(mk_actual_stereo_wav: Path) -> None: else: assert np.isscalar(G) +@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True) +def test_level_3_estimated_lag( + mk_actual_stereo_wav: Path, + tmp_path: Path, +) -> None: + """ + Check that the estimated lag between reference and decoded + signal is small (zero), to prevent catastrophic misalignment. + """ + x_ref, fs = aac_read_wav_stereo_48k(mk_actual_stereo_wav) + assert int(fs) == 48000 + + out_wav = tmp_path / "decoded_level3.wav" + + aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav) + y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav) + + # Use only common length + n = min(x_ref.shape[0], y_hat.shape[0]) + x_ref = x_ref[:n, :] + y_hat = y_hat[:n, :] + + lag_L = estimate_lag_mono(x_ref[:, 0], y_hat[:, 0], max_lag=4096) + lag_R = estimate_lag_mono(x_ref[:, 1], y_hat[:, 1], max_lag=4096) + + # We allow zero latency + assert abs(lag_L) == 0 + assert abs(lag_R) == 0 + + +@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True) +def test_level_3_not_lr_swapped( + mk_actual_stereo_wav: Path, + tmp_path: Path, +) -> None: + """ + Ensure the decoded stereo channels are not swapped. 
+ If channels are swapped, SNR against the original drops a lot, + while SNR against swapped reference increases. + """ + x_ref, fs = aac_read_wav_stereo_48k(mk_actual_stereo_wav) + assert int(fs) == 48000 + + out_wav = tmp_path / "decoded_level3.wav" + + aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav) + y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav) + + n = min(x_ref.shape[0], y_hat.shape[0]) + x_ref = x_ref[:n, :] + y_hat = y_hat[:n, :] + + snr_normal = snr_db(x_ref, y_hat) + snr_swapped = snr_db(x_ref[:, [1, 0]], y_hat) + + # If the decoded output were swapped, snr_swapped would be significantly higher. + assert snr_normal >= snr_swapped + + + +@pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True) +def test_level_3_gain_close_to_one( + mk_actual_stereo_wav: Path, + tmp_path: Path, +) -> None: + """ + Guardrail: decoded signal should not have a large global gain mismatch. + """ + x_ref, fs = aac_read_wav_stereo_48k(mk_actual_stereo_wav) + assert int(fs) == 48000 + + out_wav = tmp_path / "decoded_level3.wav" + aac_seq_3: AACSeq3 = aac_coder_3(mk_actual_stereo_wav) + y_hat: StereoSignal = aac_decoder_3(aac_seq_3, out_wav) + + n = min(x_ref.shape[0], y_hat.shape[0]) + x_ref = x_ref[:n, :] + y_hat = y_hat[:n, :] + + g = match_gain(x_ref, y_hat) + + # Allow some slack but catch big scaling regressions + assert 0.75 <= g <= 1.25 + @pytest.mark.parametrize("mk_actual_stereo_wav", [0.5], indirect=True) def test_end_to_end_level_3_high_snr(mk_actual_stereo_wav: Path, tmp_path: Path) -> None: @@ -310,5 +441,4 @@ def test_end_to_end_level_3_high_snr(mk_actual_stereo_wav: Path, tmp_path: Path) n = min(x_ref.shape[0], y_hat.shape[0]) s = snr_db(x_ref[:n, :], y_hat[:n, :]) - # print(f" with SNR={s}") - assert s > 7.0 + assert s > 8.0 diff --git a/source/level_1/core/aac_coder.py b/source/level_1/core/aac_coder.py index 07034e6..5987712 100644 --- a/source/level_1/core/aac_coder.py +++ b/source/level_1/core/aac_coder.py @@ -111,21 +111,6 @@ def 
_thresholds_from_smr( return T -def _normalize_global_gain(G: GlobalGain) -> float | FloatArray: - """ - Normalize GlobalGain to match AACChannelFrameF3["G"] type: - - long: return float - - ESH: return float64 ndarray of shape (1, 8) - """ - if np.isscalar(G): - return float(G) - - G_arr = np.asarray(G) - if G_arr.size == 1: - return float(G_arr.reshape(-1)[0]) - - return np.asarray(G_arr, dtype=np.float64) - # ----------------------------------------------------------------------------- # Public helpers (useful for level_x demo wrappers) # ----------------------------------------------------------------------------- @@ -476,10 +461,6 @@ def aac_coder_3( S_L, sfc_L, G_L = aac_quantizer(chl_f_tns, frame_type, SMR_L) S_R, sfc_R, G_R = aac_quantizer(chr_f_tns, frame_type, SMR_R) - # Normalize G types for AACSeq3 schema (float | float64 ndarray). - G_Ln = _normalize_global_gain(G_L) - G_Rn = _normalize_global_gain(G_R) - # Huffman-code ONLY the DPCM differences for b>0. # sfc[0] corresponds to alpha(0)=G and is stored separately in the frame. sfc_L_dpcm = np.asarray(sfc_L, dtype=np.int64)[1:, ...] 
@@ -503,7 +484,7 @@ def aac_coder_3( "chl": { "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64), - "G": G_Ln, + "G": G_L, "sfc": sfc_L_stream, "stream": mdct_L_stream, "codebook": int(cb_L), @@ -511,7 +492,7 @@ def aac_coder_3( "chr": { "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64), "T": np.asarray(T_R, dtype=np.float64), - "G": G_Rn, + "G": G_R, "sfc": sfc_R_stream, "stream": mdct_R_stream, "codebook": int(cb_R), diff --git a/source/level_1/core/aac_decoder.py b/source/level_1/core/aac_decoder.py index be705df..378294e 100644 --- a/source/level_1/core/aac_decoder.py +++ b/source/level_1/core/aac_decoder.py @@ -42,7 +42,7 @@ def _nbands(frame_type: FrameType) -> int: # Public helpers # ----------------------------------------------------------------------------- -def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: +def aac_unpack_seq_channels(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: """ Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo FrameF container expected by aac_i_filter_bank(). 
@@ -167,7 +167,7 @@ def aac_decoder_1( chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) - frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f) + frame_f: FrameF = aac_unpack_seq_channels(frame_type, chl_f, chr_f) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) start = i * hop @@ -427,7 +427,7 @@ def aac_decoder_3( X_R = aac_i_tns(Xq_R, frame_type, tns_R) # Re-pack to stereo container and inverse filterbank - frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R)) + frame_f = aac_unpack_seq_channels(frame_type, np.asarray(X_L), np.asarray(X_R)) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) start = i * hop diff --git a/source/level_1/core/aac_huffman.py b/source/level_1/core/aac_huffman.py new file mode 100644 index 0000000..3d2eeca --- /dev/null +++ b/source/level_1/core/aac_huffman.py @@ -0,0 +1,112 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Huffman wrappers (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Thin wrappers around the provided Huffman utilities (material/huff_utils.py) +# so that the API matches the assignment text. +# +# Exposed API (assignment): +# huff_sec, huff_codebook = aac_encode_huff(coeff_sec, huff_LUT_list, force_codebook) +# dec_coeffs = aac_decode_huff(huff_sec, huff_codebook, huff_LUT_list) +# +# Notes: +# - Huffman coding operates on tuples. Therefore, decode(encode(x)) may return +# extra trailing symbols due to tuple padding. The AAC decoder knows the +# true section length from side information (band limits) and truncates. 
+# ------------------------------------------------------------ +from __future__ import annotations + +from typing import Any +import numpy as np + +from material.huff_utils import encode_huff, decode_huff + + +def aac_encode_huff( + coeff_sec: np.ndarray, + huff_LUT_list: list[dict[str, Any]], + force_codebook: int | None = None, +) -> tuple[str, int]: + """ + Huffman-encode a section of coefficients (MDCT symbols or scalefactors). + + Parameters + ---------- + coeff_sec : np.ndarray + Coefficient section to be encoded. Any shape is accepted; the input + is flattened and treated as a 1-D sequence of int64 symbols. + huff_LUT_list : list[dict[str, Any]] + List of Huffman Look-Up Tables (LUTs) as returned by material.load_LUT(). + Index corresponds to codebook id (typically 1..11, with 0 reserved). + force_codebook : int | None + If provided, forces the use of this Huffman codebook. In the assignment, + scalefactors are encoded with codebook 11. For MDCT coefficients, this + argument is usually omitted (auto-selection). + + Returns + ------- + tuple[str, int] + (huff_sec, huff_codebook) + - huff_sec: bitstream as a string of '0'/'1' + - huff_codebook: codebook id used by the encoder + """ + coeff_sec_arr = np.asarray(coeff_sec, dtype=np.int64).reshape(-1) + + if force_codebook is None: + # Provided utility returns (bitstream, codebook) in the auto-selection case. + huff_sec, huff_codebook = encode_huff(coeff_sec_arr, huff_LUT_list) + return str(huff_sec), int(huff_codebook) + + # Provided utility returns ONLY the bitstream when force_codebook is set. + cb = int(force_codebook) + huff_sec = encode_huff(coeff_sec_arr, huff_LUT_list, force_codebook=cb) + return str(huff_sec), cb + + +def aac_decode_huff( + huff_sec: str | np.ndarray, + huff_codebook: int, + huff_LUT: list[dict[str, Any]], +) -> np.ndarray: + """ + Huffman-decode a bitstream using the specified codebook. + + Parameters + ---------- + huff_sec : str | np.ndarray + Huffman bitstream. 
Typically a string of '0'/'1'. If an array is provided, + it is passed through to the provided decoder. + huff_codebook : int + Codebook id that was returned by aac_encode_huff. + Codebook 0 represents an all-zero section. + huff_LUT : list[dict[str, Any]] + Huffman LUT list as returned by material.load_LUT(). + + Returns + ------- + np.ndarray + Decoded coefficients as a 1-D np.int64 array. + + Note: Due to tuple coding, the decoded array may contain extra trailing + padding symbols. The caller must truncate to the known section length. + """ + cb = int(huff_codebook) + + if cb == 0: + # Codebook 0 represents an all-zero section. The decoded length is not + # recoverable from the bitstream alone; the caller must expand/truncate. + return np.zeros((0,), dtype=np.int64) + + if cb < 0 or cb >= len(huff_LUT): + raise ValueError(f"Invalid Huffman codebook index: {cb}") + + lut = huff_LUT[cb] + dec = decode_huff(huff_sec, lut) + return np.asarray(dec, dtype=np.int64).reshape(-1) diff --git a/source/level_1/core/aac_psycho.py b/source/level_1/core/aac_psycho.py new file mode 100644 index 0000000..8069e45 --- /dev/null +++ b/source/level_1/core/aac_psycho.py @@ -0,0 +1,441 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Psychoacoustic Model +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Psychoacoustic model for ONE channel, based on the assignment notes (Section 2.4). 
+# +# Public API: +# SMR = aac_psycho(frame_T, frame_type, frame_T_prev_1, frame_T_prev_2) +# +# Output: +# - For long frames ("OLS", "LSS", "LPS"): SMR has shape (69,) +# - For short frames ("ESH"): SMR has shape (42, 8) (one column per subframe) +# +# Notes: +# - Uses Bark band tables from material/TableB219.mat: +# * B219a for long windows (69 bands, N=2048 FFT, N/2=1024 bins) +# * B219b for short windows (42 bands, N=256 FFT, N/2=128 bins) +# - Applies a Hann window in time domain before FFT magnitude/phase extraction. +# - Implements: +# spreading function -> band spreading -> tonality index -> masking thresholds -> SMR. +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np + +from core.aac_utils import band_limits, get_table +from core.aac_configuration import NMT_DB, TMN_DB +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Spreading function +# ----------------------------------------------------------------------------- + +def _spreading_matrix(bval: BandValueArray) -> FloatArray: + """ + Compute the spreading function matrix between psychoacoustic bands. + + The spreading function describes how energy in one critical band masks + nearby bands. The formula follows the assignment pseudo-code. + + Parameters + ---------- + bval : BandValueArray + Bark value per band, shape (B,). + + Returns + ------- + FloatArray + Spreading matrix S of shape (B, B), where: + S[bb, b] quantifies the contribution of band bb masking band b. 
+ """ + bval = np.asarray(bval, dtype=np.float64).reshape(-1) + B = int(bval.shape[0]) + + spread = np.zeros((B, B), dtype=np.float64) + + for b in range(B): + for bb in range(B): + # tmpx depends on direction (asymmetric spreading) + if bb >= b: + tmpx = 3.0 * (bval[bb] - bval[b]) + else: + tmpx = 1.5 * (bval[bb] - bval[b]) + + # tmpz uses the "min(..., 0)" nonlinearity exactly as in the notes + tmpz = 8.0 * min((tmpx - 0.5) ** 2 - 2.0 * (tmpx - 0.5), 0.0) + tmpy = 15.811389 + 7.5 * (tmpx + 0.474) - 17.5 * np.sqrt(1.0 + (tmpx + 0.474) ** 2) + + # Clamp very small values (below -100 dB) to 0 contribution + if tmpy < -100.0: + spread[bb, b] = 0.0 + else: + spread[bb, b] = 10.0 ** ((tmpz + tmpy) / 10.0) + + return spread + + +# ----------------------------------------------------------------------------- +# Windowing + FFT feature extraction +# ----------------------------------------------------------------------------- + +def _hann_window(N: int) -> FloatArray: + """ + Hann window as specified in the notes: + w[n] = 0.5 - 0.5*cos(2*pi*(n + 0.5)/N) + + Parameters + ---------- + N : int + Window length. + + Returns + ------- + FloatArray + 1-D array of shape (N,), dtype float64. + """ + n = np.arange(N, dtype=np.float64) + return 0.5 - 0.5 * np.cos((2.0 * np.pi / N) * (n + 0.5)) + + +def _r_phi_from_time(x: FrameChannelT, N: int) -> tuple[FloatArray, FloatArray]: + """ + Compute FFT magnitude r(w) and phase phi(w) for bins w = 0 .. N/2-1. + + Processing: + 1) Apply Hann window in time domain. + 2) Compute N-point FFT. + 3) Keep only the positive-frequency bins [0 .. N/2-1]. + + Parameters + ---------- + x : FrameChannelT + Time-domain samples, shape (N,). + N : int + FFT size (2048 or 256). + + Returns + ------- + r : FloatArray + Magnitude spectrum for bins 0 .. N/2-1, shape (N/2,). + phi : FloatArray + Phase spectrum for bins 0 .. N/2-1, shape (N/2,). 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + if x.shape[0] != N: + raise ValueError(f"Expected time vector of length {N}, got {x.shape[0]}.") + + w = _hann_window(N) + X = np.fft.fft(x * w, n=N) + + Xp = X[: N // 2] + r = np.abs(Xp).astype(np.float64, copy=False) + phi = np.angle(Xp).astype(np.float64, copy=False) + return r, phi + + +def _predictability( + r: FloatArray, + phi: FloatArray, + r_m1: FloatArray, + phi_m1: FloatArray, + r_m2: FloatArray, + phi_m2: FloatArray, +) -> FloatArray: + """ + Compute predictability c(w) per spectral bin. + + The notes define: + r_pred(w) = 2*r_{-1}(w) - r_{-2}(w) + phi_pred(w) = 2*phi_{-1}(w) - phi_{-2}(w) + + c(w) = |X(w) - X_pred(w)| / (r(w) + |r_pred(w)|) + + where X(w) is represented in polar form using r(w), phi(w). + + Parameters + ---------- + r, phi : FloatArray + Current magnitude and phase, shape (N/2,). + r_m1, phi_m1 : FloatArray + Previous magnitude and phase, shape (N/2,). + r_m2, phi_m2 : FloatArray + Pre-previous magnitude and phase, shape (N/2,). + + Returns + ------- + FloatArray + Predictability c(w), shape (N/2,). + """ + r_pred = 2.0 * r_m1 - r_m2 + phi_pred = 2.0 * phi_m1 - phi_m2 + + num = np.sqrt( + (r * np.cos(phi) - r_pred * np.cos(phi_pred)) ** 2 + + (r * np.sin(phi) - r_pred * np.sin(phi_pred)) ** 2 + ) + den = r + np.abs(r_pred) + 1e-12 # avoid division-by-zero without altering behavior + return (num / den).astype(np.float64, copy=False) + + +# ----------------------------------------------------------------------------- +# Band-domain aggregation +# ----------------------------------------------------------------------------- + +def _band_energy_and_pred( + r: FloatArray, + c: FloatArray, + wlow: BandIndexArray, + whigh: BandIndexArray, +) -> tuple[FloatArray, FloatArray]: + """ + Aggregate spectral bin quantities into psychoacoustic bands. 
+ + Definitions (notes): + e(b) = sum_{w=wlow(b)..whigh(b)} r(w)^2 + c_num(b) = sum_{w=wlow(b)..whigh(b)} c(w) * r(w)^2 + + The band predictability c(b) is later computed after spreading as: + cb(b) = ct(b) / ecb(b) + + Parameters + ---------- + r : FloatArray + Magnitude spectrum, shape (N/2,). + c : FloatArray + Predictability per bin, shape (N/2,). + wlow, whigh : BandIndexArray + Band limits (inclusive indices), shape (B,). + + Returns + ------- + e_b : FloatArray + Band energies e(b), shape (B,). + c_num_b : FloatArray + Weighted predictability numerators c_num(b), shape (B,). + """ + r2 = (r * r).astype(np.float64, copy=False) + + B = int(wlow.shape[0]) + e_b = np.zeros(B, dtype=np.float64) + c_num_b = np.zeros(B, dtype=np.float64) + + for b in range(B): + a = int(wlow[b]) + z = int(whigh[b]) + + seg_r2 = r2[a : z + 1] + e_b[b] = float(np.sum(seg_r2)) + c_num_b[b] = float(np.sum(c[a : z + 1] * seg_r2)) + + return e_b, c_num_b + + +def _psycho_window( + time_x: FrameChannelT, + prev1_x: FrameChannelT, + prev2_x: FrameChannelT, + *, + N: int, + table: BarkTable, +) -> FloatArray: + """ + Compute SMR for one FFT analysis window (N=2048 for long, N=256 for short). + + This implements the pipeline described in the notes: + - FFT magnitude/phase + - predictability per bin + - band energies and predictability + - band spreading + - tonality index tb(b) + - masking threshold (noise + threshold in quiet) + - SMR(b) = e(b) / np(b) + + Parameters + ---------- + time_x : FrameChannelT + Current time-domain samples, shape (N,). + prev1_x : FrameChannelT + Previous time-domain samples, shape (N,). + prev2_x : FrameChannelT + Pre-previous time-domain samples, shape (N,). + N : int + FFT size. + table : BarkTable + Psychoacoustic band table (B219a or B219b). + + Returns + ------- + FloatArray + SMR per band, shape (B,). 
+ """ + wlow, whigh, bval, qthr_db = band_limits(table) + spread = _spreading_matrix(bval) + + # FFT features for current and history windows + r, phi = _r_phi_from_time(time_x, N) + r_m1, phi_m1 = _r_phi_from_time(prev1_x, N) + r_m2, phi_m2 = _r_phi_from_time(prev2_x, N) + + # Predictability per bin + c_w = _predictability(r, phi, r_m1, phi_m1, r_m2, phi_m2) + + # Aggregate into psycho bands + e_b, c_num_b = _band_energy_and_pred(r, c_w, wlow, whigh) + + # Spread energies and predictability across bands: + # ecb(b) = sum_bb e(bb) * S(bb, b) + # ct(b) = sum_bb c_num(bb) * S(bb, b) + ecb = spread.T @ e_b + ct = spread.T @ c_num_b + + # Band predictability after spreading: cb(b) = ct(b) / ecb(b) + cb = ct / (ecb + 1e-12) + + # Normalized energy term: + # en(b) = ecb(b) / sum_bb S(bb, b) + spread_colsum = np.sum(spread, axis=0) + en = ecb / (spread_colsum + 1e-12) + + # Tonality index (clamped to [0, 1]) + tb = -0.299 - 0.43 * np.log(np.maximum(cb, 1e-12)) + tb = np.clip(tb, 0.0, 1.0) + + # Required SNR per band (dB): interpolate between TMN and NMT + snr_b = tb * TMN_DB + (1.0 - tb) * NMT_DB + bc = 10.0 ** (-snr_b / 10.0) + + # Noise masking threshold estimate (power domain) + nb = en * bc + + # Threshold in quiet (convert from dB to power domain): + # qthr_power = eps * (N/2) * 10^(qthr_db/10) + qthr_power = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0)) + + # Final masking threshold per band: + # np(b) = max(nb(b), qthr(b)) + npart = np.maximum(nb, qthr_power) + + # Signal-to-mask ratio: + # SMR(b) = e(b) / np(b) + smr = e_b / (npart + 1e-12) + return smr.astype(np.float64, copy=False) + + +# ----------------------------------------------------------------------------- +# ESH window slicing (match filterbank conventions) +# ----------------------------------------------------------------------------- + +def _esh_subframes(x_2048: FrameChannelT) -> list[FrameChannelT]: + """ + Extract the 8 overlapping 256-sample short windows used by AAC ESH. 
+ + The project convention (matching the filterbank) is: + start_j = 448 + 128*j, for j = 0..7 + subframe_j = x[start_j : start_j + 256] + + This selects the central 1152-sample region [448, 1600) and produces + 8 windows with 50% overlap. + + Parameters + ---------- + x_2048 : FrameChannelT + Time-domain channel frame, shape (2048,). + + Returns + ------- + list[FrameChannelT] + List of 8 subframes, each of shape (256,). + """ + x_2048 = np.asarray(x_2048, dtype=np.float64).reshape(-1) + if x_2048.shape[0] != 2048: + raise ValueError("ESH requires 2048-sample input frames.") + + subs: list[FrameChannelT] = [] + for j in range(8): + start = 448 + 128 * j + subs.append(x_2048[start : start + 256]) + return subs + + +# ----------------------------------------------------------------------------- +# Public API +# ----------------------------------------------------------------------------- + +def aac_psycho( + frame_T: FrameChannelT, + frame_type: FrameType, + frame_T_prev_1: FrameChannelT, + frame_T_prev_2: FrameChannelT, +) -> FloatArray: + """ + Psychoacoustic model for ONE channel. + + Parameters + ---------- + frame_T : FrameChannelT + Current time-domain channel frame, shape (2048,). + For "ESH", the 8 short windows are derived internally. + frame_type : FrameType + AAC frame type ("OLS", "LSS", "ESH", "LPS"). + frame_T_prev_1 : FrameChannelT + Previous time-domain channel frame, shape (2048,). + frame_T_prev_2 : FrameChannelT + Pre-previous time-domain channel frame, shape (2048,). + + Returns + ------- + FloatArray + Signal-to-Mask Ratio (SMR), per psychoacoustic band. 
+ - If frame_type == "ESH": shape (42, 8) + - Else: shape (69,) + """ + frame_T = np.asarray(frame_T, dtype=np.float64).reshape(-1) + frame_T_prev_1 = np.asarray(frame_T_prev_1, dtype=np.float64).reshape(-1) + frame_T_prev_2 = np.asarray(frame_T_prev_2, dtype=np.float64).reshape(-1) + + if frame_T.shape[0] != 2048 or frame_T_prev_1.shape[0] != 2048 or frame_T_prev_2.shape[0] != 2048: + raise ValueError("aac_psycho expects 2048-sample frames for current/prev1/prev2.") + + table, N = get_table(frame_type) + + # Long frame types: compute one SMR vector (69 bands) + if frame_type != "ESH": + return _psycho_window(frame_T, frame_T_prev_1, frame_T_prev_2, N=N, table=table) + + # ESH: compute 8 SMR vectors (42 bands each), one per short subframe. + # + # The notes use short-window history for predictability: + # - For j=0: use previous frame's subframes (7, 6) + # - For j=1: use current subframe 0 and previous frame's subframe 7 + # - For j>=2: use current subframes (j-1, j-2) + # + # This matches the "within-frame history" convention commonly used in + # simplified psycho models for ESH. 
+ cur_subs = _esh_subframes(frame_T) + prev1_subs = _esh_subframes(frame_T_prev_1) + + B = int(table.shape[0]) # expected 42 + smr_out = np.zeros((B, 8), dtype=np.float64) + + for j in range(8): + if j == 0: + x_m1 = prev1_subs[7] + x_m2 = prev1_subs[6] + elif j == 1: + x_m1 = cur_subs[0] + x_m2 = prev1_subs[7] + else: + x_m1 = cur_subs[j - 1] + x_m2 = cur_subs[j - 2] + + smr_out[:, j] = _psycho_window(cur_subs[j], x_m1, x_m2, N=256, table=table) + + return smr_out diff --git a/source/level_1/core/aac_quantizer.py b/source/level_1/core/aac_quantizer.py new file mode 100644 index 0000000..f2004b0 --- /dev/null +++ b/source/level_1/core/aac_quantizer.py @@ -0,0 +1,600 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Quantizer / iQuantizer (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Implements AAC quantizer and inverse quantizer for one channel. +# Based on assignment section 2.6 (Eq. 12-15). +# +# Notes: +# - Bit reservoir is not implemented (assignment simplification). +# - Scalefactor bands are assumed equal to psychoacoustic bands +# (Table B.2.1.9a / B.2.1.9b from TableB219.mat). 
+# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np + +from core.aac_utils import get_table, band_limits +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Constants (assignment) +# ----------------------------------------------------------------------------- +MAGIC_NUMBER: float = 0.4054 +EPS: float = 1e-12 +MAX_SF_DELTA:int = 60 + + +# ----------------------------------------------------------------------------- +# Helpers: ESH packing/unpacking (128x8 <-> 1024x1) +# ----------------------------------------------------------------------------- +def _esh_pack(x_128x8: FloatArray) -> FloatArray: + """ + Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1). + + Packing order: + Columns are concatenated in subframe order (0..7), column-major. + + Parameters + ---------- + x_128x8 : FloatArray + ESH coefficients, shape (128, 8). + + Returns + ------- + FloatArray + Packed coefficients, shape (1024, 1). + """ + x_128x8 = np.asarray(x_128x8, dtype=np.float64) + if x_128x8.shape != (128, 8): + raise ValueError("ESH pack expects shape (128, 8).") + return x_128x8.reshape(1024, 1, order="F") + + +def _esh_unpack(x_1024x1: FloatArray) -> FloatArray: + """ + Unpack a packed ESH vector (1024 elements) back to shape (128, 8). + + Parameters + ---------- + x_1024x1 : FloatArray + Packed ESH vector, shape (1024,) or (1024, 1) after flattening. + + Returns + ------- + FloatArray + Unpacked ESH coefficients, shape (128, 8). + """ + x_1024x1 = np.asarray(x_1024x1, dtype=np.float64).reshape(-1) + if x_1024x1.shape[0] != 1024: + raise ValueError("ESH unpack expects 1024 elements.") + return x_1024x1.reshape(128, 8, order="F") + + +# ----------------------------------------------------------------------------- +# Core quantizer formulas (Eq. 12, Eq. 
13) +# ----------------------------------------------------------------------------- +def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols: + """ + Quantize MDCT coefficients to integer symbols S(k). + + Implements Eq. (12): + S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER ) + + Parameters + ---------- + x : FloatArray + MDCT coefficients for a contiguous set of spectral lines. + Shape: (N,) + alpha : float + Scalefactor gain for the corresponding scalefactor band. + + Returns + ------- + QuantizedSymbols + Quantized symbols S(k) as int64, shape (N,). + """ + x = np.asarray(x, dtype=np.float64) + + scale = 2.0 ** (-0.25 * float(alpha)) + ax = np.abs(x) * scale + + y = np.power(ax, 0.75, dtype=np.float64) + + # "int" in the assignment corresponds to truncation. + q = np.floor(y + MAGIC_NUMBER).astype(np.int64) + return (np.sign(x).astype(np.int64) * q).astype(np.int64) + + +def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray: + """ + Inverse quantizer (dequantization of symbols). + + Implements Eq. (13): + Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4) + + Parameters + ---------- + S : QuantizedSymbols + Quantized symbols S(k), int64, shape (N,). + alpha : float + Scalefactor gain for the corresponding scalefactor band. + + Returns + ------- + FloatArray + Reconstructed MDCT coefficients Xhat(k), float64, shape (N,). + """ + S = np.asarray(S, dtype=np.int64) + + scale = 2.0 ** (0.25 * float(alpha)) + aS = np.abs(S).astype(np.float64) + y = np.power(aS, 4.0 / 3.0, dtype=np.float64) + + return (np.sign(S).astype(np.float64) * y * scale).astype(np.float64) + + +# ----------------------------------------------------------------------------- +# Alpha initialization (Eq. 14) +# ----------------------------------------------------------------------------- +def _initial_alpha_hat(X: "FloatArray", MQ: int = 8191) -> int: + """ + Compute the initial scalefactor estimate alpha_hat for a frame. 
+ + The assignment proposes the following first approximation (Equation 14): + + alpha_hat = (16/3) * log2( max_k(|X(k)|)^(3/4) / MQ ) + + where max_k runs over all MDCT coefficients of the frame (not per band), + and MQ is the maximum quantization level parameter (2*MQ + 1 levels). + + Parameters + ---------- + X : FloatArray + MDCT coefficients of one frame (or one ESH subframe), shape (N,). + MQ : int + Quantizer parameter (default 8191, as per assignment). + + Returns + ------- + int + Integer alpha_hat (rounded to nearest integer). + """ + x_max = float(np.max(np.abs(X))) + if x_max <= 0.0: + return 0 + + alpha_hat = (16.0 / 3.0) * np.log2((x_max ** (3.0 / 4.0)) / float(MQ)) + return int(np.round(alpha_hat)) + + +# ----------------------------------------------------------------------------- +# Band utilities +# ----------------------------------------------------------------------------- +def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]: + """ + Return scalefactor band ranges [wlow, whigh] (inclusive) for the given frame type. + + These are derived from the psychoacoustic tables (TableB219), + and map directly to MDCT indices: + - long: 0..1023 + - short (ESH subframe): 0..127 + + Parameters + ---------- + frame_type : FrameType + Frame type ("OLS", "LSS", "ESH", "LPS"). + + Returns + ------- + list[tuple[int, int]] + List of (lo, hi) inclusive index pairs for each band. + """ + table, _Nfft = get_table(frame_type) + wlow, whigh, _bval, _qthr_db = band_limits(table) + + bands: list[tuple[int, int]] = [] + for lo, hi in zip(wlow, whigh): + bands.append((int(lo), int(hi))) + return bands + + +def _band_energy(x: FloatArray, lo: int, hi: int) -> float: + """ + Compute energy of a spectral segment x[lo:hi+1]. + + Parameters + ---------- + x : FloatArray + MDCT coefficient vector. + lo, hi : int + Inclusive index range. + + Returns + ------- + float + Sum of squares (energy) within the band. 
+ """ + sec = x[lo : hi + 1] + return float(np.sum(sec * sec)) + + +def _psychoacoustic_threshold( + X: FloatArray, + SMR_col: FloatArray, + bands: list[tuple[int, int]], +) -> FloatArray: + """ + Compute psychoacoustic thresholds T(b) per band. + + Uses: + P(b) = sum_{k in band} X(k)^2 + T(b) = P(b) / SMR(b) + + Parameters + ---------- + X : FloatArray + MDCT coefficients for a frame (long) or one ESH subframe (short). + SMR_col : FloatArray + SMR values for this frame/subframe, shape (NB,). + bands : list[tuple[int, int]] + Band index ranges. + + Returns + ------- + FloatArray + Threshold vector T(b), shape (NB,). + """ + nb = len(bands) + T = np.zeros((nb,), dtype=np.float64) + + for b, (lo, hi) in enumerate(bands): + P = _band_energy(X, lo, hi) + smr = float(SMR_col[b]) + if smr <= EPS: + T[b] = 0.0 + else: + T[b] = P / smr + + return T + + +# ----------------------------------------------------------------------------- +# Alpha selection per band + neighbor-difference constraint +# ----------------------------------------------------------------------------- +def _best_alpha_for_band( + X: "FloatArray", lo: int, hi: int, T_b: float, + alpha_hat: int, alpha_prev: int, alpha_min: int, alpha_max: int, +) -> int: + """ + Determine the band-wise scalefactor alpha(b) following the assignment. + + Procedure: + - Start from a frame-wise initial estimate alpha_hat. + - Iteratively increase alpha(b) by 1 as long as the quantization error power + stays below the psychoacoustic threshold T(b): P_e(b) = sum_{k in band} ( X(k) - Xhat(k) )^2 + + - Stop increasing alpha(b) if the neighbor constraint would be violated: |alpha(b) - alpha(b-1)| <= 60 + When processing bands sequentially (low -> high), this becomes: alpha(b) <= alpha_prev + 60 + + Notes: + - This function does not decrease alpha if the initial value already violates + the threshold; the assignment only specifies iterative increase. 
+ + Parameters + ---------- + X : FloatArray + Full MDCT vector of the current (sub)frame, shape (N,). + lo, hi : int + Band index bounds (inclusive), defining the band slice. + T_b : float + Threshold T(b) for this band. + alpha_hat : int + Initial frame-wise estimate (Equation 14). + alpha_prev : int + Previously selected alpha for band b-1 (neighbor constraint reference). + alpha_min, alpha_max : int + Safeguard bounds for alpha. + + Returns + ------- + int + Selected integer alpha(b). + """ + if T_b <= 0.0: + return int(alpha_hat) + Xsec = X[lo : hi + 1] + + # Neighbor constraint (sequential processing): alpha(b) <= alpha_prev + 60 + alpha_limit = min(int(alpha_max), int(alpha_prev) + MAX_SF_DELTA) + + # Start from alpha_hat, clamped to feasible range + alpha = int(alpha_hat) + alpha = max(int(alpha_min), min(alpha, int(alpha_limit))) + + # Evaluate at current alpha + Ssec = _quantize_symbol(Xsec, alpha) + Xhat = _dequantize_symbol(Ssec, alpha) + Pe = float(np.sum((Xsec - Xhat) ** 2)) + + # If already above threshold, return current alpha (no decrease step specified) + if Pe > T_b: + return alpha + + # Increase alpha while still under threshold and within constraints + while True: + alpha_next = alpha + 1 + if alpha_next > alpha_limit: + break + Ssec = _quantize_symbol(Xsec, alpha_next) + Xhat = _dequantize_symbol(Ssec, alpha_next) + Pe_next = float(np.sum((Xsec - Xhat) ** 2)) + + if Pe_next > T_b: + break + alpha = alpha_next + + return alpha + + +# ----------------------------------------------------------------------------- +# Public API +# ----------------------------------------------------------------------------- +def aac_quantizer( + frame_F: FrameChannelF, + frame_type: FrameType, + SMR: FloatArray, +) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]: + """ + AAC quantizer for one channel (Level 3). + + Quantizes MDCT coefficients (after TNS) using band-wise scalefactors derived + from psychoacoustic thresholds computed via SMR. 
+ + The implementation follows the assignment procedure: + - Compute an initial frame-wise alpha_hat using Equation (14), based on the + maximum MDCT coefficient magnitude of the (sub)frame. + - For each band b, increase alpha(b) by 1 while the quantization error power + P_e(b) stays below the threshold T(b). + - Enforce the neighbor constraint |alpha(b) - alpha(b-1)| <= 60 during the + band-by-band search (no post-processing needed). + + Parameters + ---------- + frame_F : FrameChannelF + MDCT coefficients after TNS, one channel. + Shapes: + - Long frames: (1024,) or (1024, 1) + - ESH: (128, 8) + frame_type : FrameType + AAC frame type ("OLS", "LSS", "ESH", "LPS"). + SMR : FloatArray + Signal-to-Mask Ratio per band. + Shapes: + - Long: (NB,) or (NB, 1) + - ESH: (NB, 8) + + Returns + ------- + S : QuantizedSymbols + Quantized symbols S(k), packed as shape (1024, 1) for all frame types. + For ESH, the 8 subframes are packed in column-major subframe layout. + sfc : ScaleFactors + DPCM-coded scalefactors: + sfc(0) = alpha(0) = G + sfc(b) = alpha(b) - alpha(b-1), for b > 0 + Shapes: + - Long: (NB, 1) + - ESH: (NB, 8) + G : GlobalGain + Global gain G = alpha(0). 
+ - Long: scalar float + - ESH: array shape (1, 8), dtype float64 + """ + bands = _band_slices(frame_type) + NB = len(bands) + + X = np.asarray(frame_F, dtype=np.float64) + SMR = np.asarray(SMR, dtype=np.float64) + + # ------------------------------------------------------------------------- + # ESH: 8 short subframes, each of length 128 + # ------------------------------------------------------------------------- + if frame_type == "ESH": + if X.shape != (128, 8): + raise ValueError("For ESH, frame_F must have shape (128, 8).") + if SMR.shape != (NB, 8): + raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).") + + S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64) + sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64) + G_arr = np.zeros((1, 8), dtype=np.float64) + + # Packed output view: (128, 8) with column-major layout + S_pack = S_out[:, 0].reshape(128, 8, order="F") + + for j in range(8): + Xj = X[:, j].reshape(128) + SMRj = SMR[:, j].reshape(NB) + + # Compute psychoacoustic threshold T(b) for this subframe + T = _psychoacoustic_threshold(Xj, SMRj, bands) + + # Frame-wise initial estimate alpha_hat (Equation 14) + alpha_hat = _initial_alpha_hat(Xj) + + # Band-wise scalefactors alpha(b) + alpha = np.zeros((NB,), dtype=np.int64) + alpha_prev = int(alpha_hat) + + for b, (lo, hi) in enumerate(bands): + alpha_b = _best_alpha_for_band( + X=Xj, + lo=lo, + hi=hi, + T_b=float(T[b]), + alpha_hat=int(alpha_hat), + alpha_prev=int(alpha_prev), + alpha_min=-4096, + alpha_max=4096, + ) + alpha[b] = int(alpha_b) + alpha_prev = int(alpha_b) + + # DPCM-coded scalefactors + G_arr[0, j] = float(alpha[0]) + sfc[0, j] = int(alpha[0]) + for b in range(1, NB): + sfc[b, j] = int(alpha[b] - alpha[b - 1]) + + # Quantize MDCT coefficients band-by-band + Sj = np.zeros((128,), dtype=np.int64) + for b, (lo, hi) in enumerate(bands): + Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b])) + + # Store subframe in packed output + S_pack[:, j] = Sj + + return 
S_out, sfc, G_arr + + # ------------------------------------------------------------------------- + # Long frames: OLS / LSS / LPS, length 1024 + # ------------------------------------------------------------------------- + if X.shape == (1024,): + Xv = X + elif X.shape == (1024, 1): + Xv = X[:, 0] + else: + raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).") + + if SMR.shape == (NB,): + SMRv = SMR + elif SMR.shape == (NB, 1): + SMRv = SMR[:, 0] + else: + raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).") + + # Compute psychoacoustic threshold T(b) for the long frame + T = _psychoacoustic_threshold(Xv, SMRv, bands) + + # Frame-wise initial estimate alpha_hat (Equation 14) + alpha_hat = _initial_alpha_hat(Xv) + + # Band-wise scalefactors alpha(b) + alpha = np.zeros((NB,), dtype=np.int64) + alpha_prev = int(alpha_hat) + + for b, (lo, hi) in enumerate(bands): + alpha_b = _best_alpha_for_band( + X=Xv, + lo=lo, + hi=hi, + T_b=float(T[b]), + alpha_hat=int(alpha_hat), + alpha_prev=int(alpha_prev), + alpha_min=-4096, + alpha_max=4096, + ) + alpha[b] = int(alpha_b) + alpha_prev = int(alpha_b) + + # DPCM-coded scalefactors + sfc_out: ScaleFactors = np.zeros((NB, 1), dtype=np.int64) + sfc_out[0, 0] = int(alpha[0]) + for b in range(1, NB): + sfc_out[b, 0] = int(alpha[b] - alpha[b - 1]) + + G: float = float(alpha[0]) + + # Quantize MDCT coefficients band-by-band + S_vec = np.zeros((1024,), dtype=np.int64) + for b, (lo, hi) in enumerate(bands): + S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b])) + + return S_vec.reshape(1024, 1), sfc_out, G + + +def aac_i_quantizer( + S: QuantizedSymbols, + sfc: ScaleFactors, + G: GlobalGain, + frame_type: FrameType, +) -> FrameChannelF: + """ + Inverse quantizer (iQuantizer) for one channel. + + Reconstructs MDCT coefficients from quantized symbols and DPCM scalefactors. 
+ + Parameters + ---------- + S : QuantizedSymbols + Quantized symbols, shape (1024, 1) (or any array with 1024 elements). + sfc : ScaleFactors + DPCM-coded scalefactors. + Shapes: + - Long: (NB, 1) + - ESH: (NB, 8) + G : GlobalGain + Global gain (not strictly required if sfc includes sfc(0)=alpha(0)). + Present for API compatibility with the assignment. + frame_type : FrameType + AAC frame type. + + Returns + ------- + FrameChannelF + Reconstructed MDCT coefficients: + - ESH: (128, 8) + - Long: (1024, 1) + """ + bands = _band_slices(frame_type) + NB = len(bands) + + S_flat = np.asarray(S, dtype=np.int64).reshape(-1) + if S_flat.shape[0] != 1024: + raise ValueError("S must contain 1024 symbols.") + + if frame_type == "ESH": + sfc = np.asarray(sfc, dtype=np.int64) + if sfc.shape != (NB, 8): + raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).") + + S_128x8 = _esh_unpack(S_flat) + + Xrec = np.zeros((128, 8), dtype=np.float64) + + for j in range(8): + alpha = np.zeros((NB,), dtype=np.int64) + alpha[0] = int(sfc[0, j]) + for b in range(1, NB): + alpha[b] = int(alpha[b - 1] + sfc[b, j]) + + Xj = np.zeros((128,), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + Xj[lo : hi + 1] = _dequantize_symbol(S_128x8[lo : hi + 1, j].astype(np.int64), float(alpha[b])) + + Xrec[:, j] = Xj + + return Xrec + + sfc = np.asarray(sfc, dtype=np.int64) + if sfc.shape != (NB, 1): + raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).") + + alpha = np.zeros((NB,), dtype=np.int64) + alpha[0] = int(sfc[0, 0]) + for b in range(1, NB): + alpha[b] = int(alpha[b - 1] + sfc[b, 0]) + + Xrec = np.zeros((1024,), dtype=np.float64) + for b, (lo, hi) in enumerate(bands): + Xrec[lo : hi + 1] = _dequantize_symbol(S_flat[lo : hi + 1], float(alpha[b])) + + return Xrec.reshape(1024, 1) diff --git a/source/level_1/core/aac_tns.py b/source/level_1/core/aac_tns.py new file mode 100644 index 0000000..4ba78ac --- /dev/null +++ b/source/level_1/core/aac_tns.py @@ -0,0 +1,514 @@ +# 
------------------------------------------------------------ +# AAC Coder/Decoder - Temporal Noise Shaping (TNS) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Temporal Noise Shaping (TNS) module (Level 2). +# +# Public API: +# frame_F_out, tns_coeffs = aac_tns(frame_F_in, frame_type) +# frame_F_out = aac_i_tns(frame_F_in, frame_type, tns_coeffs) +# +# Notes (per assignment): +# - TNS is applied per channel (not stereo). +# - For ESH, TNS is applied independently to each of the 8 short subframes. +# - Bark band tables are taken from TableB.2.1.9a (long) and TableB.2.1.9b (short) +# provided in TableB219.mat. +# - Predictor order is fixed to p = 4. +# - Coefficients are quantized with a 4-bit uniform symmetric quantizer, step = 0.1. +# - Forward TNS applies FIR: H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p +# - Inverse TNS applies the inverse IIR filter using the same quantized coefficients. +# ------------------------------------------------------------ +from __future__ import annotations + +from pathlib import Path +from typing import Tuple + +from core.aac_utils import load_b219_tables +from core.aac_configuration import PRED_ORDER, QUANT_STEP, QUANT_MAX +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Private helpers +# ----------------------------------------------------------------------------- + + +def _band_ranges(k_count: int) -> BandRanges: + """ + Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count. + + Parameters + ---------- + k_count : int + Number of MDCT lines: + - 1024 for long frames + - 128 for short subframes (ESH) + + Returns + ------- + BandRanges (list[tuple[int, int]]) + Each tuple is (start_k, end_k) inclusive. 
+ """ + tables = load_b219_tables() + if k_count == 1024: + tbl = tables["B219a"] + elif k_count == 128: + tbl = tables["B219b"] + else: + raise ValueError("TNS supports only k_count=1024 (long) or k_count=128 (short).") + + start = tbl[:, 1].astype(int) + end = tbl[:, 2].astype(int) + + ranges: BandRanges = [(int(s), int(e)) for s, e in zip(start, end)] + + for s, e in ranges: + if s < 0 or e < s or e >= k_count: + raise ValueError("Invalid band table ranges for given k_count.") + return ranges + + +# ----------------------------------------------------------------------------- +# Core DSP helpers +# ----------------------------------------------------------------------------- + +def _smooth_sw_inplace(sw: MdctCoeffs) -> None: + """ + Smooth Sw(k) to reduce discontinuities between adjacent Bark bands. + + The assignment applies two passes: + - Backward: Sw(k) = (Sw(k) + Sw(k+1))/2 + - Forward: Sw(k) = (Sw(k) + Sw(k-1))/2 + + Parameters + ---------- + sw : MdctCoeffs + 1-D array of length K (float64). Modified in-place. + """ + k_count = int(sw.shape[0]) + + for k in range(k_count - 2, -1, -1): + sw[k] = 0.5 * (sw[k] + sw[k + 1]) + + for k in range(1, k_count): + sw[k] = 0.5 * (sw[k] + sw[k - 1]) + + +def _compute_sw(x: MdctCoeffs) -> MdctCoeffs: + """ + Compute Sw(k) from band energies P(j) and apply boundary smoothing. + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT line array, length K. + + Returns + ------- + MdctCoeffs + Sw(k), 1-D array of length K, float64. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + k_count = int(x.shape[0]) + + bands = _band_ranges(k_count) + sw = np.zeros(k_count, dtype=np.float64) + + for s, e in bands: + seg = x[s : e + 1] + p_j = float(np.sum(seg * seg)) + sw_val = float(np.sqrt(p_j)) + sw[s : e + 1] = sw_val + + _smooth_sw_inplace(sw) + return sw + + +def _autocorr(x: MdctCoeffs, p: int) -> MdctCoeffs: + """ + Autocorrelation r(m) for m=0..p. + + Parameters + ---------- + x : MdctCoeffs + 1-D signal. 
+ p : int + Maximum lag. + + Returns + ------- + MdctCoeffs + r, shape (p+1,), float64. + """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + n = int(x.shape[0]) + + r = np.zeros(p + 1, dtype=np.float64) + for m in range(p + 1): + r[m] = float(np.dot(x[m:], x[: n - m])) + return r + + +def _lpc_coeffs(xw: MdctCoeffs, p: int) -> MdctCoeffs: + """ + Solve Yule-Walker normal equations for LPC coefficients of order p. + + Parameters + ---------- + xw : MdctCoeffs + 1-D normalized sequence Xw(k). + p : int + Predictor order. + + Returns + ------- + MdctCoeffs + LPC coefficients a[0..p-1], shape (p,), float64. + """ + r = _autocorr(xw, p) + + R = np.empty((p, p), dtype=np.float64) + for i in range(p): + for j in range(p): + R[i, j] = r[abs(i - j)] + + rhs = r[1 : p + 1].reshape(p) + + reg = 1e-12 + R_reg = R + reg * np.eye(p, dtype=np.float64) + + a = np.linalg.solve(R_reg, rhs) + return a + + +def _quantize_coeffs(a: MdctCoeffs) -> MdctCoeffs: + """ + Quantize LPC coefficients with uniform symmetric quantizer and clamp. + + Parameters + ---------- + a : MdctCoeffs + LPC coefficient array, shape (p,). + + Returns + ------- + MdctCoeffs + Quantized coefficients, shape (p,), float64. + """ + a = np.asarray(a, dtype=np.float64).reshape(-1) + q = np.round(a / QUANT_STEP) * QUANT_STEP + q = np.clip(q, -QUANT_MAX, QUANT_MAX) + return q.astype(np.float64, copy=False) + + +def _is_inverse_stable(a_q: MdctCoeffs) -> bool: + """ + Check stability of the inverse TNS filter H_TNS^{-1}. + + Forward filter: + H_TNS(z) = 1 - a1 z^-1 - ... - ap z^-p + + Inverse filter poles are roots of: + A(z) = 1 - a1 z^-1 - ... - ap z^-p + Multiply by z^p: + z^p - a1 z^{p-1} - ... - ap = 0 + + Stability condition: + all roots satisfy |z| < 1. + + Parameters + ---------- + a_q : MdctCoeffs + Quantized predictor coefficients, shape (p,). + + Returns + ------- + bool + True if stable, else False. 
+ """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + + # Polynomial in z: z^p - a1 z^{p-1} - ... - ap + poly = np.empty(p + 1, dtype=np.float64) + poly[0] = 1.0 + poly[1:] = -a_q + + roots = np.roots(poly) + + # Strictly inside unit circle for stability. Add tiny margin for numeric safety. + margin = 1e-12 + return bool(np.all(np.abs(roots) < (1.0 - margin))) + + +def _stabilize_quantized_coeffs(a_q: MdctCoeffs) -> MdctCoeffs: + """ + Make quantized predictor coefficients stable for inverse filtering. + + Policy: + - If already stable: return as-is. + - Else: iteratively shrink coefficients by gamma and re-quantize to the 0.1 grid. + - If still unstable after attempts: fall back to all-zero coefficients (disable TNS). + + Parameters + ---------- + a_q : MdctCoeffs + Quantized predictor coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Stable quantized coefficients, shape (p,). + """ + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + + if _is_inverse_stable(a_q): + return a_q + + # Try a few shrinking factors. Re-quantize after shrinking to keep coefficients on-grid. + gammas = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1) + + for g in gammas: + cand = _quantize_coeffs(g * a_q) + if _is_inverse_stable(cand): + return cand + + # Last resort: disable TNS for this vector + return np.zeros_like(a_q, dtype=np.float64) + + +def _apply_tns_fir(x: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: + """ + Apply forward TNS FIR filter: + y[k] = x[k] - sum_{l=1..p} a_l * x[k-l] + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT lines, length K. + a_q : MdctCoeffs + Quantized LPC coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Filtered MDCT lines y, length K. 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + k_count = int(x.shape[0]) + + y = np.zeros(k_count, dtype=np.float64) + for k in range(k_count): + acc = x[k] + for l in range(1, p + 1): + if k - l >= 0: + acc -= a_q[l - 1] * x[k - l] + y[k] = acc + return y + + +def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: + """ + Apply inverse TNS IIR filter: + x_hat[k] = y[k] + sum_{l=1..p} a_l * x_hat[k-l] + + Parameters + ---------- + y : MdctCoeffs + 1-D MDCT lines after TNS, length K. + a_q : MdctCoeffs + Quantized LPC coefficients, shape (p,). + + Returns + ------- + MdctCoeffs + Reconstructed MDCT lines x_hat, length K. + """ + y = np.asarray(y, dtype=np.float64).reshape(-1) + a_q = np.asarray(a_q, dtype=np.float64).reshape(-1) + p = int(a_q.shape[0]) + k_count = int(y.shape[0]) + + x_hat = np.zeros(k_count, dtype=np.float64) + for k in range(k_count): + acc = y[k] + for l in range(1, p + 1): + if k - l >= 0: + acc += a_q[l - 1] * x_hat[k - l] + x_hat[k] = acc + return x_hat + + +def _tns_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: + """ + TNS for a single MDCT vector (one long frame or one short subframe). + + Steps: + 1) Compute Sw(k) from Bark band energies and smooth it. + 2) Normalize: Xw(k) = X(k) / Sw(k) (safe when Sw=0). + 3) Compute LPC coefficients (order p=PRED_ORDER) on Xw. + 4) Quantize coefficients (4-bit symmetric, step QUANT_STEP). + 5) Apply FIR filter on original X(k) using quantized coefficients. + + Parameters + ---------- + x : MdctCoeffs + 1-D MDCT vector. + + Returns + ------- + y : MdctCoeffs + TNS-processed MDCT vector (same length). + a_q : MdctCoeffs + Quantized LPC coefficients, shape (PRED_ORDER,). 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + sw = _compute_sw(x) + + eps = 1e-12 + xw = np.zeros_like(x, dtype=np.float64) + mask = sw > eps + np.divide(x, sw, out=xw, where=mask) + + a = _lpc_coeffs(xw, PRED_ORDER) + a_q = _quantize_coeffs(a) + + # Ensure inverse stability (assignment requirement) + a_q = _stabilize_quantized_coeffs(a_q) + + y = _apply_tns_fir(x, a_q) + return y, a_q + + + +# ----------------------------------------------------------------------------- +# Public Functions +# ----------------------------------------------------------------------------- + +def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChannelF, TnsCoeffs]: + """ + Temporal Noise Shaping (TNS) for ONE channel. + + Parameters + ---------- + frame_F_in : FrameChannelF + Per-channel MDCT coefficients. + Expected (typical) shapes: + - If frame_type == "ESH": (128, 8) + - Else: (1024, 1) or (1024,) + + frame_type : FrameType + Frame type code ("OLS", "LSS", "ESH", "LPS"). + + Returns + ------- + frame_F_out : FrameChannelF + Per-channel MDCT coefficients after applying TNS. + Same shape convention as input. + + tns_coeffs : TnsCoeffs + Quantized TNS predictor coefficients. 
+ Expected shapes: + - If frame_type == "ESH": (PRED_ORDER, 8) + - Else: (PRED_ORDER, 1) + """ + x = np.asarray(frame_F_in, dtype=np.float64) + + if frame_type == "ESH": + if x.shape != (128, 8): + raise ValueError("For ESH, frame_F_in must have shape (128, 8).") + + y = np.empty_like(x, dtype=np.float64) + a_out = np.empty((PRED_ORDER, 8), dtype=np.float64) + + for j in range(8): + y[:, j], a_out[:, j] = _tns_vector(x[:, j]) + + return y, a_out + + if x.shape == (1024,): + x_vec = x + out_shape = (1024,) + elif x.shape == (1024, 1): + x_vec = x[:, 0] + out_shape = (1024, 1) + else: + raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') + + y_vec, a_q = _tns_vector(x_vec) + + if out_shape == (1024,): + y_out = y_vec + else: + y_out = y_vec.reshape(1024, 1) + + a_out = a_q.reshape(PRED_ORDER, 1) + return y_out, a_out + + +def aac_i_tns(frame_F_in: FrameChannelF, frame_type: FrameType, tns_coeffs: TnsCoeffs) -> FrameChannelF: + """ + Inverse Temporal Noise Shaping (iTNS) for ONE channel. + + Parameters + ---------- + frame_F_in : FrameChannelF + Per-channel MDCT coefficients after TNS. + Expected (typical) shapes: + - If frame_type == "ESH": (128, 8) + - Else: (1024, 1) or (1024,) + + frame_type : FrameType + Frame type code ("OLS", "LSS", "ESH", "LPS"). + + tns_coeffs : TnsCoeffs + Quantized TNS predictor coefficients. + Expected shapes: + - If frame_type == "ESH": (PRED_ORDER, 8) + - Else: (PRED_ORDER, 1) + + Returns + ------- + FrameChannelF + Per-channel MDCT coefficients after inverse TNS. + Same shape convention as input frame_F_in. 
+ """ + x = np.asarray(frame_F_in, dtype=np.float64) + a = np.asarray(tns_coeffs, dtype=np.float64) + + if frame_type == "ESH": + if x.shape != (128, 8): + raise ValueError("For ESH, frame_F_in must have shape (128, 8).") + if a.shape != (PRED_ORDER, 8): + raise ValueError("For ESH, tns_coeffs must have shape (PRED_ORDER, 8).") + + y = np.empty_like(x, dtype=np.float64) + for j in range(8): + y[:, j] = _apply_itns_iir(x[:, j], a[:, j]) + return y + + if a.shape != (PRED_ORDER, 1): + raise ValueError("For non-ESH, tns_coeffs must have shape (PRED_ORDER, 1).") + + if x.shape == (1024,): + x_vec = x + out_shape = (1024,) + elif x.shape == (1024, 1): + x_vec = x[:, 0] + out_shape = (1024, 1) + else: + raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') + + y_vec = _apply_itns_iir(x_vec, a[:, 0]) + + if out_shape == (1024,): + return y_vec + return y_vec.reshape(1024, 1) diff --git a/source/level_1/core/aac_utils.py b/source/level_1/core/aac_utils.py index 0e18ed1..26cdbdf 100644 --- a/source/level_1/core/aac_utils.py +++ b/source/level_1/core/aac_utils.py @@ -163,6 +163,42 @@ def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: return float(10.0 * np.log10(ps / pn)) +def estimate_lag_mono(x_ref: TimeSignal, x_hat: TimeSignal, max_lag=4096): + """ + Estimate time lag between two mono signals. + Returns lag (positive means x_hat delayed). + """ + n = min(len(x_ref), len(x_hat)) + x_ref = x_ref[:n] + x_hat = x_hat[:n] + + corr = np.correlate(x_ref, x_hat, mode='full') + lags = np.arange(-n + 1, n) + + center = n - 1 + lo = max(0, center - max_lag) + hi = min(len(corr), center + max_lag + 1) + + best = lo + int(np.argmax(corr[lo:hi])) + return int(lags[best]) + + +def match_gain(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Least-squares gain g that best maps x_hat -> x_ref. 
+ """ + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) + + r = x_ref[:n, :c].reshape(-1).astype(np.float64) + h = x_hat[:n, :c].reshape(-1).astype(np.float64) + + denom = float(np.dot(h, h)) + if denom <= 0.0: + return 1.0 + return float(np.dot(r, h) / denom) + + # ----------------------------------------------------------------------------- # Psychoacoustic band tables (TableB219.mat) # ----------------------------------------------------------------------------- diff --git a/source/level_1/material/huff_utils.py b/source/level_1/material/huff_utils.py new file mode 100644 index 0000000..8cdd279 --- /dev/null +++ b/source/level_1/material/huff_utils.py @@ -0,0 +1,403 @@ +import numpy as np +import scipy.io as sio +import os + +# ------------------ LOAD LUT ------------------ + +def load_LUT(mat_filename=None): + """ + Loads the list of Huffman Codebooks (LUTs) + + Returns: + huffLUT : list (index 1..11 used, index 0 unused) + """ + if mat_filename is None: + current_dir = os.path.dirname(os.path.abspath(__file__)) + mat_filename = os.path.join(current_dir, "huffCodebooks.mat") + + mat = sio.loadmat(mat_filename) + + + huffCodebooks_raw = mat['huffCodebooks'].squeeze() + + huffCodebooks = [] + for i in range(11): + huffCodebooks.append(np.array(huffCodebooks_raw[i])) + + # Build inverse VLC tables + invTable = [None] * 11 + + for i in range(11): + h = huffCodebooks[i][:, 2].astype(int) # column 3 + hlength = huffCodebooks[i][:, 1].astype(int) # column 2 + + hbin = [] + for j in range(len(h)): + hbin.append(format(h[j], f'0{hlength[j]}b')) + + invTable[i] = vlc_table(hbin) + + # Build Huffman LUT dicts + huffLUT = [None] * 12 # index 0 unused + params = [ + (4, 1, True), + (4, 1, True), + (4, 2, False), + (4, 2, False), + (2, 4, True), + (2, 4, True), + (2, 7, False), + (2, 7, False), + (2, 12, False), + (2, 12, False), + (2, 16, False), + ] + + for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1): + huffLUT[i] = { + 
def vlc_table(code_array):
    """
    Build an inverse VLC (prefix-code) lookup table.

    Parameters
    ----------
    code_array : list of str
        Huffman codewords as bit strings (e.g. '0101'); the symbol index is
        the 1-based position of the codeword in this list.

    Returns
    -------
    np.ndarray of shape (num_nodes, 3), int
        Trie encoded as rows [next_if_0, next_if_1, symbol_index], rooted at
        row 0. Internal nodes carry symbol_index 0; leaf nodes store the
        1-based symbol index of the decoded codeword.
    """
    # Start with a single root node; rows are appended as the trie grows.
    table = np.zeros((1, 3), dtype=int)

    for symbol, bits in enumerate(code_array, start=1):
        node = 0
        for ch in bits:
            branch = int(ch)
            child = table[node, branch]
            if child == 0:
                # Allocate a fresh node and link it from the current one.
                table = np.vstack([table, [0, 0, 0]])
                child = table.shape[0] - 1
                table[node, branch] = child
            node = child
        # Mark the terminal node with the (1-based) symbol index.
        table[node, 2] = symbol

    return table
+ + huffCodebook : int + Index (1..11) of the Huffman codebook used for encoding. + A value of 0 indicates a special all-zero section. + """ + if force_codebook is not None: + return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec) + + maxAbsVal = np.max(np.abs(coeff_sec)) + + if maxAbsVal == 0: + huffCodebook = 0 + huffSec = huff_LUT_code_0() + + elif maxAbsVal == 1: + candidates = [1, 2] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal == 2: + candidates = [3, 4] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (3, 4): + candidates = [5, 6] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (5, 6, 7): + candidates = [7, 8] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = candidates[0] + else: + huffSec = huffSec2 + huffCodebook = candidates[1] + + elif maxAbsVal in (8, 9, 10, 11, 12): + candidates = [9, 10] + huffSec1 = huff_LUT_code_1(huff_LUT_list[candidates[0]], coeff_sec) + huffSec2 = huff_LUT_code_1(huff_LUT_list[candidates[1]], coeff_sec) + if len(huffSec1) <= len(huffSec2): + huffSec = huffSec1 + huffCodebook = 
def huff_LUT_code_1(huff_LUT, coeff_sec):
    """
    Encode a coefficient section with a single Huffman codebook.

    Coefficients are grouped into n-tuples (zero-padded at the end), each
    tuple is mapped to a LUT row index, and the corresponding codeword is
    emitted. For unsigned codebooks one sign bit per coefficient is appended
    after each codeword ('1' = negative).

    Parameters
    ----------
    huff_LUT : dict
        Codebook entry with keys 'LUT', 'nTupleSize', 'maxAbsCodeVal',
        'signedValues'.
    coeff_sec : np.ndarray of int
        1-D coefficient section to encode.

    Returns
    -------
    str
        Bitstream as a string of '0'/'1' characters.
    """
    LUT = huff_LUT['LUT']
    tuple_len = huff_LUT['nTupleSize']
    max_abs = huff_LUT['maxAbsCodeVal']
    signed = huff_LUT['signedValues']

    n_tuples = int(np.ceil(len(coeff_sec) / tuple_len))

    # Signed codebooks fold the sign into the symbol via an offset;
    # unsigned codebooks transmit magnitudes plus explicit sign bits.
    if signed:
        symbols = coeff_sec + max_abs
        radix = 2 * max_abs + 1
    else:
        symbols = coeff_sec
        radix = max_abs + 1

    padded = np.zeros(n_tuples * tuple_len, dtype=int)
    padded[:len(symbols)] = symbols

    # Mixed-radix weights: most significant tuple element first.
    weights = radix ** np.arange(tuple_len - 1, -1, -1)

    out = []
    for t in range(n_tuples):
        group = padded[t * tuple_len:(t + 1) * tuple_len]
        row = int(np.abs(group) @ weights)

        code_val = LUT[row, 2]
        code_len = LUT[row, 1]
        word = format(int(code_val), f'0{int(code_len)}b')

        if signed:
            out.append(word)
        else:
            signs = ''.join('1' if g < 0 else '0' for g in group)
            out.append(word + signs)

    return ''.join(out)
def decode_huff(huff_sec, huff_LUT):
    """
    Decode a Huffman-encoded stream.

    Bits are walked through the inverse-table trie until a leaf is reached;
    the leaf's symbol index is expanded into an n-tuple of coefficients.
    For UNSIGNED codebooks one sign bit per coefficient follows each
    codeword ('1' = negative). For SIGNED codebooks the sign is already
    folded into the symbol, so NO sign bits are present in the stream —
    the previous implementation consumed them anyway, desynchronizing the
    decoder for signed codebooks (1, 2, 5, 6); see the matching encoder
    huff_LUT_code_1, which only appends sign bits when not signedValues.
    Codebook 11 additionally carries escape sequences ('1'*N + '0' followed
    by N+4 magnitude bits) for coefficients with magnitude >= 16.

    Parameters
    ----------
    huff_sec : str or array-like of int
        Huffman encoded stream as a sequence of 0 and 1.
    huff_LUT : dict
        Huffman lookup table with keys 'invTable', 'codebook', 'nTupleSize',
        'maxAbsCodeVal', 'signedValues'.

    Returns
    -------
    list
        Decoded quantized coefficients. May include trailing tuple-padding
        symbols; the caller truncates to the known section length.
    """
    h = huff_LUT['invTable']
    huffCodebook = huff_LUT['codebook']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    signedValues = huff_LUT['signedValues']

    # Convert string to array of ints
    if isinstance(huff_sec, str):
        huff_sec = np.array([int(b) for b in huff_sec])

    decCoeffs = []
    streamIndex = 0

    while streamIndex < len(huff_sec):
        # Walk the trie until a leaf node (both child links are 0).
        r = 0
        wordbit = 0
        while True:
            b = huff_sec[streamIndex + wordbit]
            wordbit += 1
            r = h[r, b]
            if h[r, 0] == 0 and h[r, 1] == 0:
                symbolIndex = h[r, 2] - 1  # stored 1-based
                streamIndex += wordbit
                break

        # Expand the symbol index into an n-tuple (mixed-radix digits).
        base = (2 * maxAbsCodeVal + 1) if signedValues else (maxAbsCodeVal + 1)
        tmp = symbolIndex
        nTupleDec = []
        for p in reversed(range(nTupleSize)):
            digit = tmp // (base ** p)
            tmp = tmp % (base ** p)
            # Signed codebooks store value + maxAbsCodeVal; undo the offset.
            nTupleDec.append(digit - maxAbsCodeVal if signedValues else digit)
        nTupleDec = np.array(nTupleDec)

        if signedValues:
            # No sign bits in the stream; signs are already in the values.
            nTupleSign = np.ones(nTupleSize)
        else:
            signBits = huff_sec[streamIndex:streamIndex + nTupleSize]
            nTupleSign = -(np.sign(signBits - 0.5))  # bit 1 -> -1, bit 0 -> +1
            streamIndex += nTupleSize
            nTupleDec = nTupleDec * nTupleSign

        # Handle escape sequences (codebook 11 only): magnitude 16 is a marker.
        escIndex = np.where(np.abs(nTupleDec) == 16)[0]
        if huffCodebook == 11 and escIndex.size > 0:
            for idx in escIndex:
                # Count the run of leading '1' bits.
                N = 0
                while huff_sec[streamIndex + N]:
                    N += 1
                # Skip the N leading '1' bits AND the terminating '0' delimiter.
                streamIndex += N + 1
                N4 = N + 4
                escape_word = huff_sec[streamIndex:streamIndex + N4]
                nTupleDec[idx] = 2 ** N4 + int("".join(map(str, escape_word)), 2)
                streamIndex += N4
            # Re-apply signs to the escaped magnitudes.
            nTupleDec[escIndex] *= nTupleSign[escIndex]

        decCoeffs.extend(nTupleDec.tolist())

    return decCoeffs
@@ -503,7 +484,7 @@ def aac_coder_3( "chl": { "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64), - "G": G_Ln, + "G": G_L, "sfc": sfc_L_stream, "stream": mdct_L_stream, "codebook": int(cb_L), @@ -511,7 +492,7 @@ def aac_coder_3( "chr": { "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64), "T": np.asarray(T_R, dtype=np.float64), - "G": G_Rn, + "G": G_R, "sfc": sfc_R_stream, "stream": mdct_R_stream, "codebook": int(cb_R), diff --git a/source/level_2/core/aac_decoder.py b/source/level_2/core/aac_decoder.py index be705df..378294e 100644 --- a/source/level_2/core/aac_decoder.py +++ b/source/level_2/core/aac_decoder.py @@ -42,7 +42,7 @@ def _nbands(frame_type: FrameType) -> int: # Public helpers # ----------------------------------------------------------------------------- -def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: +def aac_unpack_seq_channels(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: """ Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo FrameF container expected by aac_i_filter_bank(). 
@@ -167,7 +167,7 @@ def aac_decoder_1( chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) - frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f) + frame_f: FrameF = aac_unpack_seq_channels(frame_type, chl_f, chr_f) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) start = i * hop @@ -427,7 +427,7 @@ def aac_decoder_3( X_R = aac_i_tns(Xq_R, frame_type, tns_R) # Re-pack to stereo container and inverse filterbank - frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R)) + frame_f = aac_unpack_seq_channels(frame_type, np.asarray(X_L), np.asarray(X_R)) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) start = i * hop diff --git a/source/level_2/core/aac_huffman.py b/source/level_2/core/aac_huffman.py new file mode 100644 index 0000000..3d2eeca --- /dev/null +++ b/source/level_2/core/aac_huffman.py @@ -0,0 +1,112 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Huffman wrappers (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Thin wrappers around the provided Huffman utilities (material/huff_utils.py) +# so that the API matches the assignment text. +# +# Exposed API (assignment): +# huff_sec, huff_codebook = aac_encode_huff(coeff_sec, huff_LUT_list, force_codebook) +# dec_coeffs = aac_decode_huff(huff_sec, huff_codebook, huff_LUT_list) +# +# Notes: +# - Huffman coding operates on tuples. Therefore, decode(encode(x)) may return +# extra trailing symbols due to tuple padding. The AAC decoder knows the +# true section length from side information (band limits) and truncates. 
def aac_encode_huff(
    coeff_sec: np.ndarray,
    huff_LUT_list: list[dict[str, Any]],
    force_codebook: int | None = None,
) -> tuple[str, int]:
    """
    Huffman-encode one section of coefficients (MDCT symbols or scalefactors).

    Thin wrapper over the provided material utility so the API matches the
    assignment text: it always returns the (bitstream, codebook) pair,
    regardless of whether the codebook was forced or auto-selected.

    Parameters
    ----------
    coeff_sec : np.ndarray
        Section to encode; any shape, flattened to a 1-D int64 sequence.
    huff_LUT_list : list[dict[str, Any]]
        Huffman LUTs from material.load_LUT(); index = codebook id
        (1..11 valid, 0 reserved).
    force_codebook : int | None
        When given, that codebook is used unconditionally (the assignment
        encodes scalefactors with codebook 11); otherwise the utility
        auto-selects the cheapest applicable codebook.

    Returns
    -------
    tuple[str, int]
        (huff_sec, huff_codebook): the '0'/'1' bitstream and the codebook id
        actually used (0 marks an all-zero section).
    """
    symbols = np.asarray(coeff_sec, dtype=np.int64).reshape(-1)

    if force_codebook is not None:
        # With a forced codebook the underlying utility returns ONLY the
        # bitstream, so the codebook id is echoed back explicitly.
        codebook = int(force_codebook)
        bitstream = encode_huff(symbols, huff_LUT_list, force_codebook=codebook)
        return str(bitstream), codebook

    # Auto-selection path: the utility returns (bitstream, codebook).
    bitstream, codebook = encode_huff(symbols, huff_LUT_list)
    return str(bitstream), int(codebook)
Typically a string of '0'/'1'. If an array is provided, + it is passed through to the provided decoder. + huff_codebook : int + Codebook id that was returned by aac_encode_huff. + Codebook 0 represents an all-zero section. + huff_LUT : list[dict[str, Any]] + Huffman LUT list as returned by material.load_LUT(). + + Returns + ------- + np.ndarray + Decoded coefficients as a 1-D np.int64 array. + + Note: Due to tuple coding, the decoded array may contain extra trailing + padding symbols. The caller must truncate to the known section length. + """ + cb = int(huff_codebook) + + if cb == 0: + # Codebook 0 represents an all-zero section. The decoded length is not + # recoverable from the bitstream alone; the caller must expand/truncate. + return np.zeros((0,), dtype=np.int64) + + if cb < 0 or cb >= len(huff_LUT): + raise ValueError(f"Invalid Huffman codebook index: {cb}") + + lut = huff_LUT[cb] + dec = decode_huff(huff_sec, lut) + return np.asarray(dec, dtype=np.int64).reshape(-1) diff --git a/source/level_2/core/aac_psycho.py b/source/level_2/core/aac_psycho.py new file mode 100644 index 0000000..8069e45 --- /dev/null +++ b/source/level_2/core/aac_psycho.py @@ -0,0 +1,441 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Psychoacoustic Model +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Psychoacoustic model for ONE channel, based on the assignment notes (Section 2.4). 
+# +# Public API: +# SMR = aac_psycho(frame_T, frame_type, frame_T_prev_1, frame_T_prev_2) +# +# Output: +# - For long frames ("OLS", "LSS", "LPS"): SMR has shape (69,) +# - For short frames ("ESH"): SMR has shape (42, 8) (one column per subframe) +# +# Notes: +# - Uses Bark band tables from material/TableB219.mat: +# * B219a for long windows (69 bands, N=2048 FFT, N/2=1024 bins) +# * B219b for short windows (42 bands, N=256 FFT, N/2=128 bins) +# - Applies a Hann window in time domain before FFT magnitude/phase extraction. +# - Implements: +# spreading function -> band spreading -> tonality index -> masking thresholds -> SMR. +# ------------------------------------------------------------ +from __future__ import annotations + +import numpy as np + +from core.aac_utils import band_limits, get_table +from core.aac_configuration import NMT_DB, TMN_DB +from core.aac_types import * + +# ----------------------------------------------------------------------------- +# Spreading function +# ----------------------------------------------------------------------------- + +def _spreading_matrix(bval: BandValueArray) -> FloatArray: + """ + Compute the spreading function matrix between psychoacoustic bands. + + The spreading function describes how energy in one critical band masks + nearby bands. The formula follows the assignment pseudo-code. + + Parameters + ---------- + bval : BandValueArray + Bark value per band, shape (B,). + + Returns + ------- + FloatArray + Spreading matrix S of shape (B, B), where: + S[bb, b] quantifies the contribution of band bb masking band b. 
+ """ + bval = np.asarray(bval, dtype=np.float64).reshape(-1) + B = int(bval.shape[0]) + + spread = np.zeros((B, B), dtype=np.float64) + + for b in range(B): + for bb in range(B): + # tmpx depends on direction (asymmetric spreading) + if bb >= b: + tmpx = 3.0 * (bval[bb] - bval[b]) + else: + tmpx = 1.5 * (bval[bb] - bval[b]) + + # tmpz uses the "min(..., 0)" nonlinearity exactly as in the notes + tmpz = 8.0 * min((tmpx - 0.5) ** 2 - 2.0 * (tmpx - 0.5), 0.0) + tmpy = 15.811389 + 7.5 * (tmpx + 0.474) - 17.5 * np.sqrt(1.0 + (tmpx + 0.474) ** 2) + + # Clamp very small values (below -100 dB) to 0 contribution + if tmpy < -100.0: + spread[bb, b] = 0.0 + else: + spread[bb, b] = 10.0 ** ((tmpz + tmpy) / 10.0) + + return spread + + +# ----------------------------------------------------------------------------- +# Windowing + FFT feature extraction +# ----------------------------------------------------------------------------- + +def _hann_window(N: int) -> FloatArray: + """ + Hann window as specified in the notes: + w[n] = 0.5 - 0.5*cos(2*pi*(n + 0.5)/N) + + Parameters + ---------- + N : int + Window length. + + Returns + ------- + FloatArray + 1-D array of shape (N,), dtype float64. + """ + n = np.arange(N, dtype=np.float64) + return 0.5 - 0.5 * np.cos((2.0 * np.pi / N) * (n + 0.5)) + + +def _r_phi_from_time(x: FrameChannelT, N: int) -> tuple[FloatArray, FloatArray]: + """ + Compute FFT magnitude r(w) and phase phi(w) for bins w = 0 .. N/2-1. + + Processing: + 1) Apply Hann window in time domain. + 2) Compute N-point FFT. + 3) Keep only the positive-frequency bins [0 .. N/2-1]. + + Parameters + ---------- + x : FrameChannelT + Time-domain samples, shape (N,). + N : int + FFT size (2048 or 256). + + Returns + ------- + r : FloatArray + Magnitude spectrum for bins 0 .. N/2-1, shape (N/2,). + phi : FloatArray + Phase spectrum for bins 0 .. N/2-1, shape (N/2,). 
+ """ + x = np.asarray(x, dtype=np.float64).reshape(-1) + if x.shape[0] != N: + raise ValueError(f"Expected time vector of length {N}, got {x.shape[0]}.") + + w = _hann_window(N) + X = np.fft.fft(x * w, n=N) + + Xp = X[: N // 2] + r = np.abs(Xp).astype(np.float64, copy=False) + phi = np.angle(Xp).astype(np.float64, copy=False) + return r, phi + + +def _predictability( + r: FloatArray, + phi: FloatArray, + r_m1: FloatArray, + phi_m1: FloatArray, + r_m2: FloatArray, + phi_m2: FloatArray, +) -> FloatArray: + """ + Compute predictability c(w) per spectral bin. + + The notes define: + r_pred(w) = 2*r_{-1}(w) - r_{-2}(w) + phi_pred(w) = 2*phi_{-1}(w) - phi_{-2}(w) + + c(w) = |X(w) - X_pred(w)| / (r(w) + |r_pred(w)|) + + where X(w) is represented in polar form using r(w), phi(w). + + Parameters + ---------- + r, phi : FloatArray + Current magnitude and phase, shape (N/2,). + r_m1, phi_m1 : FloatArray + Previous magnitude and phase, shape (N/2,). + r_m2, phi_m2 : FloatArray + Pre-previous magnitude and phase, shape (N/2,). + + Returns + ------- + FloatArray + Predictability c(w), shape (N/2,). + """ + r_pred = 2.0 * r_m1 - r_m2 + phi_pred = 2.0 * phi_m1 - phi_m2 + + num = np.sqrt( + (r * np.cos(phi) - r_pred * np.cos(phi_pred)) ** 2 + + (r * np.sin(phi) - r_pred * np.sin(phi_pred)) ** 2 + ) + den = r + np.abs(r_pred) + 1e-12 # avoid division-by-zero without altering behavior + return (num / den).astype(np.float64, copy=False) + + +# ----------------------------------------------------------------------------- +# Band-domain aggregation +# ----------------------------------------------------------------------------- + +def _band_energy_and_pred( + r: FloatArray, + c: FloatArray, + wlow: BandIndexArray, + whigh: BandIndexArray, +) -> tuple[FloatArray, FloatArray]: + """ + Aggregate spectral bin quantities into psychoacoustic bands. 
+ + Definitions (notes): + e(b) = sum_{w=wlow(b)..whigh(b)} r(w)^2 + c_num(b) = sum_{w=wlow(b)..whigh(b)} c(w) * r(w)^2 + + The band predictability c(b) is later computed after spreading as: + cb(b) = ct(b) / ecb(b) + + Parameters + ---------- + r : FloatArray + Magnitude spectrum, shape (N/2,). + c : FloatArray + Predictability per bin, shape (N/2,). + wlow, whigh : BandIndexArray + Band limits (inclusive indices), shape (B,). + + Returns + ------- + e_b : FloatArray + Band energies e(b), shape (B,). + c_num_b : FloatArray + Weighted predictability numerators c_num(b), shape (B,). + """ + r2 = (r * r).astype(np.float64, copy=False) + + B = int(wlow.shape[0]) + e_b = np.zeros(B, dtype=np.float64) + c_num_b = np.zeros(B, dtype=np.float64) + + for b in range(B): + a = int(wlow[b]) + z = int(whigh[b]) + + seg_r2 = r2[a : z + 1] + e_b[b] = float(np.sum(seg_r2)) + c_num_b[b] = float(np.sum(c[a : z + 1] * seg_r2)) + + return e_b, c_num_b + + +def _psycho_window( + time_x: FrameChannelT, + prev1_x: FrameChannelT, + prev2_x: FrameChannelT, + *, + N: int, + table: BarkTable, +) -> FloatArray: + """ + Compute SMR for one FFT analysis window (N=2048 for long, N=256 for short). + + This implements the pipeline described in the notes: + - FFT magnitude/phase + - predictability per bin + - band energies and predictability + - band spreading + - tonality index tb(b) + - masking threshold (noise + threshold in quiet) + - SMR(b) = e(b) / np(b) + + Parameters + ---------- + time_x : FrameChannelT + Current time-domain samples, shape (N,). + prev1_x : FrameChannelT + Previous time-domain samples, shape (N,). + prev2_x : FrameChannelT + Pre-previous time-domain samples, shape (N,). + N : int + FFT size. + table : BarkTable + Psychoacoustic band table (B219a or B219b). + + Returns + ------- + FloatArray + SMR per band, shape (B,). 
+ """ + wlow, whigh, bval, qthr_db = band_limits(table) + spread = _spreading_matrix(bval) + + # FFT features for current and history windows + r, phi = _r_phi_from_time(time_x, N) + r_m1, phi_m1 = _r_phi_from_time(prev1_x, N) + r_m2, phi_m2 = _r_phi_from_time(prev2_x, N) + + # Predictability per bin + c_w = _predictability(r, phi, r_m1, phi_m1, r_m2, phi_m2) + + # Aggregate into psycho bands + e_b, c_num_b = _band_energy_and_pred(r, c_w, wlow, whigh) + + # Spread energies and predictability across bands: + # ecb(b) = sum_bb e(bb) * S(bb, b) + # ct(b) = sum_bb c_num(bb) * S(bb, b) + ecb = spread.T @ e_b + ct = spread.T @ c_num_b + + # Band predictability after spreading: cb(b) = ct(b) / ecb(b) + cb = ct / (ecb + 1e-12) + + # Normalized energy term: + # en(b) = ecb(b) / sum_bb S(bb, b) + spread_colsum = np.sum(spread, axis=0) + en = ecb / (spread_colsum + 1e-12) + + # Tonality index (clamped to [0, 1]) + tb = -0.299 - 0.43 * np.log(np.maximum(cb, 1e-12)) + tb = np.clip(tb, 0.0, 1.0) + + # Required SNR per band (dB): interpolate between TMN and NMT + snr_b = tb * TMN_DB + (1.0 - tb) * NMT_DB + bc = 10.0 ** (-snr_b / 10.0) + + # Noise masking threshold estimate (power domain) + nb = en * bc + + # Threshold in quiet (convert from dB to power domain): + # qthr_power = eps * (N/2) * 10^(qthr_db/10) + qthr_power = np.finfo('float').eps * (N / 2.0) * (10.0 ** (qthr_db / 10.0)) + + # Final masking threshold per band: + # np(b) = max(nb(b), qthr(b)) + npart = np.maximum(nb, qthr_power) + + # Signal-to-mask ratio: + # SMR(b) = e(b) / np(b) + smr = e_b / (npart + 1e-12) + return smr.astype(np.float64, copy=False) + + +# ----------------------------------------------------------------------------- +# ESH window slicing (match filterbank conventions) +# ----------------------------------------------------------------------------- + +def _esh_subframes(x_2048: FrameChannelT) -> list[FrameChannelT]: + """ + Extract the 8 overlapping 256-sample short windows used by AAC ESH. 
+ + The project convention (matching the filterbank) is: + start_j = 448 + 128*j, for j = 0..7 + subframe_j = x[start_j : start_j + 256] + + This selects the central 1152-sample region [448, 1600) and produces + 8 windows with 50% overlap. + + Parameters + ---------- + x_2048 : FrameChannelT + Time-domain channel frame, shape (2048,). + + Returns + ------- + list[FrameChannelT] + List of 8 subframes, each of shape (256,). + """ + x_2048 = np.asarray(x_2048, dtype=np.float64).reshape(-1) + if x_2048.shape[0] != 2048: + raise ValueError("ESH requires 2048-sample input frames.") + + subs: list[FrameChannelT] = [] + for j in range(8): + start = 448 + 128 * j + subs.append(x_2048[start : start + 256]) + return subs + + +# ----------------------------------------------------------------------------- +# Public API +# ----------------------------------------------------------------------------- + +def aac_psycho( + frame_T: FrameChannelT, + frame_type: FrameType, + frame_T_prev_1: FrameChannelT, + frame_T_prev_2: FrameChannelT, +) -> FloatArray: + """ + Psychoacoustic model for ONE channel. + + Parameters + ---------- + frame_T : FrameChannelT + Current time-domain channel frame, shape (2048,). + For "ESH", the 8 short windows are derived internally. + frame_type : FrameType + AAC frame type ("OLS", "LSS", "ESH", "LPS"). + frame_T_prev_1 : FrameChannelT + Previous time-domain channel frame, shape (2048,). + frame_T_prev_2 : FrameChannelT + Pre-previous time-domain channel frame, shape (2048,). + + Returns + ------- + FloatArray + Signal-to-Mask Ratio (SMR), per psychoacoustic band. 
+ - If frame_type == "ESH": shape (42, 8) + - Else: shape (69,) + """ + frame_T = np.asarray(frame_T, dtype=np.float64).reshape(-1) + frame_T_prev_1 = np.asarray(frame_T_prev_1, dtype=np.float64).reshape(-1) + frame_T_prev_2 = np.asarray(frame_T_prev_2, dtype=np.float64).reshape(-1) + + if frame_T.shape[0] != 2048 or frame_T_prev_1.shape[0] != 2048 or frame_T_prev_2.shape[0] != 2048: + raise ValueError("aac_psycho expects 2048-sample frames for current/prev1/prev2.") + + table, N = get_table(frame_type) + + # Long frame types: compute one SMR vector (69 bands) + if frame_type != "ESH": + return _psycho_window(frame_T, frame_T_prev_1, frame_T_prev_2, N=N, table=table) + + # ESH: compute 8 SMR vectors (42 bands each), one per short subframe. + # + # The notes use short-window history for predictability: + # - For j=0: use previous frame's subframes (7, 6) + # - For j=1: use current subframe 0 and previous frame's subframe 7 + # - For j>=2: use current subframes (j-1, j-2) + # + # This matches the "within-frame history" convention commonly used in + # simplified psycho models for ESH. 
+ cur_subs = _esh_subframes(frame_T) + prev1_subs = _esh_subframes(frame_T_prev_1) + + B = int(table.shape[0]) # expected 42 + smr_out = np.zeros((B, 8), dtype=np.float64) + + for j in range(8): + if j == 0: + x_m1 = prev1_subs[7] + x_m2 = prev1_subs[6] + elif j == 1: + x_m1 = cur_subs[0] + x_m2 = prev1_subs[7] + else: + x_m1 = cur_subs[j - 1] + x_m2 = cur_subs[j - 2] + + smr_out[:, j] = _psycho_window(cur_subs[j], x_m1, x_m2, N=256, table=table) + + return smr_out diff --git a/source/level_2/core/aac_quantizer.py b/source/level_2/core/aac_quantizer.py new file mode 100644 index 0000000..f2004b0 --- /dev/null +++ b/source/level_2/core/aac_quantizer.py @@ -0,0 +1,600 @@ +# ------------------------------------------------------------ +# AAC Coder/Decoder - Quantizer / iQuantizer (Level 3) +# +# Multimedia course at Aristotle University of +# Thessaloniki (AUTh) +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# Implements AAC quantizer and inverse quantizer for one channel. +# Based on assignment section 2.6 (Eq. 12-15). +# +# Notes: +# - Bit reservoir is not implemented (assignment simplification). +# - Scalefactor bands are assumed equal to psychoacoustic bands +# (Table B.2.1.9a / B.2.1.9b from TableB219.mat). 
def _esh_pack(x_128x8: FloatArray) -> FloatArray:
    """
    Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1).

    The eight short-subframe columns are concatenated in subframe order
    (0..7), i.e. column-major (Fortran) order.

    Parameters
    ----------
    x_128x8 : FloatArray
        ESH coefficients, shape (128, 8).

    Returns
    -------
    FloatArray
        Packed coefficients, shape (1024, 1).
    """
    arr = np.asarray(x_128x8, dtype=np.float64)
    if arr.shape != (128, 8):
        raise ValueError("ESH pack expects shape (128, 8).")
    # Column-major read-out: subframe 0 first, then subframe 1, ...
    return arr.ravel(order="F").reshape(1024, 1)


def _esh_unpack(x_1024x1: FloatArray) -> FloatArray:
    """
    Unpack a packed ESH vector (1024 elements) back to shape (128, 8).

    Inverse of `_esh_pack`: consecutive runs of 128 samples become the
    subframe columns 0..7.

    Parameters
    ----------
    x_1024x1 : FloatArray
        Packed ESH vector; any shape with exactly 1024 elements.

    Returns
    -------
    FloatArray
        Unpacked ESH coefficients, shape (128, 8).
    """
    flat = np.asarray(x_1024x1, dtype=np.float64).reshape(-1)
    if flat.shape[0] != 1024:
        raise ValueError("ESH unpack expects 1024 elements.")
    # (8, 128) C-order rows transposed == (128, 8) Fortran-order reshape.
    return flat.reshape(8, 128).T
# -----------------------------------------------------------------------------
# Core quantizer formulas (Eq. 12, Eq. 13)
# -----------------------------------------------------------------------------
def _quantize_symbol(x: FloatArray, alpha: float) -> QuantizedSymbols:
    """
    Quantize MDCT coefficients to integer symbols S(k).

    Implements Eq. (12):
        S(k) = sgn(X(k)) * int( (|X(k)| * 2^(-alpha/4))^(3/4) + MAGIC_NUMBER )

    Parameters
    ----------
    x : FloatArray
        MDCT coefficients for a contiguous set of spectral lines.
        Shape: (N,)
    alpha : float
        Scalefactor gain for the corresponding scalefactor band.

    Returns
    -------
    QuantizedSymbols
        Quantized symbols S(k) as int64, shape (N,).
    """
    x = np.asarray(x, dtype=np.float64)

    scale = 2.0 ** (-0.25 * float(alpha))
    ax = np.abs(x) * scale

    # Compression exponent 3/4 applied to the scaled magnitudes.
    y = np.power(ax, 0.75, dtype=np.float64)

    # "int" in the assignment corresponds to truncation.
    q = np.floor(y + MAGIC_NUMBER).astype(np.int64)
    return (np.sign(x).astype(np.int64) * q).astype(np.int64)


def _dequantize_symbol(S: QuantizedSymbols, alpha: float) -> FloatArray:
    """
    Inverse quantizer (dequantization of symbols).

    Implements Eq. (13):
        Xhat(k) = sgn(S(k)) * |S(k)|^(4/3) * 2^(alpha/4)

    Parameters
    ----------
    S : QuantizedSymbols
        Quantized symbols S(k), int64, shape (N,).
    alpha : float
        Scalefactor gain for the corresponding scalefactor band.

    Returns
    -------
    FloatArray
        Reconstructed MDCT coefficients Xhat(k), float64, shape (N,).
    """
    S = np.asarray(S, dtype=np.int64)

    scale = 2.0 ** (0.25 * float(alpha))
    aS = np.abs(S).astype(np.float64)
    # Expansion exponent 4/3 is the inverse of the 3/4 used in Eq. (12).
    y = np.power(aS, 4.0 / 3.0, dtype=np.float64)

    return (np.sign(S).astype(np.float64) * y * scale).astype(np.float64)


# -----------------------------------------------------------------------------
# Alpha initialization (Eq. 14)
# -----------------------------------------------------------------------------
def _initial_alpha_hat(X: "FloatArray", MQ: int = 8191) -> int:
    """
    Compute the initial scalefactor estimate alpha_hat for a frame.

    The assignment proposes the following first approximation (Equation 14):

        alpha_hat = (16/3) * log2( max_k(|X(k)|)^(3/4) / MQ )

    where max_k runs over all MDCT coefficients of the frame (not per band),
    and MQ is the maximum quantization level parameter (2*MQ + 1 levels).

    Parameters
    ----------
    X : FloatArray
        MDCT coefficients of one frame (or one ESH subframe), shape (N,).
    MQ : int
        Quantizer parameter (default 8191, as per assignment).

    Returns
    -------
    int
        Integer alpha_hat (rounded to nearest integer).
    """
    x_max = float(np.max(np.abs(X)))
    # All-zero (silent) input would make log2 diverge; fall back to 0.
    if x_max <= 0.0:
        return 0

    alpha_hat = (16.0 / 3.0) * np.log2((x_max ** (3.0 / 4.0)) / float(MQ))
    return int(np.round(alpha_hat))


# -----------------------------------------------------------------------------
# Band utilities
# -----------------------------------------------------------------------------
def _band_slices(frame_type: FrameType) -> list[tuple[int, int]]:
    """
    Return scalefactor band ranges [wlow, whigh] (inclusive) for the given frame type.

    These are derived from the psychoacoustic tables (TableB219),
    and map directly to MDCT indices:
        - long:  0..1023
        - short (ESH subframe): 0..127

    Parameters
    ----------
    frame_type : FrameType
        Frame type ("OLS", "LSS", "ESH", "LPS").

    Returns
    -------
    list[tuple[int, int]]
        List of (lo, hi) inclusive index pairs for each band.
    """
    table, _Nfft = get_table(frame_type)
    wlow, whigh, _bval, _qthr_db = band_limits(table)

    bands: list[tuple[int, int]] = []
    for lo, hi in zip(wlow, whigh):
        bands.append((int(lo), int(hi)))
    return bands


def _band_energy(x: FloatArray, lo: int, hi: int) -> float:
    """
    Compute energy of a spectral segment x[lo:hi+1].

    Parameters
    ----------
    x : FloatArray
        MDCT coefficient vector.
    lo, hi : int
        Inclusive index range.

    Returns
    -------
    float
        Sum of squares (energy) within the band.
    """
    sec = x[lo : hi + 1]
    return float(np.sum(sec * sec))


def _psychoacoustic_threshold(
    X: FloatArray,
    SMR_col: FloatArray,
    bands: list[tuple[int, int]],
) -> FloatArray:
    """
    Compute psychoacoustic thresholds T(b) per band.

    Uses:
        P(b) = sum_{k in band} X(k)^2
        T(b) = P(b) / SMR(b)

    Parameters
    ----------
    X : FloatArray
        MDCT coefficients for a frame (long) or one ESH subframe (short).
    SMR_col : FloatArray
        SMR values for this frame/subframe, shape (NB,).
    bands : list[tuple[int, int]]
        Band index ranges.

    Returns
    -------
    FloatArray
        Threshold vector T(b), shape (NB,).
    """
    nb = len(bands)
    T = np.zeros((nb,), dtype=np.float64)

    for b, (lo, hi) in enumerate(bands):
        P = _band_energy(X, lo, hi)
        smr = float(SMR_col[b])
        # Guard against zero/negative SMR: a non-positive ratio would make
        # T meaningless, so the band threshold is forced to 0 (no headroom).
        if smr <= EPS:
            T[b] = 0.0
        else:
            T[b] = P / smr

    return T


# -----------------------------------------------------------------------------
# Alpha selection per band + neighbor-difference constraint
# -----------------------------------------------------------------------------
def _best_alpha_for_band(
    X: "FloatArray", lo: int, hi: int, T_b: float,
    alpha_hat: int, alpha_prev: int, alpha_min: int, alpha_max: int,
) -> int:
    """
    Determine the band-wise scalefactor alpha(b) following the assignment.

    Procedure:
      - Start from a frame-wise initial estimate alpha_hat.
      - Iteratively increase alpha(b) by 1 as long as the quantization error power
        stays below the psychoacoustic threshold T(b):
            P_e(b) = sum_{k in band} ( X(k) - Xhat(k) )^2
      - Stop increasing alpha(b) if the neighbor constraint would be violated:
            |alpha(b) - alpha(b-1)| <= 60
        When processing bands sequentially (low -> high), this becomes:
            alpha(b) <= alpha_prev + 60

    Notes:
      - This function does not decrease alpha if the initial value already violates
        the threshold; the assignment only specifies iterative increase.

    Parameters
    ----------
    X : FloatArray
        Full MDCT vector of the current (sub)frame, shape (N,).
    lo, hi : int
        Band index bounds (inclusive), defining the band slice.
    T_b : float
        Threshold T(b) for this band.
    alpha_hat : int
        Initial frame-wise estimate (Equation 14).
    alpha_prev : int
        Previously selected alpha for band b-1 (neighbor constraint reference).
    alpha_min, alpha_max : int
        Safeguard bounds for alpha.

    Returns
    -------
    int
        Selected integer alpha(b).
    """
    if T_b <= 0.0:
        return int(alpha_hat)
    Xsec = X[lo : hi + 1]

    # Neighbor constraint (sequential processing): alpha(b) <= alpha_prev + 60
    # NOTE(review): only the upper side of |alpha(b) - alpha(b-1)| <= 60 is
    # enforced here; if alpha_hat is far below alpha_prev - 60 the lower side
    # could still be violated — confirm against the assignment's intent.
    alpha_limit = min(int(alpha_max), int(alpha_prev) + MAX_SF_DELTA)

    # Start from alpha_hat, clamped to feasible range
    alpha = int(alpha_hat)
    alpha = max(int(alpha_min), min(alpha, int(alpha_limit)))

    # Evaluate at current alpha
    Ssec = _quantize_symbol(Xsec, alpha)
    Xhat = _dequantize_symbol(Ssec, alpha)
    Pe = float(np.sum((Xsec - Xhat) ** 2))

    # If already above threshold, return current alpha (no decrease step specified)
    if Pe > T_b:
        return alpha

    # Increase alpha while still under threshold and within constraints
    while True:
        alpha_next = alpha + 1
        if alpha_next > alpha_limit:
            break
        Ssec = _quantize_symbol(Xsec, alpha_next)
        Xhat = _dequantize_symbol(Ssec, alpha_next)
        Pe_next = float(np.sum((Xsec - Xhat) ** 2))

        if Pe_next > T_b:
            break
        alpha = alpha_next

    return alpha


# -----------------------------------------------------------------------------
# Public API
# -----------------------------------------------------------------------------
def aac_quantizer(
    frame_F: FrameChannelF,
    frame_type: FrameType,
    SMR: FloatArray,
) -> tuple[QuantizedSymbols, ScaleFactors, GlobalGain]:
    """
    AAC quantizer for one channel (Level 3).

    Quantizes MDCT coefficients (after TNS) using band-wise scalefactors derived
    from psychoacoustic thresholds computed via SMR.

    The implementation follows the assignment procedure:
      - Compute an initial frame-wise alpha_hat using Equation (14), based on the
        maximum MDCT coefficient magnitude of the (sub)frame.
      - For each band b, increase alpha(b) by 1 while the quantization error power
        P_e(b) stays below the threshold T(b).
      - Enforce the neighbor constraint |alpha(b) - alpha(b-1)| <= 60 during the
        band-by-band search (no post-processing needed).

    Parameters
    ----------
    frame_F : FrameChannelF
        MDCT coefficients after TNS, one channel.
        Shapes:
          - Long frames: (1024,) or (1024, 1)
          - ESH: (128, 8)
    frame_type : FrameType
        AAC frame type ("OLS", "LSS", "ESH", "LPS").
    SMR : FloatArray
        Signal-to-Mask Ratio per band.
        Shapes:
          - Long: (NB,) or (NB, 1)
          - ESH: (NB, 8)

    Returns
    -------
    S : QuantizedSymbols
        Quantized symbols S(k), packed as shape (1024, 1) for all frame types.
        For ESH, the 8 subframes are packed in column-major subframe layout.
    sfc : ScaleFactors
        DPCM-coded scalefactors:
            sfc(0) = alpha(0) = G
            sfc(b) = alpha(b) - alpha(b-1), for b > 0
        Shapes:
          - Long: (NB, 1)
          - ESH: (NB, 8)
    G : GlobalGain
        Global gain G = alpha(0).
          - Long: scalar float
          - ESH: array shape (1, 8), dtype float64
    """
    bands = _band_slices(frame_type)
    NB = len(bands)

    X = np.asarray(frame_F, dtype=np.float64)
    SMR = np.asarray(SMR, dtype=np.float64)

    # -------------------------------------------------------------------------
    # ESH: 8 short subframes, each of length 128
    # -------------------------------------------------------------------------
    if frame_type == "ESH":
        if X.shape != (128, 8):
            raise ValueError("For ESH, frame_F must have shape (128, 8).")
        if SMR.shape != (NB, 8):
            raise ValueError(f"For ESH, SMR must have shape ({NB}, 8).")

        S_out: QuantizedSymbols = np.zeros((1024, 1), dtype=np.int64)
        sfc: ScaleFactors = np.zeros((NB, 8), dtype=np.int64)
        G_arr = np.zeros((1, 8), dtype=np.float64)

        # Packed output view: (128, 8) with column-major layout.
        # NOTE: this relies on reshape returning a *view* of the contiguous
        # S_out buffer so that writes to S_pack land in S_out (true for a
        # contiguous 1-D slice); layout matches _esh_unpack (order="F").
        S_pack = S_out[:, 0].reshape(128, 8, order="F")

        for j in range(8):
            Xj = X[:, j].reshape(128)
            SMRj = SMR[:, j].reshape(NB)

            # Compute psychoacoustic threshold T(b) for this subframe
            T = _psychoacoustic_threshold(Xj, SMRj, bands)

            # Frame-wise initial estimate alpha_hat (Equation 14)
            alpha_hat = _initial_alpha_hat(Xj)

            # Band-wise scalefactors alpha(b)
            alpha = np.zeros((NB,), dtype=np.int64)
            alpha_prev = int(alpha_hat)

            for b, (lo, hi) in enumerate(bands):
                alpha_b = _best_alpha_for_band(
                    X=Xj,
                    lo=lo,
                    hi=hi,
                    T_b=float(T[b]),
                    alpha_hat=int(alpha_hat),
                    alpha_prev=int(alpha_prev),
                    alpha_min=-4096,
                    alpha_max=4096,
                )
                alpha[b] = int(alpha_b)
                alpha_prev = int(alpha_b)

            # DPCM-coded scalefactors
            G_arr[0, j] = float(alpha[0])
            sfc[0, j] = int(alpha[0])
            for b in range(1, NB):
                sfc[b, j] = int(alpha[b] - alpha[b - 1])

            # Quantize MDCT coefficients band-by-band
            Sj = np.zeros((128,), dtype=np.int64)
            for b, (lo, hi) in enumerate(bands):
                Sj[lo : hi + 1] = _quantize_symbol(Xj[lo : hi + 1], float(alpha[b]))

            # Store subframe in packed output
            S_pack[:, j] = Sj

        return S_out, sfc, G_arr

    # -------------------------------------------------------------------------
    # Long frames: OLS / LSS / LPS, length 1024
    # -------------------------------------------------------------------------
    if X.shape == (1024,):
        Xv = X
    elif X.shape == (1024, 1):
        Xv = X[:, 0]
    else:
        raise ValueError("For non-ESH, frame_F must have shape (1024,) or (1024, 1).")

    if SMR.shape == (NB,):
        SMRv = SMR
    elif SMR.shape == (NB, 1):
        SMRv = SMR[:, 0]
    else:
        raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).")

    # Compute psychoacoustic threshold T(b) for the long frame
    T = _psychoacoustic_threshold(Xv, SMRv, bands)

    # Frame-wise initial estimate alpha_hat (Equation 14)
    alpha_hat = _initial_alpha_hat(Xv)

    # Band-wise scalefactors alpha(b)
    alpha = np.zeros((NB,), dtype=np.int64)
    alpha_prev = int(alpha_hat)

    for b, (lo, hi) in enumerate(bands):
        alpha_b = _best_alpha_for_band(
            X=Xv,
            lo=lo,
            hi=hi,
            T_b=float(T[b]),
            alpha_hat=int(alpha_hat),
            alpha_prev=int(alpha_prev),
            alpha_min=-4096,
            alpha_max=4096,
        )
        alpha[b] = int(alpha_b)
        alpha_prev = int(alpha_b)

    # DPCM-coded scalefactors
    sfc_out: ScaleFactors = np.zeros((NB, 1), dtype=np.int64)
    sfc_out[0, 0] = int(alpha[0])
    for b in range(1, NB):
        sfc_out[b, 0] = int(alpha[b] - alpha[b - 1])

    G: float = float(alpha[0])

    # Quantize MDCT coefficients band-by-band
    S_vec = np.zeros((1024,), dtype=np.int64)
    for b, (lo, hi) in enumerate(bands):
        S_vec[lo : hi + 1] = _quantize_symbol(Xv[lo : hi + 1], float(alpha[b]))

    return S_vec.reshape(1024, 1), sfc_out, G


def aac_i_quantizer(
    S: QuantizedSymbols,
    sfc: ScaleFactors,
    G: GlobalGain,
    frame_type: FrameType,
) -> FrameChannelF:
    """
    Inverse quantizer (iQuantizer) for one channel.

    Reconstructs MDCT coefficients from quantized symbols and DPCM scalefactors.

    Parameters
    ----------
    S : QuantizedSymbols
        Quantized symbols, shape (1024, 1) (or any array with 1024 elements).
    sfc : ScaleFactors
        DPCM-coded scalefactors.
        Shapes:
          - Long: (NB, 1)
          - ESH: (NB, 8)
    G : GlobalGain
        Global gain (not strictly required if sfc includes sfc(0)=alpha(0)).
        Present for API compatibility with the assignment.
    frame_type : FrameType
        AAC frame type.

    Returns
    -------
    FrameChannelF
        Reconstructed MDCT coefficients:
          - ESH: (128, 8)
          - Long: (1024, 1)
    """
    bands = _band_slices(frame_type)
    NB = len(bands)

    S_flat = np.asarray(S, dtype=np.int64).reshape(-1)
    if S_flat.shape[0] != 1024:
        raise ValueError("S must contain 1024 symbols.")

    if frame_type == "ESH":
        sfc = np.asarray(sfc, dtype=np.int64)
        if sfc.shape != (NB, 8):
            raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).")

        S_128x8 = _esh_unpack(S_flat)

        Xrec = np.zeros((128, 8), dtype=np.float64)

        for j in range(8):
            # Undo the DPCM coding: cumulative sum starting at sfc(0)=alpha(0).
            alpha = np.zeros((NB,), dtype=np.int64)
            alpha[0] = int(sfc[0, j])
            for b in range(1, NB):
                alpha[b] = int(alpha[b - 1] + sfc[b, j])

            Xj = np.zeros((128,), dtype=np.float64)
            for b, (lo, hi) in enumerate(bands):
                Xj[lo : hi + 1] = _dequantize_symbol(S_128x8[lo : hi + 1, j].astype(np.int64), float(alpha[b]))

            Xrec[:, j] = Xj

        return Xrec

    sfc = np.asarray(sfc, dtype=np.int64)
    if sfc.shape != (NB, 1):
        raise ValueError(f"For non-ESH, sfc must have shape ({NB}, 1).")

    # Undo the DPCM coding for the single long window.
    alpha = np.zeros((NB,), dtype=np.int64)
    alpha[0] = int(sfc[0, 0])
    for b in range(1, NB):
        alpha[b] = int(alpha[b - 1] + sfc[b, 0])

    Xrec = np.zeros((1024,), dtype=np.float64)
    for b, (lo, hi) in enumerate(bands):
        Xrec[lo : hi + 1] = _dequantize_symbol(S_flat[lo : hi + 1], float(alpha[b]))

    return Xrec.reshape(1024, 1)
from core.aac_types import * # ----------------------------------------------------------------------------- -def _band_ranges_for_kcount(k_count: int) -> BandRanges: +def _band_ranges(k_count: int) -> BandRanges: """ Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count. @@ -66,7 +66,7 @@ def _band_ranges_for_kcount(k_count: int) -> BandRanges: start = tbl[:, 1].astype(int) end = tbl[:, 2].astype(int) - ranges: list[tuple[int, int]] = [(int(s), int(e)) for s, e in zip(start, end)] + ranges: BandRanges = [(int(s), int(e)) for s, e in zip(start, end)] for s, e in ranges: if s < 0 or e < s or e >= k_count: @@ -117,7 +117,7 @@ def _compute_sw(x: MdctCoeffs) -> MdctCoeffs: x = np.asarray(x, dtype=np.float64).reshape(-1) k_count = int(x.shape[0]) - bands = _band_ranges_for_kcount(k_count) + bands = _band_ranges(k_count) sw = np.zeros(k_count, dtype=np.float64) for s, e in bands: @@ -347,7 +347,7 @@ def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: return x_hat -def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: +def _tns_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: """ TNS for a single MDCT vector (one long frame or one short subframe). 
def estimate_lag_mono(x_ref: TimeSignal, x_hat: TimeSignal, max_lag=4096):
    """
    Estimate the time lag between two mono signals via cross-correlation.

    Parameters
    ----------
    x_ref, x_hat : TimeSignal
        1-D signals; only the common prefix of both is used.
    max_lag : int
        Search window (in samples) around zero lag.

    Returns
    -------
    int
        Lag in samples; positive means x_hat is delayed relative to x_ref.
    """
    n = min(len(x_ref), len(x_hat))
    x_ref = x_ref[:n]
    x_hat = x_hat[:n]

    # np.correlate(a, v, 'full')[k] = sum_m a[m + lag] * v[m]; it peaks at a
    # *negative* lag when the first argument is delayed.  Putting x_hat first
    # therefore yields a positive peak position when x_hat is delayed, which
    # matches the documented sign convention.  (The previous argument order,
    # correlate(x_ref, x_hat), returned the negated lag.)
    corr = np.correlate(x_hat, x_ref, mode='full')
    lags = np.arange(-n + 1, n)

    center = n - 1
    lo = max(0, center - max_lag)
    hi = min(len(corr), center + max_lag + 1)

    best = lo + int(np.argmax(corr[lo:hi]))
    return int(lags[best])


def match_gain(x_ref: StereoSignal, x_hat: StereoSignal) -> float:
    """
    Least-squares gain g that best maps x_hat -> x_ref.

    Minimizes || x_ref - g * x_hat ||^2 over the overlapping samples/channels,
    giving g = <x_ref, x_hat> / <x_hat, x_hat>.

    Returns
    -------
    float
        Optimal gain; 1.0 if x_hat has zero energy (gain undefined).
    """
    n = min(x_ref.shape[0], x_hat.shape[0])
    c = min(x_ref.shape[1], x_hat.shape[1])

    r = x_ref[:n, :c].reshape(-1).astype(np.float64)
    h = x_hat[:n, :c].reshape(-1).astype(np.float64)

    denom = float(np.dot(h, h))
    if denom <= 0.0:
        return 1.0
    return float(np.dot(r, h) / denom)


# =============================================================================
# Huffman utilities (material/huff_utils.py)
# =============================================================================
import numpy as np
import scipy.io as sio
import os

# ------------------ LOAD LUT ------------------

def load_LUT(mat_filename=None):
    """
    Loads the list of Huffman Codebooks (LUTs) from huffCodebooks.mat.

    Parameters
    ----------
    mat_filename : str or None
        Path to the .mat file; defaults to huffCodebooks.mat next to this module.

    Returns
    -------
    huffLUT : list
        Index 1..11 hold codebook dicts with keys 'LUT', 'invTable',
        'codebook', 'nTupleSize', 'maxAbsCodeVal', 'signedValues';
        index 0 is unused.
    """
    if mat_filename is None:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        mat_filename = os.path.join(current_dir, "huffCodebooks.mat")

    mat = sio.loadmat(mat_filename)

    huffCodebooks_raw = mat['huffCodebooks'].squeeze()

    huffCodebooks = []
    for i in range(11):
        huffCodebooks.append(np.array(huffCodebooks_raw[i]))

    # Build inverse VLC (decoding) tables from codeword value + length columns.
    invTable = [None] * 11

    for i in range(11):
        h = huffCodebooks[i][:, 2].astype(int)        # column 3: codeword value
        hlength = huffCodebooks[i][:, 1].astype(int)  # column 2: codeword length

        hbin = []
        for j in range(len(h)):
            hbin.append(format(h[j], f'0{hlength[j]}b'))

        invTable[i] = vlc_table(hbin)

    # Build Huffman LUT dicts.  Per-codebook (nTupleSize, maxAbsCodeVal,
    # signedValues) parameters follow the AAC spectral codebook definitions.
    huffLUT = [None] * 12  # index 0 unused
    params = [
        (4, 1, True),
        (4, 1, True),
        (4, 2, False),
        (4, 2, False),
        (2, 4, True),
        (2, 4, True),
        (2, 7, False),
        (2, 7, False),
        (2, 12, False),
        (2, 12, False),
        (2, 16, False),
    ]

    for i, (nTupleSize, maxAbs, signed) in enumerate(params, start=1):
        huffLUT[i] = {
            'LUT': huffCodebooks[i-1],
            'invTable': invTable[i-1],
            'codebook': i,
            'nTupleSize': nTupleSize,
            'maxAbsCodeVal': maxAbs,
            'signedValues': signed
        }

    return huffLUT


def vlc_table(code_array):
    """
    Build a binary decoding trie for a prefix-free codeword set.

    Parameters
    ----------
    code_array : list of str
        Each string is a Huffman codeword (e.g. '0101').

    Returns
    -------
    h : NumPy array of shape (num_nodes, 3)
        Row = trie node; columns: [next_if_0, next_if_1, symbol_index],
        where symbol_index is 1-based (0 means "not a leaf").
    """
    h = np.zeros((1, 3), dtype=int)

    for code_index, code in enumerate(code_array, start=1):
        word = [int(bit) for bit in code]
        h_index = 0  # start at root

        for bit in word:
            next_node = h[h_index, bit]
            if next_node == 0:
                # Allocate a fresh node and link it from the current one.
                h = np.vstack([h, [0, 0, 0]])
                new_index = h.shape[0] - 1
                h[h_index, bit] = new_index
                h_index = new_index
            else:
                h_index = next_node

        h[h_index, 2] = code_index

    return h

# ------------------ ENCODE ------------------

def encode_huff(coeff_sec, huff_LUT_list, force_codebook=None):
    """
    Huffman-encode a sequence of quantized coefficients.

    Selects the Huffman codebook from the maximum absolute coefficient value
    (or uses `force_codebook` verbatim), encodes the coefficients into a
    binary bitstream, and returns the bitstream with the codebook index.
    Python equivalent of the MATLAB `encodeHuff.m`.

    Parameters
    ----------
    coeff_sec : array_like of int
        1-D quantized integer coefficients (one section / scalefactor band).
    huff_LUT_list : list
        Huffman LUT dicts as returned by `load_LUT()` (valid indices 1..11).
    force_codebook : int or None
        If given, skip selection and encode with this codebook.

    Returns
    -------
    huffSec : str
        Bitstream as a string of '0'/'1' characters.
    huffCodebook : int
        Codebook index used (0 denotes an all-zero section).
    """
    if force_codebook is not None:
        # Fix: return the documented (bits, codebook) pair here as well;
        # previously this path returned only the bitstream string.
        return huff_LUT_code_1(huff_LUT_list[force_codebook], coeff_sec), force_codebook

    maxAbsVal = int(np.max(np.abs(coeff_sec)))

    if maxAbsVal == 0:
        # Special all-zero section: empty stream, codebook 0.
        return huff_LUT_code_0(), 0

    if maxAbsVal > 15:
        # Values beyond 15 need codebook 11 with escape sequences.
        return huff_LUT_code_ESC(huff_LUT_list[11], coeff_sec), 11

    if maxAbsVal >= 13:
        return huff_LUT_code_1(huff_LUT_list[11], coeff_sec), 11

    # Each magnitude range offers a pair of candidate codebooks; try both
    # and keep the shorter stream (ties prefer the lower-numbered book).
    if maxAbsVal == 1:
        candidates = (1, 2)
    elif maxAbsVal == 2:
        candidates = (3, 4)
    elif maxAbsVal <= 4:
        candidates = (5, 6)
    elif maxAbsVal <= 7:
        candidates = (7, 8)
    else:  # 8..12
        candidates = (9, 10)

    coded = [(huff_LUT_code_1(huff_LUT_list[cb], coeff_sec), cb) for cb in candidates]
    huffSec, huffCodebook = min(coded, key=lambda item: len(item[0]))  # min() is stable
    return huffSec, huffCodebook


def huff_LUT_code_1(huff_LUT, coeff_sec):
    """
    Encode coefficients with one non-escape codebook.

    Groups the section into n-tuples, maps each tuple to a LUT index and
    appends the codeword bits; for unsigned codebooks one sign bit per
    tuple element follows the codeword.
    """
    LUT = huff_LUT['LUT']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    signedValues = huff_LUT['signedValues']

    # Accept plain Python lists as well as ndarrays.
    coeff_sec = np.asarray(coeff_sec)

    numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))

    if signedValues:
        # Signed codebooks embed the sign via an offset into [0, 2*maxAbs].
        coeff = coeff_sec + maxAbsCodeVal
        base = 2 * maxAbsCodeVal + 1
    else:
        coeff = coeff_sec
        base = maxAbsCodeVal + 1

    coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
    coeffPad[:len(coeff)] = coeff

    huffSec = []

    powers = base ** np.arange(nTupleSize - 1, -1, -1)

    for i in range(numTuples):
        nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]
        huffIndex = int(np.abs(nTuple) @ powers)

        hexVal = LUT[huffIndex, 2]
        huffLen = LUT[huffIndex, 1]

        bits = format(int(hexVal), f'0{int(huffLen)}b')

        if signedValues:
            huffSec.append(bits)
        else:
            # Unsigned codebooks: one explicit sign bit per tuple element.
            signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
            huffSec.append(bits + signBits)

    return ''.join(huffSec)


def huff_LUT_code_0():
    """All-zero section: nothing is transmitted."""
    return ''


def huff_LUT_code_ESC(huff_LUT, coeff_sec):
    """
    Encode with codebook 11 using escape sequences for magnitudes > 15.

    An escaped value is coded as magnitude 16 in the codeword, followed by
    '1'*N + '0' + N4 bits of (|x| - 2^N4), where N4 = floor(log2|x|), N = N4-4.
    """
    LUT = huff_LUT['LUT']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']

    coeff_sec = np.asarray(coeff_sec)

    numTuples = int(np.ceil(len(coeff_sec) / nTupleSize))
    base = maxAbsCodeVal + 1

    coeffPad = np.zeros(numTuples * nTupleSize, dtype=int)
    coeffPad[:len(coeff_sec)] = coeff_sec

    huffSec = []
    powers = base ** np.arange(nTupleSize - 1, -1, -1)

    for i in range(numTuples):
        nTuple = coeffPad[i*nTupleSize:(i+1)*nTupleSize]

        # eps stand-in for zeros keeps log2 finite; N clamps to >= 0 anyway.
        lnTuple = nTuple.astype(float)
        lnTuple[lnTuple == 0] = np.finfo(float).eps

        N4 = np.maximum(0, np.floor(np.log2(np.abs(lnTuple))).astype(int))
        N = np.maximum(0, N4 - 4)
        esc = np.abs(nTuple) > 15

        # Escaped magnitudes are replaced by +-16 in the codeword itself.
        nTupleESC = nTuple.copy()
        nTupleESC[esc] = np.sign(nTupleESC[esc]) * 16

        huffIndex = int(np.abs(nTupleESC) @ powers)

        hexVal = LUT[huffIndex, 2]
        huffLen = LUT[huffIndex, 1]

        bits = format(int(hexVal), f'0{int(huffLen)}b')

        escSeq = ''
        for k in range(nTupleSize):
            if esc[k]:
                escSeq += '1' * N[k]
                escSeq += '0'
                escSeq += format(abs(nTuple[k]) - (1 << N4[k]), f'0{N4[k]}b')

        signBits = ''.join('1' if v < 0 else '0' for v in nTuple)
        huffSec.append(bits + signBits + escSeq)

    return ''.join(huffSec)

# ------------------ DECODE ------------------

def decode_huff(huff_sec, huff_LUT):
    """
    Decode a Huffman-encoded stream.

    Parameters
    ----------
    huff_sec : array-like of int or str
        Huffman encoded stream as a sequence of 0 and 1 (string or list/array).
    huff_LUT : dict
        Huffman lookup table with keys:
            - 'invTable': inverse table (numpy array)
            - 'codebook': codebook number
            - 'nTupleSize': tuple size
            - 'maxAbsCodeVal': maximum absolute code value
            - 'signedValues': True/False

    Returns
    -------
    decCoeffs : list of int
        Decoded quantized coefficients.
    """
    h = huff_LUT['invTable']
    huffCodebook = huff_LUT['codebook']
    nTupleSize = huff_LUT['nTupleSize']
    maxAbsCodeVal = huff_LUT['maxAbsCodeVal']
    signedValues = huff_LUT['signedValues']

    # Convert string to array of ints
    if isinstance(huff_sec, str):
        huff_sec = np.array([int(b) for b in huff_sec])

    eos = False
    decCoeffs = []
    streamIndex = 0

    while not eos:
        wordbit = 0
        r = 0  # start at trie root

        # Decode one Huffman codeword by walking the inverse table.
        while True:
            b = huff_sec[streamIndex + wordbit]
            wordbit += 1
            r = h[r, b]
            if h[r, 0] == 0 and h[r, 1] == 0:
                symbolIndex = h[r, 2] - 1  # zero-based
                streamIndex += wordbit
                break

        # Decode n-tuple magnitudes (mixed-radix digits of symbolIndex).
        base = (2 * maxAbsCodeVal + 1) if signedValues else (maxAbsCodeVal + 1)
        vals = []
        tmp = int(symbolIndex)
        for p in reversed(range(nTupleSize)):
            digit, tmp = divmod(tmp, base ** p)
            vals.append(digit - maxAbsCodeVal if signedValues else digit)
        nTupleDec = np.array(vals)

        if signedValues:
            # Fix: signed codebooks embed the sign in the codeword itself and
            # the encoder (huff_LUT_code_1) writes NO separate sign bits, so
            # none must be consumed here (previously nTupleSize bits were
            # always read, desynchronizing the stream for codebooks 1/2/5/6).
            nTupleSign = np.ones(nTupleSize, dtype=int)
        else:
            nTupleSignBits = huff_sec[streamIndex:streamIndex + nTupleSize]
            # Fix: cast to int so decoded coefficients stay integers
            # (np.sign of a float offset previously produced float output).
            nTupleSign = (-(np.sign(nTupleSignBits - 0.5))).astype(int)
            streamIndex += nTupleSize
            nTupleDec = nTupleDec * nTupleSign

        # Handle escape sequences (codebook 11 only, magnitude placeholder 16).
        escIndex = np.where(np.abs(nTupleDec) == 16)[0]
        if huffCodebook == 11 and escIndex.size > 0:
            for idx in escIndex:
                N = 0
                b = huff_sec[streamIndex]
                while b:
                    N += 1
                    b = huff_sec[streamIndex + N]
                # Skip the N leading '1' bits AND the terminating '0' delimiter
                # (the encoder writes '1'*N + '0').
                streamIndex += N + 1
                N4 = N + 4
                escape_word = huff_sec[streamIndex:streamIndex + N4]
                escape_value = 2 ** N4 + int("".join(map(str, escape_word)), 2)
                nTupleDec[idx] = escape_value
                # Delimiter already consumed above; now consume only N4 bits.
                streamIndex += N4
            # Re-apply signs to the escaped entries (magnitudes were replaced).
            nTupleDec[escIndex] *= nTupleSign[escIndex]

        decCoeffs.extend(nTupleDec.tolist())

        if streamIndex >= len(huff_sec):
            eos = True

    return decCoeffs
@@ -503,7 +484,7 @@ def aac_coder_3( "chl": { "tns_coeffs": np.asarray(chl_tns_coeffs, dtype=np.float64), "T": np.asarray(T_L, dtype=np.float64), - "G": G_Ln, + "G": G_L, "sfc": sfc_L_stream, "stream": mdct_L_stream, "codebook": int(cb_L), @@ -511,7 +492,7 @@ def aac_coder_3( "chr": { "tns_coeffs": np.asarray(chr_tns_coeffs, dtype=np.float64), "T": np.asarray(T_R, dtype=np.float64), - "G": G_Rn, + "G": G_R, "sfc": sfc_R_stream, "stream": mdct_R_stream, "codebook": int(cb_R), diff --git a/source/level_3/core/aac_decoder.py b/source/level_3/core/aac_decoder.py index be705df..378294e 100644 --- a/source/level_3/core/aac_decoder.py +++ b/source/level_3/core/aac_decoder.py @@ -42,7 +42,7 @@ def _nbands(frame_type: FrameType) -> int: # Public helpers # ----------------------------------------------------------------------------- -def aac_unpack_seq_channels_to_frame_f(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: +def aac_unpack_seq_channels(frame_type: FrameType, chl_f: FrameChannelF, chr_f: FrameChannelF) -> FrameF: """ Re-pack per-channel spectra from the Level-1 AACSeq1 schema into the stereo FrameF container expected by aac_i_filter_bank(). 
@@ -167,7 +167,7 @@ def aac_decoder_1( chl_f = np.asarray(fr["chl"]["frame_F"], dtype=np.float64) chr_f = np.asarray(fr["chr"]["frame_F"], dtype=np.float64) - frame_f: FrameF = aac_unpack_seq_channels_to_frame_f(frame_type, chl_f, chr_f) + frame_f: FrameF = aac_unpack_seq_channels(frame_type, chl_f, chr_f) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) # (2048, 2) start = i * hop @@ -427,7 +427,7 @@ def aac_decoder_3( X_R = aac_i_tns(Xq_R, frame_type, tns_R) # Re-pack to stereo container and inverse filterbank - frame_f = aac_unpack_seq_channels_to_frame_f(frame_type, np.asarray(X_L), np.asarray(X_R)) + frame_f = aac_unpack_seq_channels(frame_type, np.asarray(X_L), np.asarray(X_R)) frame_t_hat: FrameT = aac_i_filter_bank(frame_f, frame_type, win_type) start = i * hop diff --git a/source/level_3/core/aac_psycho.py b/source/level_3/core/aac_psycho.py index 4ecaafc..8069e45 100644 --- a/source/level_3/core/aac_psycho.py +++ b/source/level_3/core/aac_psycho.py @@ -189,7 +189,7 @@ def _predictability( # Band-domain aggregation # ----------------------------------------------------------------------------- -def _band_energy_and_weighted_predictability( +def _band_energy_and_pred( r: FloatArray, c: FloatArray, wlow: BandIndexArray, @@ -238,7 +238,7 @@ def _band_energy_and_weighted_predictability( return e_b, c_num_b -def _psycho_one_window( +def _psycho_window( time_x: FrameChannelT, prev1_x: FrameChannelT, prev2_x: FrameChannelT, @@ -288,7 +288,7 @@ def _psycho_one_window( c_w = _predictability(r, phi, r_m1, phi_m1, r_m2, phi_m2) # Aggregate into psycho bands - e_b, c_num_b = _band_energy_and_weighted_predictability(r, c_w, wlow, whigh) + e_b, c_num_b = _band_energy_and_pred(r, c_w, wlow, whigh) # Spread energies and predictability across bands: # ecb(b) = sum_bb e(bb) * S(bb, b) @@ -333,7 +333,7 @@ def _psycho_one_window( # ESH window slicing (match filterbank conventions) # 
----------------------------------------------------------------------------- -def _esh_subframes_256(x_2048: FrameChannelT) -> list[FrameChannelT]: +def _esh_subframes(x_2048: FrameChannelT) -> list[FrameChannelT]: """ Extract the 8 overlapping 256-sample short windows used by AAC ESH. @@ -408,7 +408,7 @@ def aac_psycho( # Long frame types: compute one SMR vector (69 bands) if frame_type != "ESH": - return _psycho_one_window(frame_T, frame_T_prev_1, frame_T_prev_2, N=N, table=table) + return _psycho_window(frame_T, frame_T_prev_1, frame_T_prev_2, N=N, table=table) # ESH: compute 8 SMR vectors (42 bands each), one per short subframe. # @@ -419,8 +419,8 @@ def aac_psycho( # # This matches the "within-frame history" convention commonly used in # simplified psycho models for ESH. - cur_subs = _esh_subframes_256(frame_T) - prev1_subs = _esh_subframes_256(frame_T_prev_1) + cur_subs = _esh_subframes(frame_T) + prev1_subs = _esh_subframes(frame_T_prev_1) B = int(table.shape[0]) # expected 42 smr_out = np.zeros((B, 8), dtype=np.float64) @@ -436,6 +436,6 @@ def aac_psycho( x_m1 = cur_subs[j - 1] x_m2 = cur_subs[j - 2] - smr_out[:, j] = _psycho_one_window(cur_subs[j], x_m1, x_m2, N=256, table=table) + smr_out[:, j] = _psycho_window(cur_subs[j], x_m1, x_m2, N=256, table=table) return smr_out diff --git a/source/level_3/core/aac_quantizer.py b/source/level_3/core/aac_quantizer.py index 5bc8526..f2004b0 100644 --- a/source/level_3/core/aac_quantizer.py +++ b/source/level_3/core/aac_quantizer.py @@ -35,7 +35,7 @@ MAX_SF_DELTA:int = 60 # ----------------------------------------------------------------------------- # Helpers: ESH packing/unpacking (128x8 <-> 1024x1) # ----------------------------------------------------------------------------- -def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray: +def _esh_pack(x_128x8: FloatArray) -> FloatArray: """ Pack ESH coefficients (128 x 8) into a single long vector (1024 x 1). 
@@ -58,7 +58,7 @@ def _esh_pack_to_1024(x_128x8: FloatArray) -> FloatArray: return x_128x8.reshape(1024, 1, order="F") -def _esh_unpack_from_1024(x_1024x1: FloatArray) -> FloatArray: +def _esh_unpack(x_1024x1: FloatArray) -> FloatArray: """ Unpack a packed ESH vector (1024 elements) back to shape (128, 8). @@ -226,7 +226,7 @@ def _band_energy(x: FloatArray, lo: int, hi: int) -> float: return float(np.sum(sec * sec)) -def _threshold_T_from_SMR( +def _psychoacoustic_threshold( X: FloatArray, SMR_col: FloatArray, bands: list[tuple[int, int]], @@ -425,7 +425,7 @@ def aac_quantizer( SMRj = SMR[:, j].reshape(NB) # Compute psychoacoustic threshold T(b) for this subframe - T = _threshold_T_from_SMR(Xj, SMRj, bands) + T = _psychoacoustic_threshold(Xj, SMRj, bands) # Frame-wise initial estimate alpha_hat (Equation 14) alpha_hat = _initial_alpha_hat(Xj) @@ -482,7 +482,7 @@ def aac_quantizer( raise ValueError(f"For non-ESH, SMR must have shape ({NB},) or ({NB}, 1).") # Compute psychoacoustic threshold T(b) for the long frame - T = _threshold_T_from_SMR(Xv, SMRv, bands) + T = _psychoacoustic_threshold(Xv, SMRv, bands) # Frame-wise initial estimate alpha_hat (Equation 14) alpha_hat = _initial_alpha_hat(Xv) @@ -566,7 +566,7 @@ def aac_i_quantizer( if sfc.shape != (NB, 8): raise ValueError(f"For ESH, sfc must have shape ({NB}, 8).") - S_128x8 = _esh_unpack_from_1024(S_flat) + S_128x8 = _esh_unpack(S_flat) Xrec = np.zeros((128, 8), dtype=np.float64) diff --git a/source/level_3/core/aac_tns.py b/source/level_3/core/aac_tns.py index 06227b1..4ba78ac 100644 --- a/source/level_3/core/aac_tns.py +++ b/source/level_3/core/aac_tns.py @@ -39,7 +39,7 @@ from core.aac_types import * # ----------------------------------------------------------------------------- -def _band_ranges_for_kcount(k_count: int) -> BandRanges: +def _band_ranges(k_count: int) -> BandRanges: """ Return Bark band index ranges [start, end] (inclusive) for the given MDCT line count. 
@@ -66,7 +66,7 @@ def _band_ranges_for_kcount(k_count: int) -> BandRanges: start = tbl[:, 1].astype(int) end = tbl[:, 2].astype(int) - ranges: list[tuple[int, int]] = [(int(s), int(e)) for s, e in zip(start, end)] + ranges: BandRanges = [(int(s), int(e)) for s, e in zip(start, end)] for s, e in ranges: if s < 0 or e < s or e >= k_count: @@ -117,7 +117,7 @@ def _compute_sw(x: MdctCoeffs) -> MdctCoeffs: x = np.asarray(x, dtype=np.float64).reshape(-1) k_count = int(x.shape[0]) - bands = _band_ranges_for_kcount(k_count) + bands = _band_ranges(k_count) sw = np.zeros(k_count, dtype=np.float64) for s, e in bands: @@ -347,7 +347,7 @@ def _apply_itns_iir(y: MdctCoeffs, a_q: MdctCoeffs) -> MdctCoeffs: return x_hat -def _tns_one_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: +def _tns_vector(x: MdctCoeffs) -> tuple[MdctCoeffs, MdctCoeffs]: """ TNS for a single MDCT vector (one long frame or one short subframe). @@ -430,7 +430,7 @@ def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChan a_out = np.empty((PRED_ORDER, 8), dtype=np.float64) for j in range(8): - y[:, j], a_out[:, j] = _tns_one_vector(x[:, j]) + y[:, j], a_out[:, j] = _tns_vector(x[:, j]) return y, a_out @@ -443,7 +443,7 @@ def aac_tns(frame_F_in: FrameChannelF, frame_type: FrameType) -> Tuple[FrameChan else: raise ValueError('For non-ESH, frame_F_in must have shape (1024,) or (1024, 1).') - y_vec, a_q = _tns_one_vector(x_vec) + y_vec, a_q = _tns_vector(x_vec) if out_shape == (1024,): y_out = y_vec diff --git a/source/level_3/core/aac_utils.py b/source/level_3/core/aac_utils.py index 0e18ed1..26cdbdf 100644 --- a/source/level_3/core/aac_utils.py +++ b/source/level_3/core/aac_utils.py @@ -163,6 +163,42 @@ def snr_db(x_ref: StereoSignal, x_hat: StereoSignal) -> float: return float(10.0 * np.log10(ps / pn)) +def estimate_lag_mono(x_ref: TimeSignal, x_hat: TimeSignal, max_lag=4096): + """ + Estimate time lag between two mono signals. + Returns lag (positive means x_hat delayed). 
+ """ + n = min(len(x_ref), len(x_hat)) + x_ref = x_ref[:n] + x_hat = x_hat[:n] + + corr = np.correlate(x_ref, x_hat, mode='full') + lags = np.arange(-n + 1, n) + + center = n - 1 + lo = max(0, center - max_lag) + hi = min(len(corr), center + max_lag + 1) + + best = lo + int(np.argmax(corr[lo:hi])) + return int(lags[best]) + + +def match_gain(x_ref: StereoSignal, x_hat: StereoSignal) -> float: + """ + Least-squares gain g that best maps x_hat -> x_ref. + """ + n = min(x_ref.shape[0], x_hat.shape[0]) + c = min(x_ref.shape[1], x_hat.shape[1]) + + r = x_ref[:n, :c].reshape(-1).astype(np.float64) + h = x_hat[:n, :c].reshape(-1).astype(np.float64) + + denom = float(np.dot(h, h)) + if denom <= 0.0: + return 1.0 + return float(np.dot(r, h) / denom) + + # ----------------------------------------------------------------------------- # Psychoacoustic band tables (TableB219.mat) # ----------------------------------------------------------------------------- diff --git a/source/level_3/level_3.py b/source/level_3/level_3.py index 2f2105b..b7470da 100644 --- a/source/level_3/level_3.py +++ b/source/level_3/level_3.py @@ -27,13 +27,17 @@ from typing import Optional, Tuple, Union import os import soundfile as sf +import numpy as np +import matplotlib.pyplot as plt from core.aac_types import AACSeq3, StereoSignal from core.aac_coder import aac_coder_3 as core_aac_coder_3 from core.aac_coder import aac_read_wav_stereo_48k from core.aac_decoder import aac_decoder_3 as core_aac_decoder_3 -from core.aac_utils import snr_db +from core.aac_utils import snr_db, estimate_lag_mono, match_gain +# Global variable to "pass" AACSeq3 without changing the demo_aac_e interface. 
+AAC_Seq_3: AACSeq3 # ----------------------------------------------------------------------------- # Helpers (Level 3 metrics) @@ -90,6 +94,83 @@ def _bitrate_after_from_aacseq(aac_seq_3: AACSeq3, duration_sec: float) -> float return float(total_bits) / float(duration_sec) +def _plot_frame_bitrate_and_compression( + aac_seq_3: AACSeq3, + wav_path: Union[str, Path], + fname_bitrate: Union[str, Path], + fname_comp: Union[str, Path], +) -> None: + """ + Compute and plot per-frame bitrate and compression ratio + for a Level 3 AAC sequence. + + Parameters + ---------- + aac_seq_3 : list + Output of aac_coder_3 (list of frame dictionaries). + wav_path : str or Path + Path to original WAV file (PCM 48 kHz stereo). + fname_bitrate : str or Path + Path to original bitrate per frame plot output file. + fname_comp : str or Path + Path to original compression per frame plot output file. + """ + + # Read WAV metadata + info = sf.info(str(wav_path)) + samplerate = info.samplerate + total_samples = info.frames + total_duration = total_samples / samplerate + + n_frames = len(aac_seq_3) + + # AAC long-frame hop size is 1024 new samples per frame + samples_per_frame = 1024 + duration_per_frame = samples_per_frame / samplerate + + # Original bitrate (file-based estimate) + original_bits = os.path.getsize(wav_path) * 8.0 + original_bitrate = original_bits / total_duration + + frame_bitrates = [] + frame_compression = [] + + for fr in aac_seq_3: + bits = 0 + bits += len(fr["chl"]["sfc"]) + bits += len(fr["chl"]["stream"]) + bits += len(fr["chr"]["sfc"]) + bits += len(fr["chr"]["stream"]) + + bitrate = bits / duration_per_frame + compression = original_bitrate / bitrate if bitrate > 0 else np.inf + + frame_bitrates.append(bitrate) + frame_compression.append(compression) + + frame_indices = np.arange(n_frames) + + # Plot bitrate per frame and save to file + plt.figure(figsize=(6, 3), dpi=300) + plt.plot(frame_indices, frame_bitrates) + plt.xlabel("Frame index") + plt.ylabel("Bitrate 
(bits/s)") + plt.title("Bitrate (per-frame)") + plt.tight_layout() + plt.savefig(str(fname_bitrate)) + plt.close() + + # Plot compression ratio per frame and save to file + plt.figure(figsize=(6, 3), dpi=300) + plt.plot(frame_indices, frame_compression) + plt.xlabel("Frame index") + plt.ylabel("Compression Ratio") + plt.title("Compression Ratio (per-frame)") + plt.tight_layout() + plt.savefig(str(fname_comp)) + plt.close() + + # ----------------------------------------------------------------------------- # Public Level 3 API (wrappers) # ----------------------------------------------------------------------------- @@ -187,20 +268,21 @@ def demo_aac_3( raise ValueError("Input sampling rate must be 48 kHz.") # Encode / decode - aac_seq_3 = aac_coder_3(filename_in, filename_aac_coded) - x_hat = i_aac_coder_3(aac_seq_3, filename_out) + global AAC_Seq_3 # pick coder output + AAC_Seq_3 = aac_coder_3(filename_in, filename_aac_coded) + x_hat = i_aac_coder_3(AAC_Seq_3, filename_out) # Optional sanity: ensure output file exists and is readable _, fs_hat = sf.read(str(filename_out), always_2d=True) if int(fs_hat) != 48000: raise ValueError("Decoded output sampling rate must be 48 kHz.") - # Metrics + # Quality metrics s = snr_db(x_ref, x_hat) duration = _wav_duration_seconds(filename_in) bitrate_before = _bitrate_before_from_file(filename_in) - bitrate_after = _bitrate_after_from_aacseq(aac_seq_3, duration) + bitrate_after = _bitrate_after_from_aacseq(AAC_Seq_3, duration) compression = float("inf") if bitrate_after <= 0.0 else (bitrate_before / bitrate_after) return float(s), float(bitrate_after), float(compression) @@ -218,20 +300,28 @@ if __name__ == "__main__": # python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav # or # python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav aac_seq_3.mat + # or + # python -m level_3 material/LicorDeCalandraca.wav LicorDeCalandraca_out_l3.wav aac_seq_3.mat bitrate.png compression.png import 
 sys - if len(sys.argv) not in (3, 4): - raise SystemExit("Usage: python -m level_3 <input_wav> <output_wav> [aac_seq_3.mat]") - + if len(sys.argv) not in (3, 4, 5, 6): + raise SystemExit( + "Usage: python -m level_3 <input_wav> <output_wav> [aac_seq_3.mat] [bitrate_fname] [compression_fname]" + ) in_wav = Path(sys.argv[1]) out_wav = Path(sys.argv[2]) aac_mat = Path(sys.argv[3]) if len(sys.argv) >= 4 else None + fname_bitrate = Path(sys.argv[4]) if len(sys.argv) >= 5 else "bitrate_per_frame.png" + fname_comp = Path(sys.argv[5]) if len(sys.argv) == 6 else "compression_per_frame.png" print(f"Encoding/Decoding {in_wav} to {out_wav}") if aac_mat is not None: print(f"Storing coded sequence to {aac_mat}") snr, bitrate, compression = demo_aac_3(in_wav, out_wav, aac_mat) + # plot compression / bitrate + _plot_frame_bitrate_and_compression(AAC_Seq_3, in_wav, fname_bitrate, fname_comp) + print(f"SNR = {snr:.3f} dB") print(f"Bitrate (coded) = {bitrate:.2f} bits/s") print(f"Compression ratio = {compression:.4f}") diff --git a/source/requirements.txt b/source/requirements.txt index 475a4e0..1b2fb83 100644 --- a/source/requirements.txt +++ b/source/requirements.txt @@ -2,4 +2,5 @@ numpy scipy scipy-stubs soundfile -pytest \ No newline at end of file +pytest +matplotlib \ No newline at end of file