# PR-Assignment2025_26/src/partB.py

# ------------------------------------------------------------
# Part B - Parzen Window Density Estimation (1D)
# Pattern Recognition – Semester Assignment
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   This module implements Part B of the assignment:
#   - 1D Parzen window density estimation using uniform and
#     Gaussian kernels
#   - Computation of predicted likelihood for varying bandwidth h
#   - Comparison with the true N(1, 4) distribution
#   - MSE analysis and optimal bandwidth selection
#
# ------------------------------------------------------------

import sys

import numpy as np
import matplotlib.pyplot as plt

from typing import Sequence
from toolbox import load_csv, dataset2


# --------------------------------------------------
# Optional: D-dimensional Bishop-style kernels (not used in Part B)
# --------------------------------------------------
def kernel_hypercube(u: np.ndarray) -> float:
    """
    Uniform (hypercube) kernel in D dimensions.

    Bishop eq. (2.247):
        k(u) = 1 if |u_i| <= 1/2 for all i, else 0

    For a single coordinate this is simply:
        k(u) = 1 if |u| <= 1/2 else 0

    The kernel is normalised:
        ∫_{-1/2}^{1/2} 1 du = 1

    Parameters
    ----------
    u : ndarray
        Point at which the kernel is evaluated.

    Returns
    -------
    value : float
        1.0 when every component of u lies in [-1/2, 1/2], otherwise 0.0.
    """
    # The point is inside the unit cube only if *all* coordinates are.
    inside_cube = np.all(np.abs(u) <= 0.5)
    return 1.0 if inside_cube else 0.0


def kernel_gaussian(u: np.ndarray) -> float:
    """
    D-dimensional Gaussian kernel.

    k(u) = (2π)^(-D/2) * exp(-||u||^2 / 2)

    Integral over R^D is 1.

    Parameters
    ----------
    u : ndarray
        Point at which the kernel is evaluated. A 1D array of length D;
        a scalar (or 0-d array) is treated as a point in one dimension,
        and a plain sequence is converted to an array.

    Returns
    -------
    value : float
        Kernel value at u.
    """
    # Promote scalars / 0-d arrays / sequences to a 1D vector so that the
    # dimensionality D can always be read from shape[0].
    vec = np.atleast_1d(u)
    d = vec.shape[0]
    norm_const = 1.0 / ((2.0 * np.pi) ** (d / 2.0))
    return float(norm_const * np.exp(-0.5 * np.dot(vec, vec)))


# --------------------------------------------------
# 1D Parzen kernels (used in this Part)
# --------------------------------------------------
def parzen_kernel_uniform(u: np.ndarray) -> np.ndarray:
    """
    Box (uniform) Parzen kernel in 1D, evaluated elementwise.

    Definition:
        k(u) = 1 if |u| <= 1/2
             = 0 otherwise

    Normalisation:
        ∫_{-1/2}^{1/2} 1 du = 1

    Parameters
    ----------
    u : ndarray
        Values at which the kernel is evaluated.

    Returns
    -------
    values : ndarray
        Float array with 1.0 where |u| <= 1/2 and 0.0 elsewhere.
    """
    half_width = 0.5
    # Boolean mask of the points falling inside the window (edges included).
    inside_window = np.abs(u) <= half_width
    return inside_window.astype(float)


def parzen_kernel_gaussian(u: np.ndarray) -> np.ndarray:
    """
    Standard normal kernel in 1D (mean 0, variance 1), evaluated elementwise.

    k(u) = (1 / sqrt(2π)) * exp(-u^2 / 2)

    The kernel integrates to 1 over R.

    Parameters
    ----------
    u : ndarray
        Values at which the kernel is evaluated.

    Returns
    -------
    values : ndarray
        Kernel values at u.
    """
    inv_sqrt_two_pi = 1.0 / np.sqrt(2.0 * np.pi)
    exponent = -0.5 * (u * u)
    return inv_sqrt_two_pi * np.exp(exponent)


# --------------------------------------------------
# Parzen estimator (1D, point-wise)
# --------------------------------------------------
def parzen_estimate_1d(x_eval: float, data: np.ndarray, h: float, kernel_fn) -> float:
    """
    Parzen window density estimate in 1D, for a single point x_eval.

    Implements:
        p_hat(x_eval) = (1 / (N h)) * sum_n k((x_eval - x_n) / h)

    Parameters
    ----------
    x_eval : float
        Point where the density is estimated.
    data : ndarray, shape (N,)
        1D data samples (any array-like is converted to an ndarray).
    h : float
        Bandwidth (window width). Must be strictly positive.
    kernel_fn : callable
        Kernel function K(u), applied elementwise on u.

    Returns
    -------
    f_hat : float
        Estimated pdf value at x_eval.

    Raises
    ------
    ValueError
        If `data` is empty or `h` is not strictly positive. Both cases
        would otherwise divide by zero (or give a negative "density")
        and silently return inf/nan.
    """
    samples = np.asarray(data)
    n_samples = samples.shape[0]
    if n_samples == 0:
        raise ValueError("data must contain at least one sample")
    # `not h > 0` (rather than `h <= 0`) also rejects NaN bandwidths.
    if not h > 0:
        raise ValueError("bandwidth h must be strictly positive")

    u = (x_eval - samples) / h
    return float(np.sum(kernel_fn(u)) / (n_samples * h))


def evaluate_parzen(data: np.ndarray, h: float, kernel_fn) -> np.ndarray:
    """
    Computes the Parzen estimate at every sample of 'data', using 'data'
    itself as the set of kernel centres.

    For each x_i in data:
        p_hat(x_i) = (1 / (N h)) * sum_n k((x_i - x_n) / h)

    Parameters
    ----------
    data : ndarray, shape (N,)
        1D data samples.
    h : float
        Bandwidth.
    kernel_fn : callable
        Kernel function K(u).

    Returns
    -------
    estimates : ndarray, shape (N,)
        Estimated pdf value at each data point, in the order of 'data'.
    """
    n_samples = data.shape[0]
    # One point-wise estimate per sample; each call scans the whole dataset.
    per_sample = [
        parzen_estimate_1d(data[idx], data, h, kernel_fn)
        for idx in range(n_samples)
    ]
    return np.array(per_sample, dtype=float)


# --------------------------------------------------
# True pdf and error
# --------------------------------------------------
def true_normal_pdf_1d(x: np.ndarray, mu: float, var: float) -> np.ndarray:
    """
    True normal pdf N(mu, var) at points x (array).

        p(x) = 1 / (sqrt(2π) σ) * exp(-(x - mu)^2 / (2 σ^2)),  σ = sqrt(var)

    Parameters
    ----------
    x : ndarray
        Points where the pdf is evaluated.
    mu : float
        Mean
    var : float
        Variance. Must be strictly positive.

    Returns
    -------
    pdf : ndarray
        The normal pdf N(mu, var) evaluated at x.

    Raises
    ------
    ValueError
        If `var` is not strictly positive. A zero or negative variance
        would otherwise produce nan/inf through sqrt and division by zero.
    """
    # `not (var > 0)` also rejects NaN, which `var <= 0` would let through.
    if not np.all(np.asarray(var) > 0):
        raise ValueError("var must be strictly positive")

    sigma = np.sqrt(var)
    coef = 1.0 / (np.sqrt(2.0 * np.pi) * sigma)
    z = (x - mu) / sigma  # standardised distance from the mean
    return coef * np.exp(-0.5 * z * z)


def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Mean squared error between two arrays.

    Parameters
    ----------
    y_true : ndarray
        Reference values.
    y_pred : ndarray
        Values compared against the reference.

    Returns
    -------
    err : float
        Mean of the squared elementwise differences.
    """
    residual = y_true - y_pred
    squared = residual * residual
    return float(np.mean(squared))


def scan_bandwidths_parzen(
    data: np.ndarray, h_values: Sequence[float], kernel_fn, mu_true: float = 1.0, var_true: float = 4.0
) -> np.ndarray:
    """
    Sweeps over the bandwidths in h_values and, for each h, measures how
    far the Parzen estimate is from the true density:

    - predicted likelihood: Parzen estimate evaluated at the samples
    - true likelihood: N(mu_true, var_true) evaluated at the samples
    - error: MSE between the two

    Parameters
    ----------
    data : ndarray, shape (N,)
        1D data samples.
    h_values : sequence of float
        Bandwidth values to test.
    kernel_fn : callable
        Kernel function K(u).
    mu_true : float
        True mean, default to 1.0.
    var_true : float
        True variance, default to 4.0.

    Returns
    -------
    errors : ndarray, shape (len(h_values),)
        MSE between estimated and true pdf, one entry per bandwidth and in
        the same order as h_values.
    """
    # The reference pdf does not depend on h, so it is computed only once.
    reference_pdf = true_normal_pdf_1d(data, mu=mu_true, var=var_true)

    per_bandwidth_errors = [
        mse(reference_pdf, evaluate_parzen(data, bandwidth, kernel_fn))
        for bandwidth in h_values
    ]
    return np.array(per_bandwidth_errors, dtype=float)


# --------------------------------------------------
# Plotting helpers
# --------------------------------------------------
def plot_h_vs_error(h_values: np.ndarray, errors: np.ndarray, title: str) -> None:
    """
    Draws the error curve as a function of the bandwidth and shows it.

    Parameters
    ----------
    h_values : ndarray
        Bandwidth values (x axis).
    errors : ndarray
        Error values, one per bandwidth (y axis).
    title : str
        Figure title.
    """
    _, axes = plt.subplots(figsize=(8, 5))

    axes.plot(h_values, errors, marker='o')
    axes.set_xlabel("h")
    axes.set_ylabel("MSE")
    axes.set_title(title)
    axes.grid(True)

    plt.show()


def plot_histogram_with_pdf(
    data: np.ndarray, mu_true: float = 1.0, var_true: float = 4.0, bins: int = 30
) -> None:
    """
    Shows a density-normalised histogram of the data with the true
    N(mu_true, var_true) pdf drawn on top of it.

    Parameters
    ----------
    data : ndarray
        1D data samples.
    mu_true : float
        True mean, default to 1.0.
    var_true : float
        True variance, default to 4.0.
    bins : int
        Number of histogram bins, default to 30.
    """
    _, axes = plt.subplots(figsize=(8, 5))

    # density=True rescales the histogram so it is comparable with a pdf.
    axes.hist(data, bins=bins, density=True, alpha=0.5, label="Data histogram")

    # Evaluate the true pdf on a regular grid spanning the observed range.
    grid = np.linspace(np.min(data), np.max(data), 200)
    true_curve = true_normal_pdf_1d(grid, mu=mu_true, var=var_true)
    axes.plot(grid, true_curve, label=f"True N({mu_true}, {var_true}) pdf")

    axes.set_xlabel("x")
    axes.set_ylabel("Density")
    axes.set_title(f"Dataset2 histogram vs true N({mu_true}, {var_true}) pdf")
    axes.legend()
    axes.grid(True)

    plt.show()


# --------------------------------------------------
# Part B: main runner
# --------------------------------------------------
if __name__ == "__main__":
    # Load dataset2 through the toolbox helpers and keep the first column
    # as the 1D sample array.
    df2 = load_csv(dataset2, header=None)
    data2 = df2.iloc[:, 0].values

    # Optional command-line overrides: argv[1] = true mean, argv[2] = true variance.
    mu = float(sys.argv[1]) if len(sys.argv) > 1 else 1.0
    var = float(sys.argv[2]) if len(sys.argv) > 2 else 4.0
    # A non-positive (or NaN) variance does not define a normal pdf and
    # would only propagate nan/inf through every plot and error value.
    if not var > 0.0:
        sys.exit("Variance must be strictly positive.")

    # Optional: histogram + true pdf
    plot_histogram_with_pdf(data2, mu_true=mu, var_true=var, bins=30)

    # Range of h: [0.1, 10] with step 0.1.
    # Built from an integer grid instead of np.arange(0.1, 10.1, 0.1):
    # with a fractional step the length and endpoint of arange depend on
    # floating-point rounding, and the values accumulate error
    # (e.g. 0.30000000000000004). Dividing integers gives exactly 100
    # correctly rounded bandwidths.
    h_values = np.arange(1, 101) / 10.0

    # Uniform kernel (parzen)
    errors_uniform = scan_bandwidths_parzen(data2, h_values, parzen_kernel_uniform, mu_true=mu, var_true=var)
    idx_uniform = int(np.argmin(errors_uniform))
    best_h_uniform = h_values[idx_uniform]

    # Gaussian kernel
    errors_gaussian = scan_bandwidths_parzen(data2, h_values, parzen_kernel_gaussian, mu_true=mu, var_true=var)
    idx_gaussian = int(np.argmin(errors_gaussian))
    best_h_gaussian = h_values[idx_gaussian]

    print("Best h (uniform):", best_h_uniform, " with error: ", errors_uniform[idx_uniform])
    print("Best h (gaussian):", best_h_gaussian, " with error: ", errors_gaussian[idx_gaussian])

    plot_h_vs_error(h_values, errors_uniform, "Uniform kernel: h vs MSE")
    plot_h_vs_error(h_values, errors_gaussian, "Gaussian kernel: h vs MSE")