Part C local code added

commit 1508d413d8
parent cc6b742553
src/partA.py | 71

@@ -1,45 +1,29 @@
+# ------------------------------------------------------------
+# Part A - Gaussian Parameter Estimation (MLE) & Visualization
+# Pattern Recognition – Semester Assignment
+#
+# Author:
+#   Christos Choutouridis (ΑΕΜ 8997)
+#   cchoutou@ece.auth.gr
+#
+# Description:
+#   This module implements Part A of the assignment:
+#     - Loading and splitting the dataset into classes
+#     - MLE estimation of mean vectors and covariance matrices
+#     - Construction of Gaussian pdf surfaces
+#     - 3D visualization of class-conditional densities
+#
+# Notes:
+#   The implementation follows the theoretical formulation of
+#   multivariate Gaussian distributions and MLE parameter
+#   estimation as taught in class.
+# ------------------------------------------------------------
+
 import matplotlib.pyplot as plt
 import numpy as np
-from toolbox import *
+from toolbox import load_csv, split_dataset_by_class, dataset1

 from typing import Tuple, Dict
-from pandas import DataFrame

-# --------------------------------------------------
-# Part A: dataset splitting
-# --------------------------------------------------
-def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
-    """
-    Splits a dataset into features, labels and per-class subsets with the assumptions that:
-    - All columns except the last are feature columns.
-    - The last column is the class label.
-
-    Parameters
-    ----------
-    df: DataFrame
-        Data samples as DataFrame.
-
-    Returns
-    -------
-    X : ndarray, shape (N, d), y : ndarray, shape (N,), classes : dict:
-        Feature matrix,
-        Labels,
-        Dictionary mapping each class label to the subset of X that belongs to that class.
-
-    Example
-    -------
-    X, y, classes = split_dataset_by_class(df)
-    """
-    n_cols = df.shape[1]                    # Number of columns
-    X = df.iloc[:, :n_cols - 1].values      # Features = all columns except last
-    y = df.iloc[:, n_cols - 1].values       # Labels = last column
-
-    # Dictionary that maps class -> samples
-    classes = {c: X[y == c] for c in np.unique(y) }
-
-    return X, y, classes
-
-
 def mle_mean(X: np.ndarray) -> np.ndarray:
     """
@@ -159,10 +143,13 @@ def compute_gaussian_grid(

     Returns
     -------
-    Xgrid, Ygrid, Z : ndarray, shape (grid_size, grid_size, grid_size)
-        X Meshgrid coordinates for dimensions 0 and 1,
-        Y Meshgrid coordinates for dimensions 0 and 1,
-        pdf values at each grid point.
+    tuple:
+        Xgrid : ndarray, shape (grid_size, grid_size)
+            X meshgrid coordinates for dimensions 0 and 1.
+        Ygrid : ndarray, shape (grid_size, grid_size)
+            Y meshgrid coordinates for dimensions 0 and 1.
+        Z : ndarray, shape (grid_size, grid_size)
+            pdf values at each grid point.
     """
     # Range only on the first two dimensions
     x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size)
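For illustration only (not part of the commit): a minimal sketch of what the Part A pipeline described above computes, re-deriving the MLE estimates and the pdf grid inline in plain NumPy rather than calling the module's mle_mean / compute_gaussian_grid helpers; the toy data is hypothetical.

import numpy as np

# Hypothetical 2-D samples standing in for one class of dataset1
rng = np.random.default_rng(0)
X = rng.normal(loc=[1.0, 2.0], scale=1.0, size=(200, 2))

# MLE estimates: sample mean and the biased (1/N) covariance
mu = X.mean(axis=0)
Sigma = (X - mu).T @ (X - mu) / X.shape[0]

# Grid over the first two dimensions, mirroring the docstring above
grid_size = 50
x_vals = np.linspace(X[:, 0].min(), X[:, 0].max(), grid_size)
y_vals = np.linspace(X[:, 1].min(), X[:, 1].max(), grid_size)
Xgrid, Ygrid = np.meshgrid(x_vals, y_vals)      # each of shape (grid_size, grid_size)

# Bivariate Gaussian pdf evaluated at every grid point
pts = np.stack([Xgrid.ravel(), Ygrid.ravel()], axis=1) - mu
quad = np.sum((pts @ np.linalg.inv(Sigma)) * pts, axis=1)   # squared Mahalanobis distance
Z = np.exp(-0.5 * quad) / (2.0 * np.pi * np.sqrt(np.linalg.det(Sigma)))
Z = Z.reshape(grid_size, grid_size)             # pdf surface, visualized in 3D by Part A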
src/partB.py | 16

@@ -198,7 +198,9 @@ def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
     Parameters
     ----------
     y_true : ndarray
+        actual labels array
     y_pred : ndarray
+        predicted labels array

     Returns
     -------
@@ -257,8 +259,11 @@ def plot_h_vs_error(h_values: np.ndarray, errors: np.ndarray, title: str) -> None:

     Parameters
     ----------
     h_values : ndarray
+        bandwidth values
     errors : ndarray
+        error values
     title : str
+        plot title
     """
     plt.figure(figsize=(8, 5))
     plt.plot(h_values, errors, marker='o')
@@ -274,6 +279,17 @@ def plot_histogram_with_pdf(
 ) -> None:
     """
     Plots a histogram of the data and overlays the true N(mu_true, var_true) pdf.
+
+    Parameters
+    ----------
+    data : ndarray
+        1D data samples.
+    mu_true : float
+        True mean, defaults to 1.0.
+    var_true : float
+        True variance, defaults to 4.0.
+    bins : int
+        Number of bins, defaults to 30.
     """
     plt.figure(figsize=(8, 5))

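For illustration only (not part of the commit): what plot_histogram_with_pdf is documented to draw, sketched on hypothetical synthetic data using the defaults stated in the docstring above (mu_true=1.0, var_true=4.0, bins=30).

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)
mu_true, var_true, bins = 1.0, 4.0, 30
data = rng.normal(mu_true, np.sqrt(var_true), size=1000)    # 1D samples

plt.figure(figsize=(8, 5))
plt.hist(data, bins=bins, density=True, alpha=0.5, label="histogram")

# Overlay the true N(mu_true, var_true) density
xs = np.linspace(data.min(), data.max(), 200)
pdf = np.exp(-(xs - mu_true) ** 2 / (2.0 * var_true)) / np.sqrt(2.0 * np.pi * var_true)
plt.plot(xs, pdf, label="true N(1, 4) pdf")
plt.legend()
plt.show()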
src/partC.py | 325 (new file)

@@ -0,0 +1,325 @@
# ------------------------------------------------------------
# Part C - k-Nearest Neighbors Classifier (k-NN)
# Pattern Recognition – Semester Assignment
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   This module implements Part C of the assignment:
#     - Implementation of a simple k-NN classifier in 2D
#     - Manual computation of Euclidean distances (no ML libraries)
#     - Probability estimation for any number of classes
#     - Accuracy evaluation for k ∈ [1, 30]
#     - Decision boundary visualization for the best k
# ------------------------------------------------------------

from typing import Sequence, Tuple
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from pandas import DataFrame

from toolbox import load_csv, split_dataset_by_class, dataset3, testset


# --------------------------------------------------
# Dataset loading
# --------------------------------------------------
def load_data(dataset: DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """
    Loads a dataset and splits it into features and labels.

    Returns
    -------
    tuple:
        X (ndarray, shape (N, d)):
            Feature vectors.
        y (ndarray, shape (N,)):
            Corresponding class labels.
    """
    df = load_csv(dataset, header=None)
    X, y, _ = split_dataset_by_class(df)
    return X, y


# --------------------------------------------------
# k-NN core functions
# --------------------------------------------------
def eucl(x: np.ndarray, trainData: np.ndarray) -> np.ndarray:
    """
    Computes Euclidean distance of x from all training samples.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Query point.
    trainData : ndarray, shape (N, d)
        Training feature vectors.

    Returns
    -------
    distances : ndarray, shape (N,)
        Euclidean distance from x to each training point.
    """
    diff = trainData - x    # shape (N, d)
    sq_dist = np.sum(diff * diff, axis=1)
    distances = np.sqrt(sq_dist)
    return distances


def neighbors(x: np.ndarray, data: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns the indices and distances of the k nearest neighbors of x.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Query point.
    data : ndarray, shape (N, d)
        Dataset to search for neighbors.
    k : int
        Number of neighbors to consider.

    Returns
    -------
    tuple:
        neighbor_indices : ndarray, shape (k,)
            Indices of the k nearest neighbors.
        neighbor_distances : ndarray, shape (k,)
            Distances of the k nearest neighbors (ascending order).
    """
    distances = eucl(x, data)
    sorted_indices = np.argsort(distances)
    neighbor_indices = sorted_indices[:k]
    neighbor_distances = distances[neighbor_indices]
    return neighbor_indices, neighbor_distances


def predict(
    X_test: np.ndarray, X_train: np.ndarray, y_train: np.ndarray, k: int
):
    """
    Predicts class probabilities and labels for each test sample using k-NN.
    Supports an arbitrary number of classes.

    Parameters
    ----------
    X_test : ndarray, shape (N_test, d)
        Test features.
    X_train : ndarray, shape (N_train, d)
        Training features.
    y_train : ndarray, shape (N_train,)
        Class labels (may be any discrete integers).
    k : int
        Number of neighbors to consider.

    Returns
    -------
    tuple:
        probs (ndarray, shape (N_test, C)):
            probs[i, j] = estimated probability of class classes[j] for sample i.
        y_pred (ndarray, shape (N_test,)):
            Predicted label for each test sample.
    """
    classes = np.unique(y_train)
    C = len(classes)
    N_test = X_test.shape[0]

    probs = np.zeros((N_test, C))
    y_pred = np.zeros(N_test, dtype=classes.dtype)

    for i in range(N_test):
        x = X_test[i]
        neighbor_indices, _ = neighbors(x, X_train, k)
        neighbor_labels = y_train[neighbor_indices]

        # Probabilities per class
        for j, c in enumerate(classes):
            probs[i, j] = np.sum(neighbor_labels == c) / k

        # Winner class
        y_pred[i] = classes[np.argmax(probs[i])]

    return probs, y_pred


# --------------------------------------------------
# Accuracy & model evaluation
# --------------------------------------------------
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Classification accuracy.

    Parameters
    ----------
    y_true : ndarray
        Actual labels.
    y_pred : ndarray
        Predicted labels.

    Returns
    -------
    acc : float
        Fraction of correctly classified samples.
    """
    return float(np.mean(y_true == y_pred))


def evaluate_over_k(
    X_train: np.ndarray, y_train: np.ndarray,
    X_test: np.ndarray, y_test: np.ndarray,
    k_values: Sequence[int],
) -> np.ndarray:
    """
    Evaluates k-NN accuracy for multiple values of k.

    Parameters
    ----------
    X_train, y_train :
        Training set.
    X_test, y_test :
        Test set.
    k_values : Sequence[int]
        Values of k to evaluate.

    Returns
    -------
    accuracies : ndarray, shape (len(k_values),)
        Accuracy for each value of k.
    """
    accuracies = np.zeros(len(k_values))

    for i, k in enumerate(k_values):
        _, y_pred = predict(X_test, X_train, y_train, k)
        accuracies[i] = accuracy(y_test, y_pred)

    return accuracies


def plot_accuracy_vs_k(k_values: np.ndarray, accuracies: np.ndarray) -> None:
    """
    Plots k on the x-axis and accuracy on the y-axis.

    Parameters
    ----------
    k_values : np.ndarray
        Values of k (sequence of int).
    accuracies : np.ndarray
        Accuracy for each k.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(k_values, accuracies, marker="o")
    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.title("k-NN accuracy over k")
    plt.grid(True)
    plt.show()


# --------------------------------------------------
# Decision boundary visualization
# --------------------------------------------------
def plot_decision_boundaries_2d(
    X_train: np.ndarray, y_train: np.ndarray, k: int, grid_size: int = 200
) -> None:
    """
    Plots the decision boundaries of the k-NN classifier in 2D using contourf.
    Supports any number of classes, but requires **exactly 2 features**.

    Parameters
    ----------
    X_train : ndarray, shape (N_train, 2)
        Training features.
    y_train : ndarray, shape (N_train,)
        Training labels.
    k : int
        Number of neighbors.
    grid_size : int
        Grid resolution for the contour.
    """
    # --- Check for 2D features ---
    if X_train.shape[1] != 2:
        raise ValueError(
            f"plot_decision_boundaries_2d supports only 2D features, "
            f"but got X_train with shape {X_train.shape}"
        )

    classes = np.unique(y_train)
    C = len(classes)
    class_to_idx = {c: idx for idx, c in enumerate(classes)}

    # Grid limits
    x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5
    y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_size),
        np.linspace(y_min, y_max, grid_size),
    )

    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    _, y_pred_grid = predict(grid_points, X_train, y_train, k)

    Z_idx = np.vectorize(class_to_idx.get)(y_pred_grid).reshape(xx.shape)

    # Discrete colormap
    cmap = plt.cm.get_cmap("Set2", C)
    levels = np.arange(C + 1) - 0.5

    plt.figure(figsize=(12, 8))

    # Filled boundaries
    plt.contourf(xx, yy, Z_idx, levels=levels, cmap=cmap, alpha=0.3)

    # Plot samples
    for c, idx in class_to_idx.items():
        mask = (y_train == c)
        plt.scatter(
            X_train[mask, 0], X_train[mask, 1],
            c=[cmap(idx)], edgecolors="k", s=30
        )

    # --- Custom legend: Region + Samples per class ---
    legend_elements = []
    for c, idx in class_to_idx.items():
        color = cmap(idx)
        legend_elements.append(Patch(facecolor=color, edgecolor="none",
                                     alpha=0.3, label=f"Region: class {c}"))
        legend_elements.append(Line2D([], [], marker="o", linestyle="",
                                      markerfacecolor=color,
                                      markeredgecolor="k",
                                      label=f"Samples: class {c}"))

    plt.legend(handles=legend_elements, loc="upper right", framealpha=0.9)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title(f"k-NN decision boundaries (k = {k})")
    plt.grid(True)
    plt.show()


# --------------------------------------------------
# Main runner
# --------------------------------------------------
if __name__ == "__main__":
|
||||||
|
# Load training and test sets
|
||||||
|
X_train, y_train = load_data(dataset=dataset3)
|
||||||
|
X_test, y_test = load_data(dataset=testset)
|
||||||
|
|
||||||
|
# Evaluate over k
|
||||||
|
k_values = np.arange(1, 31, 1)
|
||||||
|
accuracies = evaluate_over_k(X_train, y_train, X_test, y_test, k_values)
|
||||||
|
|
||||||
|
# Best k
|
||||||
|
best_idx = np.argmax(accuracies)
|
||||||
|
best_k = int(k_values[best_idx])
|
||||||
|
best_acc = accuracies[best_idx]
|
||||||
|
|
||||||
|
print(f"Best k: {best_k} with accuracy: {best_acc:.4f}")
|
||||||
|
|
||||||
|
# Plots
|
||||||
|
plot_accuracy_vs_k(k_values, accuracies)
|
||||||
|
plot_decision_boundaries_2d(X_train, y_train, best_k, grid_size=200)
|
||||||
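For illustration only (not part of the commit): a minimal usage sketch of the new module, assuming partC.py (and the toolbox module it imports) is on the import path, with hypothetical toy blobs standing in for dataset3/testset.

import numpy as np
from partC import predict, accuracy

rng = np.random.default_rng(42)
# Two well-separated 2-D blobs, labels 0 and 1
X_train = np.vstack([rng.normal(0.0, 1.0, (50, 2)),
                     rng.normal(4.0, 1.0, (50, 2))])
y_train = np.array([0] * 50 + [1] * 50)
X_test = np.array([[0.5, 0.0], [4.2, 3.8]])

probs, y_pred = predict(X_test, X_train, y_train, k=5)
print(probs)        # per-class vote fractions, shape (2, 2)
print(y_pred)       # [0 1] on blobs this well separated
print(accuracy(np.array([0, 1]), y_pred))   # 1.0 here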
src/toolbox.py

@@ -6,7 +6,11 @@
 # cchoutou@ece.auth.gr
 # ------------------------------------------------------------

+from typing import Tuple, Dict
+
+import numpy as np
 import pandas as pd
+from pandas import DataFrame


 def github_raw(user, repo, branch, path):
@@ -23,4 +27,39 @@ def load_csv(path, header=None):
     """
     Loads a CSV file and returns a pandas DataFrame.
     """
     return pd.read_csv(path, header=header)
+
+
+def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
+    """
+    Splits a dataset into features, labels and per-class subsets, assuming that:
+
+    - All columns except the last are feature columns.
+    - The last column is the class label.
+
+    Parameters
+    ----------
+    df: DataFrame
+        Data samples as DataFrame.
+
+    Returns
+    -------
+    tuple:
+        X : ndarray, shape (N, d)
+            Feature matrix.
+        y : ndarray, shape (N,)
+            Labels.
+        classes : dict
+            Dictionary mapping each class label to the subset of X that belongs to that class.
+
+    Example
+    -------
+    X, y, classes = split_dataset_by_class(df)
+    """
+    n_cols = df.shape[1]                # Number of columns
+    X = df.iloc[:, :n_cols - 1].values  # Features = all columns except last
+    y = df.iloc[:, n_cols - 1].values   # Labels = last column
+
+    classes = {c: X[y == c] for c in np.unique(y)}
+
+    return X, y, classes
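For illustration only (not part of the commit): a round trip of the relocated helper on a tiny hand-made DataFrame, assuming toolbox.py is importable.

import pandas as pd
from toolbox import split_dataset_by_class

df = pd.DataFrame([[1.0, 2.0, 0],
                   [3.0, 4.0, 1],
                   [5.0, 6.0, 1]])
X, y, classes = split_dataset_by_class(df)
print(X.shape)      # (3, 2): every column except the last
print(y)            # [0 1 1]: the last column
print(classes[1])   # the two feature rows whose label is 1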