Part C local code added

Christos Choutouridis 2025-12-11 18:06:27 +02:00
parent cc6b742553
commit 1508d413d8
4 changed files with 410 additions and 43 deletions

View File

@@ -1,45 +1,29 @@
# ------------------------------------------------------------
# Part A - Gaussian Parameter Estimation (MLE) & Visualization
# Pattern Recognition Semester Assignment
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements Part A of the assignment:
# - Loading and splitting the dataset into classes
# - MLE estimation of mean vectors and covariance matrices
# - Construction of Gaussian pdf surfaces
# - 3D visualization of class-conditional densities
#
# Notes:
# The implementation follows the theoretical formulation of
# multivariate Gaussian distributions and MLE parameter
# estimation as taught in class.
# ------------------------------------------------------------
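# For reference, a sketch of the standard MLE estimators this module
# implements (matching the multivariate Gaussian formulation from class):
#   mu_hat    = (1/N) * sum_i x_i
#   Sigma_hat = (1/N) * sum_i (x_i - mu_hat)(x_i - mu_hat)^T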
import matplotlib.pyplot as plt
import numpy as np
from toolbox import *
from toolbox import load_csv, split_dataset_by_class, dataset1
from typing import Tuple, Dict
from pandas import DataFrame
# --------------------------------------------------
# Part A: dataset splitting
# --------------------------------------------------
def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
"""
Splits a dataset into features, labels, and per-class subsets, under the assumptions that:
- All columns except the last are feature columns.
- The last column is the class label.
Parameters
----------
df: DataFrame
Data samples as DataFrame.
Returns
-------
X : ndarray, shape (N, d)
Feature matrix.
y : ndarray, shape (N,)
Labels.
classes : dict
Dictionary mapping each class label to the subset of X that belongs to that class.
Example
-------
X, y, classes = split_dataset_by_class(df)
"""
n_cols = df.shape[1] # Number of columns
X = df.iloc[:, :n_cols - 1].values # Features = all columns except last
y = df.iloc[:, n_cols - 1].values # Labels = last column
# Dictionary that maps class -> samples
classes = {c: X[y == c] for c in np.unique(y) }
return X, y, classes
def mle_mean(X: np.ndarray) -> np.ndarray:
"""
@@ -159,10 +143,13 @@ def compute_gaussian_grid(
Returns
-------
Xgrid, Ygrid, Z : ndarray, shape (grid_size, grid_size, grid_size)
X Meshgrid coordinates for dimensions 0 and 1,
Y Meshgrid coordinates for dimensions 0 and 1,
pdf values at each grid point.
tuple:
Xgrid: ndarray, shape (grid_size, grid_size)
Meshgrid x-coordinates over feature dimensions 0 and 1.
Ygrid: ndarray, shape (grid_size, grid_size)
Meshgrid y-coordinates over feature dimensions 0 and 1.
Z: ndarray, shape (grid_size, grid_size)
pdf values at each grid point.
"""
# Range only on the first two dimensions
x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size)

View File

@@ -198,7 +198,9 @@ def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
Parameters
----------
y_true : ndarray
Actual (ground-truth) labels.
y_pred : ndarray
Predicted labels.
Returns
-------
@@ -257,8 +259,11 @@ def plot_h_vs_error(h_values: np.ndarray, errors: np.ndarray, title: str) -> Non
Parameters
----------
h_values : ndarray
Bandwidth values.
errors : ndarray
Error value for each bandwidth.
title : str
Plot title.
"""
plt.figure(figsize=(8, 5))
plt.plot(h_values, errors, marker='o')
@@ -274,6 +279,17 @@ def plot_histogram_with_pdf(
) -> None:
"""
Plots a histogram of the data and overlays the true N(mu_true, var_true) pdf.
Parameters
----------
data : ndarray
1D data samples.
mu_true : float
True mean; defaults to 1.0.
var_true : float
True variance; defaults to 4.0.
bins : int
Number of histogram bins; defaults to 30.
"""
plt.figure(figsize=(8, 5))

src/partC.py (new file, 325 lines)
View File

@@ -0,0 +1,325 @@
# ------------------------------------------------------------
# Part C - k-Nearest Neighbors Classifier (k-NN)
# Pattern Recognition Semester Assignment
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements Part C of the assignment:
# - Implementation of a simple k-NN classifier in 2D
# - Manual computation of Euclidean distances (no ML libraries)
# - Probability estimation for any number of classes
# - Accuracy evaluation for k ∈ [1, 30]
# - Decision boundary visualization for the best k
# ------------------------------------------------------------
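# Working definition used throughout (the standard k-NN posterior estimate,
# exactly what predict() below computes):
#   P(class c | x) ~= (number of the k nearest neighbors of x labeled c) / k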
from typing import Sequence, Tuple
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from toolbox import load_csv, split_dataset_by_class, dataset3, testset
# --------------------------------------------------
# Dataset loading
# --------------------------------------------------
def load_data(dataset: str) -> Tuple[np.ndarray, np.ndarray]:
"""
Loads a dataset from a CSV path or URL and splits it into features and labels.
Parameters
----------
dataset : str
Path or URL of the CSV file to load (passed to load_csv).
Returns
-------
tuple:
X (ndarray, shape (N, d)):
Feature vectors.
y (ndarray, shape (N,)):
Corresponding class labels.
"""
df = load_csv(dataset, header=None)
X, y, _ = split_dataset_by_class(df)
return X, y
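# Illustrative usage (assuming dataset3/testset resolve to CSV paths or URLs
# in toolbox):
#   X_train, y_train = load_data(dataset=dataset3)
#   X_train.shape  # -> (N, 2) for this assignment's 2D data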
# --------------------------------------------------
# k-NN core functions
# --------------------------------------------------
def eucl(x: np.ndarray, trainData: np.ndarray) -> np.ndarray:
"""
Computes Euclidean distance of x from all training samples.
Parameters
----------
x : ndarray, shape (d,)
Query point.
trainData : ndarray, shape (N, d)
Training feature vectors.
Returns
-------
distances : ndarray, shape (N,)
Euclidean distance from x to each training point.
"""
diff = trainData - x # shape (N, d)
sq_dist = np.sum(diff * diff, axis=1)
distances = np.sqrt(sq_dist)
return distances
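# Quick sanity check (hypothetical values):
#   eucl(np.array([0.0, 0.0]), np.array([[3.0, 4.0], [6.0, 8.0]]))
#   -> array([ 5., 10.])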
def neighbors(x: np.ndarray, data: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
"""
Returns the indices and distances of the k nearest neighbors of x.
Parameters
----------
x : ndarray, shape (d,)
Query point.
data : ndarray, shape (N, d)
Dataset in which to search for neighbors.
k : int
Number of neighbors to consider.
Returns
-------
tuple:
neighbor_indices : ndarray, shape (k,)
Indices of the k nearest neighbors.
neighbor_distances : ndarray, shape (k,)
Distances of the k nearest neighbors (ascending order).
"""
distances = eucl(x, data)
sorted_indices = np.argsort(distances)
neighbor_indices = sorted_indices[:k]
neighbor_distances = distances[neighbor_indices]
return neighbor_indices, neighbor_distances
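# Example sketch (hypothetical values): for data = [[0, 0], [1, 1], [5, 5]],
# x = [0.9, 0.9] and k = 2, this returns indices [1, 0] with distances of
# approximately [0.141, 1.273] (ascending order).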
def predict(
X_test: np.ndarray, X_train: np.ndarray, y_train: np.ndarray, k: int
) -> Tuple[np.ndarray, np.ndarray]:
"""
Predicts class probabilities and labels for each test sample using k-NN.
Supports an arbitrary number of classes.
Parameters
----------
X_test : ndarray, shape (N_test, d)
Test feature vectors.
X_train : ndarray, shape (N_train, d)
Training feature vectors.
y_train : ndarray, shape (N_train,)
Class labels (may be any discrete integers).
k : int
Number of neighbors to consider.
Returns
-------
tuple:
probs (ndarray, shape (N_test, C)):
probs[i, j] = estimated probability of class classes[j] for sample i.
y_pred (ndarray, shape (N_test,)):
Predicted label for each test sample.
"""
classes = np.unique(y_train)
C = len(classes)
N_test = X_test.shape[0]
probs = np.zeros((N_test, C))
y_pred = np.zeros(N_test, dtype=classes.dtype)
for i in range(N_test):
x = X_test[i]
neighbor_indices, _ = neighbors(x, X_train, k)
neighbor_labels = y_train[neighbor_indices]
# Probabilities per class
for j, c in enumerate(classes):
probs[i, j] = np.sum(neighbor_labels == c) / k
# Winner class
y_pred[i] = classes[np.argmax(probs[i])]
return probs, y_pred
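# Illustrative call (hypothetical arrays):
#   probs, y_pred = predict(X_test, X_train, y_train, k=5)
# Each row of probs sums to 1 (k neighbors, each contributing 1/k), and
# y_pred[i] is the majority label among the k neighbors (ties go to the
# first class by np.argmax).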
# --------------------------------------------------
# Accuracy & model evaluation
# --------------------------------------------------
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
"""
Classification accuracy.
Parameters
----------
y_true : ndarray
Actual (ground-truth) labels.
y_pred : ndarray
Predicted labels.
Returns
-------
acc : float
Fraction of correctly classified samples.
"""
return float(np.mean(y_true == y_pred))
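# e.g. accuracy(np.array([0, 1, 1]), np.array([0, 1, 0])) -> 0.666...
# (2 of 3 samples correct)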
def evaluate_over_k(
X_train: np.ndarray, y_train: np.ndarray,
X_test: np.ndarray, y_test: np.ndarray,
k_values: Sequence[int],
) -> np.ndarray:
"""
Evaluates k-NN accuracy for multiple values of k.
Parameters
----------
X_train, y_train : ndarray
Training features and labels.
X_test, y_test : ndarray
Test features and labels.
k_values : Sequence[int]
Values of k to evaluate.
Returns
-------
accuracies : ndarray, shape (len(k_values),)
Accuracy for each value of k.
"""
accuracies = np.zeros(len(k_values))
for i, k in enumerate(k_values):
_, y_pred = predict(X_test, X_train, y_train, k)
accuracies[i] = accuracy(y_test, y_pred)
return accuracies
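# Typical use (mirrors the main runner below):
#   accs = evaluate_over_k(X_train, y_train, X_test, y_test, np.arange(1, 31))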
def plot_accuracy_vs_k(k_values: np.ndarray, accuracies: np.ndarray) -> None:
"""
Plots k on the x-axis and accuracy on the y-axis.
Parameters
----------
k_values: np.ndarray
Evaluated values of k.
accuracies: np.ndarray
Accuracy for each value of k.
"""
plt.figure(figsize=(10, 6))
plt.plot(k_values, accuracies, marker="o")
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.title("k-NN accuracy over k")
plt.grid(True)
plt.show()
# --------------------------------------------------
# Decision boundary visualization
# --------------------------------------------------
def plot_decision_boundaries_2d(
X_train: np.ndarray, y_train: np.ndarray, k: int, grid_size: int = 200
) -> None:
"""
Plots the decision boundaries of the k-NN classifier in 2D using contourf.
Supports any number of classes, but requires **exactly 2 features**.
Parameters
----------
X_train : ndarray, shape (N_train, 2)
Training feature vectors.
y_train : ndarray, shape (N_train,)
Training labels.
k : int
Number of neighbors.
grid_size : int
Grid resolution for the contour.
"""
# --- Check for 2D features ---
if X_train.shape[1] != 2:
raise ValueError(
f"plot_decision_boundaries_2d supports only 2D features, "
f"but got X_train with shape {X_train.shape}"
)
classes = np.unique(y_train)
C = len(classes)
class_to_idx = {c: idx for idx, c in enumerate(classes)}
# Grid limits
x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5
y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5
xx, yy = np.meshgrid(
np.linspace(x_min, x_max, grid_size),
np.linspace(y_min, y_max, grid_size),
)
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
_, y_pred_grid = predict(grid_points, X_train, y_train, k)
Z_idx = np.vectorize(class_to_idx.get)(y_pred_grid).reshape(xx.shape)
# Discrete colormap
cmap = plt.get_cmap("Set2", C)  # plt.cm.get_cmap was removed in Matplotlib 3.9
levels = np.arange(C + 1) - 0.5
plt.figure(figsize=(12, 8))
# Filled boundaries
plt.contourf(xx, yy, Z_idx, levels=levels, cmap=cmap, alpha=0.3)
# Plot samples
for c, idx in class_to_idx.items():
mask = (y_train == c)
plt.scatter(
X_train[mask, 0], X_train[mask, 1],
c=[cmap(idx)], edgecolors="k", s=30
)
# --- Custom legend: Region + Samples per class ---
legend_elements = []
for c, idx in class_to_idx.items():
color = cmap(idx)
legend_elements.append(Patch(facecolor=color, edgecolor="none",
alpha=0.3, label=f"Region: class {c}"))
legend_elements.append(Line2D([], [], marker="o", linestyle="",
markerfacecolor=color,
markeredgecolor="k",
label=f"Samples: class {c}"))
plt.legend(handles=legend_elements, loc="upper right", framealpha=0.9)
plt.xlabel("x1")
plt.ylabel("x2")
plt.title(f"k-NN decision boundaries (k = {k})")
plt.grid(True)
plt.show()
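# Note: predict() loops over all grid_size**2 query points, which dominates
# the runtime here. A numpy-only vectorized alternative (a sketch, not used
# above; memory grows as M*N for M grid points and N training samples):
#   D = np.sqrt(((grid_points[:, None, :] - X_train[None, :, :]) ** 2).sum(-1))
#   nearest = np.argsort(D, axis=1)[:, :k]  # (M, k) neighbor indices per point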
# --------------------------------------------------
# Main runner
# --------------------------------------------------
if __name__ == "__main__":
# Load training and test sets
X_train, y_train = load_data(dataset=dataset3)
X_test, y_test = load_data(dataset=testset)
# Evaluate over k
k_values = np.arange(1, 31, 1)
accuracies = evaluate_over_k(X_train, y_train, X_test, y_test, k_values)
# Best k
best_idx = np.argmax(accuracies)
best_k = int(k_values[best_idx])
best_acc = accuracies[best_idx]
print(f"Best k: {best_k} with accuracy: {best_acc:.4f}")
# Plots
plot_accuracy_vs_k(k_values, accuracies)
plot_decision_boundaries_2d(X_train, y_train, best_k, grid_size=200)

View File

@@ -6,7 +6,11 @@
# cchoutou@ece.auth.gr
# ------------------------------------------------------------
from typing import Tuple, Dict
import numpy as np
import pandas as pd
from pandas import DataFrame
def github_raw(user, repo, branch, path):
@@ -23,4 +27,39 @@ def load_csv(path, header=None):
"""
Loads a CSV file and returns a pandas DataFrame.
"""
return pd.read_csv(path, header=header)
def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
"""
Splits a dataset into features, labels, and per-class subsets, under the assumptions that:
- All columns except the last are feature columns.
- The last column is the class label.
Parameters
----------
df: DataFrame
Data samples as DataFrame.
Returns
-------
tuple:
X : ndarray, shape (N, d)
Feature matrix.
y : ndarray, shape (N,)
Labels.
classes : dict
Dictionary mapping each class label to the subset of X that belongs to that class.
Example
-------
X, y, classes = split_dataset_by_class(df)
"""
n_cols = df.shape[1] # Number of columns
X = df.iloc[:, :n_cols - 1].values # Features = all columns except last
y = df.iloc[:, n_cols - 1].values # Labels = last column
classes = {c: X[y == c] for c in np.unique(y)}
return X, y, classes
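# Quick illustration (hypothetical 3-sample frame):
#   df = pd.DataFrame([[0.1, 0.2, 0], [0.3, 0.4, 1], [0.5, 0.6, 1]])
#   X, y, classes = split_dataset_by_class(df)
#   classes[1].shape  # -> (2, 2): the two samples labeled 1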