From 1508d413d868a9b448d34e0f33094e3b58226a27 Mon Sep 17 00:00:00 2001 From: Christos Choutouridis Date: Thu, 11 Dec 2025 18:06:27 +0200 Subject: [PATCH] Part C local code added --- src/partA.py | 71 +++++------ src/partB.py | 16 +++ src/partC.py | 325 +++++++++++++++++++++++++++++++++++++++++++++++++ src/toolbox.py | 41 ++++++- 4 files changed, 410 insertions(+), 43 deletions(-) create mode 100644 src/partC.py diff --git a/src/partA.py b/src/partA.py index 468cd5b..140851f 100644 --- a/src/partA.py +++ b/src/partA.py @@ -1,45 +1,29 @@ +# ------------------------------------------------------------ +# Part A - Gaussian Parameter Estimation (MLE) & Visualization +# Pattern Recognition – Semester Assignment +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module implements Part A of the assignment: +# - Loading and splitting the dataset into classes +# - MLE estimation of mean vectors and covariance matrices +# - Construction of Gaussian pdf surfaces +# - 3D visualization of class-conditional densities +# +# Notes: +# The implementation follows the theoretical formulation of +# multivariate Gaussian distributions and MLE parameter +# estimation as taught in class. +# ------------------------------------------------------------ + import matplotlib.pyplot as plt import numpy as np -from toolbox import * +from toolbox import load_csv, split_dataset_by_class, dataset1 from typing import Tuple, Dict -from pandas import DataFrame - - -# -------------------------------------------------- -# Part A: dataset splitting -# -------------------------------------------------- -def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]: - """ - Splits a dataset into features, labels and per-class subsets with the assumptions that: - - All columns except the last are feature columns. - - The last column is the class label. 
- - Parameters - ---------- - df: DataFrame - Data samples as DataFrame. - - Returns - ------- - X : ndarray, shape (N, d), y : ndarray, shape (N,), classes : dict: - Feature matrix, - Labels, - Dictionary mapping each class label to the subset of X that belongs to that class. - - Example - ------- - X, y, classes = split_dataset_by_class(df) - """ - n_cols = df.shape[1] # Number of columns - X = df.iloc[:, :n_cols - 1].values # Features = all columns except last - y = df.iloc[:, n_cols - 1].values # Labels = last column - - # Dictionary that maps class -> samples - classes = {c: X[y == c] for c in np.unique(y) } - - return X, y, classes - def mle_mean(X: np.ndarray) -> np.ndarray: """ @@ -159,10 +143,13 @@ def compute_gaussian_grid( Returns ------- - Xgrid, Ygrid, Z : ndarray, shape (grid_size, grid_size, grid_size) - X Meshgrid coordinates for dimensions 0 and 1, - Y Meshgrid coordinates for dimensions 0 and 1, - pdf values at each grid point. + tuple: + Xgrid: ndarray, shape (grid_size) + X Meshgrid coordinates for dimensions 0 and 1 + Ygrid: ndarray, shape (grid_size) + Y Meshgrid coordinates for dimensions 0 and 1, + Z: ndarray, shape (grid_size) + pdf values at each grid point. 
""" # Range only on the first two dimensions x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size) diff --git a/src/partB.py b/src/partB.py index 913d49f..c95bad9 100644 --- a/src/partB.py +++ b/src/partB.py @@ -198,7 +198,9 @@ def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float: Parameters ---------- y_true : ndarray + actual labels array y_pred : ndarray + predicted labels array Returns ------- @@ -257,8 +259,11 @@ def plot_h_vs_error(h_values: np.ndarray, errors: np.ndarray, title: str) -> Non Parameters ---------- h_values : ndarray + bandwidth values errors : ndarray + error values title : str + title """ plt.figure(figsize=(8, 5)) plt.plot(h_values, errors, marker='o') @@ -274,6 +279,17 @@ def plot_histogram_with_pdf( ) -> None: """ Plots a histogram of the data and overlays the true N(mu_true, var_true) pdf. + + Parameters + ---------- + data : ndarray + 1D data samples. + mu_true : float + True mean, default to 1.0. + var_true : float + True variance, default to 4.0. + bins : int + number of bins, default to 30. 
""" plt.figure(figsize=(8, 5)) diff --git a/src/partC.py b/src/partC.py new file mode 100644 index 0000000..1fa2d66 --- /dev/null +++ b/src/partC.py @@ -0,0 +1,325 @@ +# ------------------------------------------------------------ +# Part C - k-Nearest Neighbors Classifier (k-NN) +# Pattern Recognition – Semester Assignment +# +# Author: +# Christos Choutouridis (ΑΕΜ 8997) +# cchoutou@ece.auth.gr +# +# Description: +# This module implements Part C of the assignment: +# - Implementation of a simple k-NN classifier in 2D +# - Manual computation of Euclidean distances (no ML libraries) +# - Probability estimation for any number of classes +# - Accuracy evaluation for k ∈ [1, 30] +# - Decision boundary visualization for the best k +# ------------------------------------------------------------ + +from typing import Sequence, Tuple +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.lines import Line2D +from matplotlib.patches import Patch +from pandas import DataFrame + +from toolbox import load_csv, split_dataset_by_class, dataset3, testset + + +# -------------------------------------------------- +# Dataset loading +# -------------------------------------------------- +def load_data(dataset: DataFrame) -> Tuple[np.ndarray, np.ndarray]: + """ + Loads dataset and splits it into features and labels. + + Returns + ------- + tuple: + X (ndarray, shape (N, d)): + Feature vectors. + y (ndarray, shape (N,)): + Corresponding class labels. + """ + df = load_csv(dataset, header=None) + X, y, _ = split_dataset_by_class(df) + return X, y + + +# -------------------------------------------------- +# k-NN core functions +# -------------------------------------------------- +def eucl(x: np.ndarray, trainData: np.ndarray) -> np.ndarray: + """ + Computes Euclidean distance of x from all training samples. + + Parameters + ---------- + x : ndarray, shape (d,) + Query point. + trainData : ndarray, shape (N, d) + Training feature vectors. 
+ + Returns + ------- + distances : ndarray, shape (N,) + Euclidean distance from x to each training point. + """ + diff = trainData - x # shape (N, d) + sq_dist = np.sum(diff * diff, axis=1) + distances = np.sqrt(sq_dist) + return distances + + +def neighbors(x: np.ndarray, data: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]: + """ + Returns the indices and distances of the k nearest neighbors of x. + + Parameters + ---------- + x : ndarray, shape (d,) + data point + data : ndarray, shape (N, d) + dataset to search neighbors + k : int + Number of neighbors to consider + + Returns + ------- + tuple: + neighbor_indices : ndarray, shape (k,) + Indices of the k nearest neighbors. + neighbor_distances : ndarray, shape (k,) + Distances of the k nearest neighbors (ascending order). + """ + distances = eucl(x, data) + sorted_indices = np.argsort(distances) + neighbor_indices = sorted_indices[:k] + neighbor_distances = distances[neighbor_indices] + return neighbor_indices, neighbor_distances + + +def predict( + X_test: np.ndarray, X_train: np.ndarray, y_train: np.ndarray, k: int +): + """ + Predicts class probabilities and labels for each test sample using k-NN. + Supports an arbitrary number of classes. + + Parameters + ---------- + X_test : ndarray, shape (N_test, d) + test features + X_train : ndarray, shape (N_train, d) + train features + y_train : ndarray, shape (N_train,) + Class labels (may be any discrete integers). + k : int + number of neighbors to consider + + Returns + ------- + tuple: + probs (ndarray, shape (N_test, C)): + probs[i, j] = estimated probability of class classes[j] for sample i. + y_pred (ndarray, shape (N_test,)): + Predicted label for each test sample. 
+ """ + classes = np.unique(y_train) + C = len(classes) + N_test = X_test.shape[0] + + probs = np.zeros((N_test, C)) + y_pred = np.zeros(N_test, dtype=classes.dtype) + + for i in range(N_test): + x = X_test[i] + neighbor_indices, _ = neighbors(x, X_train, k) + neighbor_labels = y_train[neighbor_indices] + + # Probabilities per class + for j, c in enumerate(classes): + probs[i, j] = np.sum(neighbor_labels == c) / k + + # Winner class + y_pred[i] = classes[np.argmax(probs[i])] + + return probs, y_pred + + +# -------------------------------------------------- +# Accuracy & model evaluation +# -------------------------------------------------- +def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float: + """ + Classification accuracy. + + Parameters + ---------- + y_true : ndarray + actual labels + y_pred : ndarray + predicted labels + + Returns + ------- + acc : float + Fraction of correctly classified samples. + """ + return float(np.mean(y_true == y_pred)) + + +def evaluate_over_k( + X_train: np.ndarray, y_train: np.ndarray, + X_test: np.ndarray, y_test: np.ndarray, + k_values: Sequence[int], +) -> np.ndarray: + """ + Evaluates k-NN accuracy for multiple values of k. + + Parameters + ---------- + X_train, y_train: + training set + X_test, y_test: + test set + k_values : + sequence of int + + Returns + ------- + accuracies : ndarray, shape (len(k_values),) + Accuracy for each value of k. + """ + accuracies = np.zeros(len(k_values)) + + for i, k in enumerate(k_values): + _, y_pred = predict(X_test, X_train, y_train, k) + accuracies[i] = accuracy(y_test, y_pred) + + return accuracies + + +def plot_accuracy_vs_k(k_values: np.ndarray, accuracies: np.ndarray) -> None: + """ + Plots k on the x-axis and accuracy on the y-axis. 
+ + Parameters + ---------- + k_values: np.ndarray + sequence of int + accuracies: np.ndarray + accuracies array + """ + plt.figure(figsize=(10, 6)) + plt.plot(k_values, accuracies, marker="o") + plt.xlabel("k") + plt.ylabel("Accuracy") + plt.title("k-NN accuracy over k") + plt.grid(True) + plt.show() + + +# -------------------------------------------------- +# Decision boundary visualization +# -------------------------------------------------- +def plot_decision_boundaries_2d( + X_train: np.ndarray, y_train: np.ndarray, k: int, grid_size: int = 200 +) -> None: + """ + Plots the decision boundaries of the k-NN classifier in 2D using contourf. + Supports any number of classes, but requires **exactly 2 features**. + + Parameters + ---------- + X_train : ndarray, shape (N_train, 2) + training features + y_train : ndarray, shape (N_train,) + training labels + k : int + Number of neighbors. + grid_size : int + Grid resolution for the contour. + """ + # --- Check for 2D features --- + if X_train.shape[1] != 2: + raise ValueError( + f"plot_decision_boundaries_2d supports only 2D features, " + f"but got X_train with shape {X_train.shape}" + ) + + classes = np.unique(y_train) + C = len(classes) + class_to_idx = {c: idx for idx, c in enumerate(classes)} + + # Grid limits + x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5 + y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5 + + xx, yy = np.meshgrid( + np.linspace(x_min, x_max, grid_size), + np.linspace(y_min, y_max, grid_size), + ) + + grid_points = np.column_stack([xx.ravel(), yy.ravel()]) + _, y_pred_grid = predict(grid_points, X_train, y_train, k) + + Z_idx = np.vectorize(class_to_idx.get)(y_pred_grid).reshape(xx.shape) + + # Discrete colormap + cmap = plt.cm.get_cmap("Set2", C) + levels = np.arange(C + 1) - 0.5 + + plt.figure(figsize=(12, 8)) + + # Filled boundaries + plt.contourf(xx, yy, Z_idx, levels=levels, cmap=cmap, alpha=0.3) + + # Plot samples + for c, idx in 
class_to_idx.items(): + mask = (y_train == c) + plt.scatter( + X_train[mask, 0], X_train[mask, 1], + c=[cmap(idx)], edgecolors="k", s=30 + ) + + # --- Custom legend: Region + Samples per class --- + legend_elements = [] + for c, idx in class_to_idx.items(): + color = cmap(idx) + legend_elements.append(Patch(facecolor=color, edgecolor="none", + alpha=0.3, label=f"Region: class {c}")) + legend_elements.append(Line2D([], [], marker="o", linestyle="", + markerfacecolor=color, + markeredgecolor="k", + label=f"Samples: class {c}")) + + plt.legend(handles=legend_elements, loc="upper right", framealpha=0.9) + plt.xlabel("x1") + plt.ylabel("x2") + plt.title(f"k-NN decision boundaries (k = {k})") + plt.grid(True) + plt.show() + + +# -------------------------------------------------- +# Main runner +# -------------------------------------------------- +if __name__ == "__main__": + # Load training and test sets + X_train, y_train = load_data(dataset=dataset3) + X_test, y_test = load_data(dataset=testset) + + # Evaluate over k + k_values = np.arange(1, 31, 1) + accuracies = evaluate_over_k(X_train, y_train, X_test, y_test, k_values) + + # Best k + best_idx = np.argmax(accuracies) + best_k = int(k_values[best_idx]) + best_acc = accuracies[best_idx] + + print(f"Best k: {best_k} with accuracy: {best_acc:.4f}") + + # Plots + plot_accuracy_vs_k(k_values, accuracies) + plot_decision_boundaries_2d(X_train, y_train, best_k, grid_size=200) diff --git a/src/toolbox.py b/src/toolbox.py index 969c87e..f29073a 100644 --- a/src/toolbox.py +++ b/src/toolbox.py @@ -6,7 +6,11 @@ # cchoutou@ece.auth.gr # ------------------------------------------------------------ +from typing import Tuple, Dict + +import numpy as np import pandas as pd +from pandas import DataFrame def github_raw(user, repo, branch, path): @@ -23,4 +27,39 @@ def load_csv(path, header=None): """ Loads a CSV file and returns a pandas DataFrame. 
""" - return pd.read_csv(path, header=header) \ No newline at end of file + return pd.read_csv(path, header=header) + + +def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]: + """ + Splits a dataset into features, labels and per-class subsets with the assumptions that: + + - All columns except the last are feature columns. + - The last column is the class label. + + Parameters + ---------- + df: DataFrame + Data samples as DataFrame. + + Returns + ------- + tuple: + X : ndarray, shape (N, d) + Feature matrix. + y : ndarray, shape (N,) + Labels. + classes : dict + Dictionary mapping each class label to the subset of X that belongs to that class. + + Example + ------- + X, y, classes = split_dataset_by_class(df) + """ + n_cols = df.shape[1] # Number of columns + X = df.iloc[:, :n_cols - 1].values # Features = all columns except last + y = df.iloc[:, n_cols - 1].values # Labels = last column + + classes = {c: X[y == c] for c in np.unique(y)} + + return X, y, classes