# ------------------------------------------------------------ # Part C - k-Nearest Neighbors Classifier (k-NN) # Pattern Recognition – Semester Assignment # # Author: # Christos Choutouridis (ΑΕΜ 8997) # cchoutou@ece.auth.gr # # Description: # This module implements Part C of the assignment: # - Implementation of a simple k-NN classifier in 2D # - Manual computation of Euclidean distances (no ML libraries) # - Probability estimation for any number of classes # - Accuracy evaluation for k ∈ [1, 30] # - Decision boundary visualization for the best k # ------------------------------------------------------------ from typing import Sequence, Tuple import numpy as np import matplotlib.pyplot as plt from matplotlib.lines import Line2D from matplotlib.patches import Patch from pandas import DataFrame from toolbox import load_csv, split_dataset_by_class, dataset3, testset # -------------------------------------------------- # Dataset loading # -------------------------------------------------- def load_data(dataset: DataFrame) -> Tuple[np.ndarray, np.ndarray]: """ Loads dataset and splits it into features and labels. Returns ------- tuple: X (ndarray, shape (N, d)): Feature vectors. y (ndarray, shape (N,)): Corresponding class labels. """ df = load_csv(dataset, header=None) X, y, _ = split_dataset_by_class(df) return X, y # -------------------------------------------------- # k-NN core functions # -------------------------------------------------- def eucl(x: np.ndarray, trainData: np.ndarray) -> np.ndarray: """ Computes Euclidean distance of x from all training samples. Parameters ---------- x : ndarray, shape (d,) Query point. trainData : ndarray, shape (N, d) Training feature vectors. Returns ------- distances : ndarray, shape (N,) Euclidean distance from x to each training point. """ diff = trainData - x # shape (N, d) sq_dist = np.sum(diff * diff, axis=1) distances = np.sqrt(sq_dist) return distances def neighbors(x: np.ndarray, data: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]: """ Returns the indices and distances of the k nearest neighbors of x. Parameters ---------- x : ndarray, shape (d,) data point data : ndarray, shape (N, d) dataset to search neighbors k : int Number of neighbors to consider Returns ------- tuple: neighbor_indices : ndarray, shape (k,) Indices of the k nearest neighbors. neighbor_distances : ndarray, shape (k,) Distances of the k nearest neighbors (ascending order). """ distances = eucl(x, data) sorted_indices = np.argsort(distances) neighbor_indices = sorted_indices[:k] neighbor_distances = distances[neighbor_indices] return neighbor_indices, neighbor_distances def predict( X_test: np.ndarray, X_train: np.ndarray, y_train: np.ndarray, k: int ): """ Predicts class probabilities and labels for each test sample using k-NN. Supports an arbitrary number of classes. Parameters ---------- X_test : ndarray, shape (N_test, d) test features X_train : ndarray, shape (N_train, d) train features y_train : ndarray, shape (N_train,) Class labels (may be any discrete integers). k : int number of neighbors to consider Returns ------- tuple: probs (ndarray, shape (N_test, C)): probs[i, j] = estimated probability of class classes[j] for sample i. y_pred (ndarray, shape (N_test,)): Predicted label for each test sample. """ classes = np.unique(y_train) C = len(classes) N_test = X_test.shape[0] probs = np.zeros((N_test, C)) y_pred = np.zeros(N_test, dtype=classes.dtype) for i in range(N_test): x = X_test[i] neighbor_indices, _ = neighbors(x, X_train, k) neighbor_labels = y_train[neighbor_indices] # Probabilities per class for j, c in enumerate(classes): probs[i, j] = np.sum(neighbor_labels == c) / k # Winner class y_pred[i] = classes[np.argmax(probs[i])] return probs, y_pred # -------------------------------------------------- # Accuracy & model evaluation # -------------------------------------------------- def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float: """ Classification accuracy. Parameters ---------- y_true : ndarray actual labels y_pred : ndarray predicted labels Returns ------- acc : float Fraction of correctly classified samples. """ return float(np.mean(y_true == y_pred)) def evaluate_over_k( X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray, k_values: Sequence[int], ) -> np.ndarray: """ Evaluates k-NN accuracy for multiple values of k. Parameters ---------- X_train, y_train: training set X_test, y_test: test set k_values : sequence of int Returns ------- accuracies : ndarray, shape (len(k_values),) Accuracy for each value of k. """ accuracies = np.zeros(len(k_values)) for i, k in enumerate(k_values): _, y_pred = predict(X_test, X_train, y_train, k) accuracies[i] = accuracy(y_test, y_pred) return accuracies def plot_accuracy_vs_k(k_values: np.ndarray, accuracies: np.ndarray) -> None: """ Plots k on the x-axis and accuracy on the y-axis. Parameters ---------- k_values: np.ndarray sequence of int accuracies: np.ndarray accuracies array """ plt.figure(figsize=(10, 6)) plt.plot(k_values, accuracies, marker="o") plt.xlabel("k") plt.ylabel("Accuracy") plt.title("k-NN accuracy over k") plt.grid(True) plt.show() # -------------------------------------------------- # Decision boundary visualization # -------------------------------------------------- def plot_decision_boundaries_2d( X_train: np.ndarray, y_train: np.ndarray, k: int, grid_size: int = 200 ) -> None: """ Plots the decision boundaries of the k-NN classifier in 2D using contourf. Supports any number of classes, but requires **exactly 2 features**. Parameters ---------- X_train : ndarray, shape (N_train, 2) training features y_train : ndarray, shape (N_train,) training labels k : int Number of neighbors. grid_size : int Grid resolution for the contour. """ # --- Check for 2D features --- if X_train.shape[1] != 2: raise ValueError( f"plot_decision_boundaries_2d supports only 2D features, " f"but got X_train with shape {X_train.shape}" ) classes = np.unique(y_train) C = len(classes) class_to_idx = {c: idx for idx, c in enumerate(classes)} # Grid limits x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5 y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5 xx, yy = np.meshgrid( np.linspace(x_min, x_max, grid_size), np.linspace(y_min, y_max, grid_size), ) grid_points = np.column_stack([xx.ravel(), yy.ravel()]) _, y_pred_grid = predict(grid_points, X_train, y_train, k) Z_idx = np.vectorize(class_to_idx.get)(y_pred_grid).reshape(xx.shape) # Discrete colormap cmap = plt.cm.get_cmap("Set2", C) levels = np.arange(C + 1) - 0.5 plt.figure(figsize=(12, 8)) # Filled boundaries plt.contourf(xx, yy, Z_idx, levels=levels, cmap=cmap, alpha=0.3) # Plot samples for c, idx in class_to_idx.items(): mask = (y_train == c) plt.scatter( X_train[mask, 0], X_train[mask, 1], c=[cmap(idx)], edgecolors="k", s=30 ) # --- Custom legend: Region + Samples per class --- legend_elements = [] for c, idx in class_to_idx.items(): color = cmap(idx) legend_elements.append(Patch(facecolor=color, edgecolor="none", alpha=0.3, label=f"Region: class {c}")) legend_elements.append(Line2D([], [], marker="o", linestyle="", markerfacecolor=color, markeredgecolor="k", label=f"Samples: class {c}")) plt.legend(handles=legend_elements, loc="upper right", framealpha=0.9) plt.xlabel("x1") plt.ylabel("x2") plt.title(f"k-NN decision boundaries (k = {k})") plt.grid(True) plt.show() # -------------------------------------------------- # Main runner # -------------------------------------------------- if __name__ == "__main__": # Load training and test sets X_train, y_train = load_data(dataset=dataset3) X_test, y_test = load_data(dataset=testset) # Evaluate over k k_values = np.arange(1, 31, 1) accuracies = evaluate_over_k(X_train, y_train, X_test, y_test, k_values) # Best k best_idx = np.argmax(accuracies) best_k = int(k_values[best_idx]) best_acc = accuracies[best_idx] print(f"Best k: {best_k} with accuracy: {best_acc:.4f}") # Plots plot_accuracy_vs_k(k_values, accuracies) plot_decision_boundaries_2d(X_train, y_train, best_k, grid_size=200)