326 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ------------------------------------------------------------
# Part C - k-Nearest Neighbors Classifier (k-NN)
# Pattern Recognition Semester Assignment
#
# Author:
# Christos Choutouridis (ΑΕΜ 8997)
# cchoutou@ece.auth.gr
#
# Description:
# This module implements Part C of the assignment:
# - Implementation of a simple k-NN classifier in 2D
# - Manual computation of Euclidean distances (no ML libraries)
# - Probability estimation for any number of classes
# - Accuracy evaluation for k ∈ [1, 30]
# - Decision boundary visualization for the best k
# ------------------------------------------------------------
from typing import Sequence, Tuple
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from pandas import DataFrame
from toolbox import load_csv, split_dataset_by_class, dataset3, testset
# --------------------------------------------------
# Dataset loading
# --------------------------------------------------
def load_data(dataset: DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """
    Read a dataset through the toolbox helpers and return features and labels.

    Parameters
    ----------
    dataset :
        Dataset handle, forwarded unchanged to ``load_csv`` together with
        ``header=None``.
        NOTE(review): annotated as a DataFrame, but the callers pass the
        ``dataset3`` / ``testset`` objects imported from ``toolbox`` -
        confirm what type those actually are.

    Returns
    -------
    tuple:
        X (ndarray):
            Feature vectors (first item returned by
            ``split_dataset_by_class``; presumably shape (N, d)).
        y (ndarray):
            Corresponding class labels (second item; presumably shape (N,)).
    """
    frame = load_csv(dataset, header=None)
    # The helper returns three items; the third one is not needed here.
    features, labels, _unused = split_dataset_by_class(frame)
    return features, labels
# --------------------------------------------------
# k-NN core functions
# --------------------------------------------------
def eucl(x: np.ndarray, trainData: np.ndarray) -> np.ndarray:
    """
    Euclidean distance from a single query point to every training sample.

    The computation is done by hand (difference, square, row-sum, square
    root) rather than through a library distance routine.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Query point.
    trainData : ndarray, shape (N, d)
        Training feature vectors, one sample per row.

    Returns
    -------
    distances : ndarray, shape (N,)
        distances[i] is the Euclidean distance between x and trainData[i].
    """
    # Broadcasting subtracts x from every row: result has shape (N, d).
    offsets = trainData - x
    # Sum the squared components per row, then take the root.
    return np.sqrt(np.square(offsets).sum(axis=1))
def neighbors(x: np.ndarray, data: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns the indices and distances of the k nearest neighbors of x.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Query point.
    data : ndarray, shape (N, d)
        Dataset in which the neighbors are searched.
    k : int
        Number of neighbors to return; must be at least 1. If k is larger
        than N, all N samples are returned.

    Returns
    -------
    tuple:
        neighbor_indices : ndarray, shape (min(k, N),)
            Row indices into ``data`` of the nearest neighbors.
        neighbor_distances : ndarray, shape (min(k, N),)
            Distances of those neighbors (ascending order).

    Raises
    ------
    ValueError
        If k is smaller than 1.
    """
    # A non-positive k would otherwise be interpreted as a negative slice
    # bound (dropping the *farthest* |k| points) or yield an empty result.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    distances = eucl(x, data)
    # A stable sort keeps equidistant points in their original row order, so
    # ties at the k-th position are always resolved the same way.
    sorted_indices = np.argsort(distances, kind="stable")
    neighbor_indices = sorted_indices[:k]
    return neighbor_indices, distances[neighbor_indices]
def predict(
    X_test: np.ndarray, X_train: np.ndarray, y_train: np.ndarray, k: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Predicts class probabilities and labels for each test sample using k-NN.
    Supports an arbitrary number of classes.

    Parameters
    ----------
    X_test : ndarray, shape (N_test, d)
        Test features.
    X_train : ndarray, shape (N_train, d)
        Training features.
    y_train : ndarray, shape (N_train,)
        Class labels of the training samples.
    k : int
        Number of neighbors to consider; must be at least 1.

    Returns
    -------
    tuple:
        probs (ndarray, shape (N_test, C)):
            probs[i, j] = fraction of the neighbors of sample i that belong
            to class classes[j], where classes = np.unique(y_train).
            Each row sums to 1.
        y_pred (ndarray, shape (N_test,)):
            Predicted label for each test sample (the class with the highest
            fraction; the first such class in sorted order wins ties).

    Raises
    ------
    ValueError
        If k is smaller than 1 or the training set is empty.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    if X_train.shape[0] == 0:
        raise ValueError("X_train must contain at least one sample")

    classes = np.unique(y_train)
    n_test = X_test.shape[0]
    probs = np.zeros((n_test, len(classes)))
    y_pred = np.zeros(n_test, dtype=classes.dtype)

    for i, x in enumerate(X_test):
        neighbor_indices, _ = neighbors(x, X_train, k)
        neighbor_labels = y_train[neighbor_indices]

        # Normalise by the number of neighbors actually returned: when k is
        # larger than the training set fewer than k labels come back, and
        # dividing by k would produce rows that do not sum to 1.
        n_found = len(neighbor_labels)
        for j, c in enumerate(classes):
            probs[i, j] = np.count_nonzero(neighbor_labels == c) / n_found

        # Winner class
        y_pred[i] = classes[np.argmax(probs[i])]

    return probs, y_pred
# --------------------------------------------------
# Accuracy & model evaluation
# --------------------------------------------------
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Classification accuracy.

    Parameters
    ----------
    y_true : array-like
        Actual labels.
    y_pred : array-like
        Predicted labels; must have the same shape as y_true.

    Returns
    -------
    acc : float
        Fraction of correctly classified samples.

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape.
    """
    # Coerce to arrays so that plain lists are compared element-wise; on two
    # lists `==` yields a single bool and the result would be 0.0 or 1.0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Differing shapes would either broadcast into a meaningless comparison
    # or collapse to one value, so they are rejected explicitly.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    return float(np.mean(y_true == y_pred))
def evaluate_over_k(
    X_train: np.ndarray, y_train: np.ndarray,
    X_test: np.ndarray, y_test: np.ndarray,
    k_values: Sequence[int],
) -> np.ndarray:
    """
    Measures the k-NN test accuracy once for every requested k.

    Parameters
    ----------
    X_train, y_train:
        Training set (features and labels) handed to ``predict``.
    X_test, y_test:
        Test set; the features are classified and the labels are used as
        ground truth for ``accuracy``.
    k_values :
        Sequence of neighbor counts to try.

    Returns
    -------
    accuracies : ndarray, shape (len(k_values),)
        accuracies[i] is the accuracy obtained with k = k_values[i].
    """
    # predict returns (probs, y_pred); only the predicted labels are scored.
    scores = [
        accuracy(y_test, predict(X_test, X_train, y_train, k)[1])
        for k in k_values
    ]
    return np.array(scores, dtype=float)
def plot_accuracy_vs_k(k_values: np.ndarray, accuracies: np.ndarray) -> None:
    """
    Draws the accuracy curve: k on the x-axis, accuracy on the y-axis.

    Opens a new 10x6 figure and displays it with ``plt.show()``.

    Parameters
    ----------
    k_values: np.ndarray
        Neighbor counts that were evaluated.
    accuracies: np.ndarray
        Accuracy obtained for each entry of k_values.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(k_values, accuracies, marker="o")
    ax.set_xlabel("k")
    ax.set_ylabel("Accuracy")
    ax.set_title("k-NN accuracy over k")
    ax.grid(True)
    plt.show()
# --------------------------------------------------
# Decision boundary visualization
# --------------------------------------------------
def plot_decision_boundaries_2d(
    X_train: np.ndarray, y_train: np.ndarray, k: int, grid_size: int = 200
) -> None:
    """
    Plots the decision boundaries of the k-NN classifier in 2D using contourf.
    Supports any number of classes, but requires **exactly 2 features**.

    The classifier is evaluated on a regular grid that covers the training
    data plus a margin of 0.5 on every side; the predicted class of each grid
    point is drawn as a filled region and the training samples are overlaid.

    Parameters
    ----------
    X_train : ndarray, shape (N_train, 2)
        Training features.
    y_train : ndarray, shape (N_train,)
        Training labels.
    k : int
        Number of neighbors.
    grid_size : int
        Number of grid points per axis (grid_size ** 2 points are classified).

    Raises
    ------
    ValueError
        If X_train is not a 2-D array with exactly two columns.
    """
    # --- Check for 2D features (ndim first, so 1-D input gets the same error) ---
    if X_train.ndim != 2 or X_train.shape[1] != 2:
        raise ValueError(
            f"plot_decision_boundaries_2d supports only 2D features, "
            f"but got X_train with shape {X_train.shape}"
        )

    classes = np.unique(y_train)
    n_classes = len(classes)

    # Grid limits
    x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5
    y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5
    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_size),
        np.linspace(y_min, y_max, grid_size),
    )
    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    _, y_pred_grid = predict(grid_points, X_train, y_train, k)

    # `classes` is sorted and every predicted label is one of its entries,
    # so searchsorted maps each label to its position in `classes`.
    Z_idx = np.searchsorted(classes, y_pred_grid).reshape(xx.shape)

    # Discrete colormap with one entry per class. pyplot.get_cmap is used
    # because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    cmap = plt.get_cmap("Set2", n_classes)
    # Level edges at -0.5, 0.5, ... so each integer class index gets one band.
    levels = np.arange(n_classes + 1) - 0.5

    plt.figure(figsize=(12, 8))
    # Filled boundaries
    plt.contourf(xx, yy, Z_idx, levels=levels, cmap=cmap, alpha=0.3)

    # Samples and custom legend (Region + Samples entry per class)
    legend_elements = []
    for idx, c in enumerate(classes):
        color = cmap(idx)
        mask = (y_train == c)
        plt.scatter(
            X_train[mask, 0], X_train[mask, 1],
            c=[color], edgecolors="k", s=30
        )
        legend_elements.append(Patch(facecolor=color, edgecolor="none",
                                     alpha=0.3, label=f"Region: class {c}"))
        legend_elements.append(Line2D([], [], marker="o", linestyle="",
                                      markerfacecolor=color,
                                      markeredgecolor="k",
                                      label=f"Samples: class {c}"))

    plt.legend(handles=legend_elements, loc="upper right", framealpha=0.9)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title(f"k-NN decision boundaries (k = {k})")
    plt.grid(True)
    plt.show()
# --------------------------------------------------
# Main runner
# --------------------------------------------------
if __name__ == "__main__":
    # Training and test sets, loaded from the dataset handles in toolbox
    X_train, y_train = load_data(dataset=dataset3)
    X_test, y_test = load_data(dataset=testset)

    # Test-set accuracy for every k in 1..30
    k_values = np.arange(1, 31)
    accuracies = evaluate_over_k(X_train, y_train, X_test, y_test, k_values)

    # argmax returns the first maximum, so ties go to the smallest k
    top = np.argmax(accuracies)
    best_k, best_acc = int(k_values[top]), accuracies[top]
    print(f"Best k: {best_k} with accuracy: {best_acc:.4f}")

    # Accuracy curve, then the decision regions for the winning k
    plot_accuracy_vs_k(k_values, accuracies)
    plot_decision_boundaries_2d(X_train, y_train, best_k, grid_size=200)