Part C local code added
commit 1508d413d8 (parent cc6b742553)
src/partA.py (71 lines changed)
@@ -1,45 +1,29 @@
# ------------------------------------------------------------
# Part A - Gaussian Parameter Estimation (MLE) & Visualization
# Pattern Recognition – Semester Assignment
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   This module implements Part A of the assignment:
#   - Loading and splitting the dataset into classes
#   - MLE estimation of mean vectors and covariance matrices
#   - Construction of Gaussian pdf surfaces
#   - 3D visualization of class-conditional densities
#
# Notes:
#   The implementation follows the theoretical formulation of
#   multivariate Gaussian distributions and MLE parameter
#   estimation as taught in class.
# ------------------------------------------------------------

import matplotlib.pyplot as plt
import numpy as np
-from toolbox import *
+from toolbox import load_csv, split_dataset_by_class, dataset1

from typing import Tuple, Dict
from pandas import DataFrame

# --------------------------------------------------
# Part A: dataset splitting
# --------------------------------------------------
def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
    """
    Splits a dataset into features, labels and per-class subsets under the assumptions that:
    - All columns except the last are feature columns.
    - The last column is the class label.

    Parameters
    ----------
    df: DataFrame
        Data samples as DataFrame.

    Returns
    -------
    tuple:
        X : ndarray, shape (N, d)
            Feature matrix.
        y : ndarray, shape (N,)
            Labels.
        classes : dict
            Dictionary mapping each class label to the subset of X that belongs to that class.

    Example
    -------
    X, y, classes = split_dataset_by_class(df)
    """
    n_cols = df.shape[1]                   # Number of columns
    X = df.iloc[:, :n_cols - 1].values     # Features = all columns except last
    y = df.iloc[:, n_cols - 1].values      # Labels = last column

    # Dictionary that maps class -> samples
    classes = {c: X[y == c] for c in np.unique(y)}

    return X, y, classes


def mle_mean(X: np.ndarray) -> np.ndarray:
    """
@@ -159,10 +143,13 @@ def compute_gaussian_grid(

    Returns
    -------
-   Xgrid, Ygrid, Z : ndarray, shape (grid_size, grid_size)
-       X meshgrid coordinates for dimensions 0 and 1,
-       Y meshgrid coordinates for dimensions 0 and 1,
-       pdf values at each grid point.
+   tuple:
+       Xgrid: ndarray, shape (grid_size, grid_size)
+           X meshgrid coordinates for dimensions 0 and 1.
+       Ygrid: ndarray, shape (grid_size, grid_size)
+           Y meshgrid coordinates for dimensions 0 and 1.
+       Z: ndarray, shape (grid_size, grid_size)
+           pdf values at each grid point.
    """
    # Range only on the first two dimensions
    x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size)
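The rest of compute_gaussian_grid falls outside this hunk. As an editor's sketch of the meshgrid-plus-pdf pattern the docstring describes (the names mu, Sigma and the helper name are assumptions, not the file's actual code):

def gaussian_grid_sketch(X, mu, Sigma, grid_size=100):
    # Grid over the range of the first two feature dimensions
    x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size)
    y_vals = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), grid_size)
    Xgrid, Ygrid = np.meshgrid(x_vals, y_vals)

    # 2D Gaussian pdf evaluated at every grid point
    pts = np.column_stack([Xgrid.ravel(), Ygrid.ravel()]) - mu[:2]
    Sig = Sigma[:2, :2]
    quad = np.sum(pts @ np.linalg.inv(Sig) * pts, axis=1)
    Z = np.exp(-0.5 * quad) / (2 * np.pi * np.sqrt(np.linalg.det(Sig)))
    return Xgrid, Ygrid, Z.reshape(grid_size, grid_size)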

src/partB.py (16 lines changed)
@@ -198,7 +198,9 @@ def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:

    Parameters
    ----------
    y_true : ndarray
        Actual labels array.
    y_pred : ndarray
        Predicted labels array.

    Returns
    -------
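The body of mse is outside this hunk; a minimal NumPy sketch consistent with the docstring above would be:

def mse_sketch(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # Mean squared difference between actual and predicted values
    return float(np.mean((y_true - y_pred) ** 2))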

@@ -257,8 +259,11 @@ def plot_h_vs_error(h_values: np.ndarray, errors: np.ndarray, title: str) -> None:

    Parameters
    ----------
    h_values : ndarray
        Bandwidth values.
    errors : ndarray
        Error values.
    title : str
        Plot title.
    """
    plt.figure(figsize=(8, 5))
    plt.plot(h_values, errors, marker='o')
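For orientation, a typical call site would sweep the bandwidth and collect the errors first; kde_error below is a hypothetical helper, used only to illustrate the calling pattern:

h_values = np.linspace(0.1, 2.0, 20)
errors = np.array([kde_error(h) for h in h_values])   # kde_error: hypothetical helper
plot_h_vs_error(h_values, errors, title="Error vs bandwidth h")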

@@ -274,6 +279,17 @@ def plot_histogram_with_pdf(
) -> None:
    """
    Plots a histogram of the data and overlays the true N(mu_true, var_true) pdf.

    Parameters
    ----------
    data : ndarray
        1D data samples.
    mu_true : float
        True mean, defaults to 1.0.
    var_true : float
        True variance, defaults to 4.0.
    bins : int
        Number of bins, defaults to 30.
    """
    plt.figure(figsize=(8, 5))
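The overlay itself continues outside the hunk. A sketch of the standard histogram-plus-pdf pattern the docstring promises, assuming only the numpy and matplotlib imports already present in the file:

plt.hist(data, bins=bins, density=True, alpha=0.6, label="samples")
xs = np.linspace(np.min(data), np.max(data), 200)
pdf = np.exp(-0.5 * (xs - mu_true) ** 2 / var_true) / np.sqrt(2 * np.pi * var_true)
plt.plot(xs, pdf, "r-", label=f"N({mu_true}, {var_true})")
plt.legend()
plt.show()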

src/partC.py (new file, 325 lines)
@@ -0,0 +1,325 @@
# ------------------------------------------------------------
# Part C - k-Nearest Neighbors Classifier (k-NN)
# Pattern Recognition – Semester Assignment
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   This module implements Part C of the assignment:
#   - Implementation of a simple k-NN classifier in 2D
#   - Manual computation of Euclidean distances (no ML libraries)
#   - Probability estimation for any number of classes
#   - Accuracy evaluation for k ∈ [1, 30]
#   - Decision boundary visualization for the best k
# ------------------------------------------------------------

from typing import Sequence, Tuple
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from pandas import DataFrame

from toolbox import load_csv, split_dataset_by_class, dataset3, testset


# --------------------------------------------------
# Dataset loading
# --------------------------------------------------
def load_data(dataset: str) -> Tuple[np.ndarray, np.ndarray]:
    """
    Loads a dataset and splits it into features and labels.

    Returns
    -------
    tuple:
        X (ndarray, shape (N, d)):
            Feature vectors.
        y (ndarray, shape (N,)):
            Corresponding class labels.
    """
    df = load_csv(dataset, header=None)
    X, y, _ = split_dataset_by_class(df)
    return X, y


# --------------------------------------------------
# k-NN core functions
# --------------------------------------------------
def eucl(x: np.ndarray, trainData: np.ndarray) -> np.ndarray:
    """
    Computes the Euclidean distance of x from all training samples.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Query point.
    trainData : ndarray, shape (N, d)
        Training feature vectors.

    Returns
    -------
    distances : ndarray, shape (N,)
        Euclidean distance from x to each training point.
    """
    diff = trainData - x                    # shape (N, d)
    sq_dist = np.sum(diff * diff, axis=1)   # squared distances per row
    distances = np.sqrt(sq_dist)
    return distances
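eucl handles one query point per call. When many queries are needed at once, the same computation vectorizes over a whole query matrix via broadcasting; an editor's sketch, not part of the commit:

def eucl_many(X_query: np.ndarray, trainData: np.ndarray) -> np.ndarray:
    # Broadcast (M, 1, d) against (1, N, d), then reduce over the feature axis
    diff = X_query[:, None, :] - trainData[None, :, :]
    return np.sqrt(np.sum(diff * diff, axis=2))    # shape (M, N)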


def neighbors(x: np.ndarray, data: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns the indices and distances of the k nearest neighbors of x.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Query point.
    data : ndarray, shape (N, d)
        Dataset to search for neighbors.
    k : int
        Number of neighbors to consider.

    Returns
    -------
    tuple:
        neighbor_indices : ndarray, shape (k,)
            Indices of the k nearest neighbors.
        neighbor_distances : ndarray, shape (k,)
            Distances of the k nearest neighbors (ascending order).
    """
    distances = eucl(x, data)
    sorted_indices = np.argsort(distances)
    neighbor_indices = sorted_indices[:k]
    neighbor_distances = distances[neighbor_indices]
    return neighbor_indices, neighbor_distances
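np.argsort sorts all N distances even though only k are kept. For large N, np.argpartition finds the same k neighbors in linear time before sorting just those k; a drop-in sketch under that observation:

idx = np.argpartition(distances, k - 1)[:k]            # k smallest, arbitrary order
neighbor_indices = idx[np.argsort(distances[idx])]     # sort only the k kept
neighbor_distances = distances[neighbor_indices]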


def predict(
    X_test: np.ndarray, X_train: np.ndarray, y_train: np.ndarray, k: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Predicts class probabilities and labels for each test sample using k-NN.
    Supports an arbitrary number of classes.

    Parameters
    ----------
    X_test : ndarray, shape (N_test, d)
        Test features.
    X_train : ndarray, shape (N_train, d)
        Train features.
    y_train : ndarray, shape (N_train,)
        Class labels (may be any discrete integers).
    k : int
        Number of neighbors to consider.

    Returns
    -------
    tuple:
        probs (ndarray, shape (N_test, C)):
            probs[i, j] = estimated probability of class classes[j] for sample i.
        y_pred (ndarray, shape (N_test,)):
            Predicted label for each test sample.
    """
    classes = np.unique(y_train)
    C = len(classes)
    N_test = X_test.shape[0]

    probs = np.zeros((N_test, C))
    y_pred = np.zeros(N_test, dtype=classes.dtype)

    for i in range(N_test):
        x = X_test[i]
        neighbor_indices, _ = neighbors(x, X_train, k)
        neighbor_labels = y_train[neighbor_indices]

        # Probabilities per class
        for j, c in enumerate(classes):
            probs[i, j] = np.sum(neighbor_labels == c) / k

        # Winner class
        y_pred[i] = classes[np.argmax(probs[i])]

    return probs, y_pred
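A quick sanity check of predict on toy data (editor's illustration, not part of the commit):

X_tr = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
y_tr = np.array([0, 0, 1])
probs, y_hat = predict(np.array([[0.05, 0.0]]), X_tr, y_tr, k=3)
# probs -> [[0.667, 0.333]]: two of the three neighbors are class 0, so y_hat -> [0]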


# --------------------------------------------------
# Accuracy & model evaluation
# --------------------------------------------------
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Classification accuracy.

    Parameters
    ----------
    y_true : ndarray
        Actual labels.
    y_pred : ndarray
        Predicted labels.

    Returns
    -------
    acc : float
        Fraction of correctly classified samples.
    """
    return float(np.mean(y_true == y_pred))


def evaluate_over_k(
    X_train: np.ndarray, y_train: np.ndarray,
    X_test: np.ndarray, y_test: np.ndarray,
    k_values: Sequence[int],
) -> np.ndarray:
    """
    Evaluates k-NN accuracy for multiple values of k.

    Parameters
    ----------
    X_train, y_train :
        Training set.
    X_test, y_test :
        Test set.
    k_values : Sequence[int]
        Values of k to evaluate.

    Returns
    -------
    accuracies : ndarray, shape (len(k_values),)
        Accuracy for each value of k.
    """
    accuracies = np.zeros(len(k_values))

    for i, k in enumerate(k_values):
        _, y_pred = predict(X_test, X_train, y_train, k)
        accuracies[i] = accuracy(y_test, y_pred)

    return accuracies


def plot_accuracy_vs_k(k_values: np.ndarray, accuracies: np.ndarray) -> None:
    """
    Plots k on the x-axis and accuracy on the y-axis.

    Parameters
    ----------
    k_values : np.ndarray
        Values of k that were evaluated.
    accuracies : np.ndarray
        Corresponding accuracies.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(k_values, accuracies, marker="o")
    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.title("k-NN accuracy over k")
    plt.grid(True)
    plt.show()


# --------------------------------------------------
# Decision boundary visualization
# --------------------------------------------------
def plot_decision_boundaries_2d(
    X_train: np.ndarray, y_train: np.ndarray, k: int, grid_size: int = 200
) -> None:
    """
    Plots the decision boundaries of the k-NN classifier in 2D using contourf.
    Supports any number of classes, but requires exactly 2 features.

    Parameters
    ----------
    X_train : ndarray, shape (N_train, 2)
        Training features.
    y_train : ndarray, shape (N_train,)
        Training labels.
    k : int
        Number of neighbors.
    grid_size : int
        Grid resolution for the contour.
    """
    # --- Check for 2D features ---
    if X_train.shape[1] != 2:
        raise ValueError(
            f"plot_decision_boundaries_2d supports only 2D features, "
            f"but got X_train with shape {X_train.shape}"
        )

    classes = np.unique(y_train)
    C = len(classes)
    class_to_idx = {c: idx for idx, c in enumerate(classes)}

    # Grid limits
    x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5
    y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_size),
        np.linspace(y_min, y_max, grid_size),
    )

    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    _, y_pred_grid = predict(grid_points, X_train, y_train, k)

    Z_idx = np.vectorize(class_to_idx.get)(y_pred_grid).reshape(xx.shape)

    # Discrete colormap: one color per class index
    cmap = plt.cm.get_cmap("Set2", C)
    levels = np.arange(C + 1) - 0.5

    plt.figure(figsize=(12, 8))

    # Filled decision regions
    plt.contourf(xx, yy, Z_idx, levels=levels, cmap=cmap, alpha=0.3)

    # Plot training samples
    for c, idx in class_to_idx.items():
        mask = (y_train == c)
        plt.scatter(
            X_train[mask, 0], X_train[mask, 1],
            c=[cmap(idx)], edgecolors="k", s=30
        )

    # --- Custom legend: Region + Samples per class ---
    legend_elements = []
    for c, idx in class_to_idx.items():
        color = cmap(idx)
        legend_elements.append(Patch(facecolor=color, edgecolor="none",
                                     alpha=0.3, label=f"Region: class {c}"))
        legend_elements.append(Line2D([], [], marker="o", linestyle="",
                                      markerfacecolor=color,
                                      markeredgecolor="k",
                                      label=f"Samples: class {c}"))

    plt.legend(handles=legend_elements, loc="upper right", framealpha=0.9)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title(f"k-NN decision boundaries (k = {k})")
    plt.grid(True)
    plt.show()


# --------------------------------------------------
# Main runner
# --------------------------------------------------
if __name__ == "__main__":
    # Load training and test sets
    X_train, y_train = load_data(dataset=dataset3)
    X_test, y_test = load_data(dataset=testset)

    # Evaluate over k
    k_values = np.arange(1, 31, 1)
    accuracies = evaluate_over_k(X_train, y_train, X_test, y_test, k_values)

    # Best k
    best_idx = np.argmax(accuracies)
    best_k = int(k_values[best_idx])
    best_acc = accuracies[best_idx]

    print(f"Best k: {best_k} with accuracy: {best_acc:.4f}")

    # Plots
    plot_accuracy_vs_k(k_values, accuracies)
    plot_decision_boundaries_2d(X_train, y_train, best_k, grid_size=200)

src/toolbox.py
@@ -6,7 +6,11 @@
# cchoutou@ece.auth.gr
# ------------------------------------------------------------

from typing import Tuple, Dict

import numpy as np
import pandas as pd
from pandas import DataFrame


def github_raw(user, repo, branch, path):
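The body of github_raw is not shown in this diff. Given its name, and that load_csv feeds its result to pd.read_csv (which accepts URLs), a plausible sketch is the standard raw-content URL pattern; the exact format string is an assumption:

def github_raw_sketch(user, repo, branch, path):
    # Assumed behavior: build a raw.githubusercontent.com URL for pd.read_csv
    return f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"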

@@ -23,4 +27,39 @@ def load_csv(path, header=None):
    """
    Loads a CSV file and returns a pandas DataFrame.
    """
    return pd.read_csv(path, header=header)


def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
    """
    Splits a dataset into features, labels and per-class subsets under the assumptions that:

    - All columns except the last are feature columns.
    - The last column is the class label.

    Parameters
    ----------
    df: DataFrame
        Data samples as DataFrame.

    Returns
    -------
    tuple:
        X : ndarray, shape (N, d)
            Feature matrix.
        y : ndarray, shape (N,)
            Labels.
        classes : dict
            Dictionary mapping each class label to the subset of X that belongs to that class.

    Example
    -------
    X, y, classes = split_dataset_by_class(df)
    """
    n_cols = df.shape[1]                   # Number of columns
    X = df.iloc[:, :n_cols - 1].values     # Features = all columns except last
    y = df.iloc[:, n_cols - 1].values      # Labels = last column

    classes = {c: X[y == c] for c in np.unique(y)}

    return X, y, classes
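A short illustration of the per-class dictionary this returns (editor's example, assuming pandas imported as pd as above):

df = pd.DataFrame([[1.0, 2.0, 0], [3.0, 4.0, 1], [5.0, 6.0, 0]])
X, y, classes = split_dataset_by_class(df)
# classes[0] -> [[1., 2.], [5., 6.]], classes[1] -> [[3., 4.]]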