Part C local code added

commit 1508d413d8
parent cc6b742553
src/partA.py | 71

@@ -1,45 +1,29 @@
+# ------------------------------------------------------------
+# Part A - Gaussian Parameter Estimation (MLE) & Visualization
+# Pattern Recognition – Semester Assignment
+#
+# Author:
+#   Christos Choutouridis (ΑΕΜ 8997)
+#   cchoutou@ece.auth.gr
+#
+# Description:
+#   This module implements Part A of the assignment:
+#     - Loading and splitting the dataset into classes
+#     - MLE estimation of mean vectors and covariance matrices
+#     - Construction of Gaussian pdf surfaces
+#     - 3D visualization of class-conditional densities
+#
+# Notes:
+#   The implementation follows the theoretical formulation of
+#   multivariate Gaussian distributions and MLE parameter
+#   estimation as taught in class.
+# ------------------------------------------------------------
+
 import matplotlib.pyplot as plt
 import numpy as np
-from toolbox import *
+from toolbox import load_csv, split_dataset_by_class, dataset1

 from typing import Tuple, Dict
-from pandas import DataFrame

-# --------------------------------------------------
-# Part A: dataset splitting
-# --------------------------------------------------
-def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
-    """
-    Splits a dataset into features, labels and per-class subsets with the assumptions that:
-    - All columns except the last are feature columns.
-    - The last column is the class label.
-
-    Parameters
-    ----------
-    df: DataFrame
-        Data samples as DataFrame.
-
-    Returns
-    -------
-    X : ndarray, shape (N, d), y : ndarray, shape (N,), classes : dict:
-        Feature matrix,
-        Labels,
-        Dictionary mapping each class label to the subset of X that belongs to that class.
-
-    Example
-    -------
-    X, y, classes = split_dataset_by_class(df)
-    """
-    n_cols = df.shape[1]                    # Number of columns
-    X = df.iloc[:, :n_cols - 1].values      # Features = all columns except last
-    y = df.iloc[:, n_cols - 1].values       # Labels = last column
-
-    # Dictionary that maps class -> samples
-    classes = {c: X[y == c] for c in np.unique(y) }
-
-    return X, y, classes
-
-
 def mle_mean(X: np.ndarray) -> np.ndarray:
     """
@@ -159,10 +143,13 @@ def compute_gaussian_grid(

     Returns
     -------
-    Xgrid, Ygrid, Z : ndarray, shape (grid_size, grid_size, grid_size)
-        X Meshgrid coordinates for dimensions 0 and 1,
-        Y Meshgrid coordinates for dimensions 0 and 1,
-        pdf values at each grid point.
+    tuple:
+        Xgrid : ndarray, shape (grid_size, grid_size)
+            X meshgrid coordinates for dimensions 0 and 1.
+        Ygrid : ndarray, shape (grid_size, grid_size)
+            Y meshgrid coordinates for dimensions 0 and 1.
+        Z : ndarray, shape (grid_size, grid_size)
+            pdf values at each grid point.
     """
     # Range only on the first two dimensions
     x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size)
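For illustration only (not part of the commit): a minimal sketch of what the Part A pipeline described above computes, re-deriving the MLE estimates and the pdf grid inline in plain NumPy rather than calling the module's mle_mean / compute_gaussian_grid helpers; the toy data is hypothetical.

import numpy as np

# Hypothetical 2-D samples standing in for one class of dataset1
rng = np.random.default_rng(0)
X = rng.normal(loc=[1.0, 2.0], scale=1.0, size=(200, 2))

# MLE estimates: sample mean and the biased (1/N) covariance
mu = X.mean(axis=0)
Sigma = (X - mu).T @ (X - mu) / X.shape[0]

# Grid over the first two dimensions, mirroring the docstring above
grid_size = 50
x_vals = np.linspace(X[:, 0].min(), X[:, 0].max(), grid_size)
y_vals = np.linspace(X[:, 1].min(), X[:, 1].max(), grid_size)
Xgrid, Ygrid = np.meshgrid(x_vals, y_vals)      # each of shape (grid_size, grid_size)

# Bivariate Gaussian pdf evaluated at every grid point
pts = np.stack([Xgrid.ravel(), Ygrid.ravel()], axis=1) - mu
quad = np.sum((pts @ np.linalg.inv(Sigma)) * pts, axis=1)   # squared Mahalanobis distance
Z = np.exp(-0.5 * quad) / (2.0 * np.pi * np.sqrt(np.linalg.det(Sigma)))
Z = Z.reshape(grid_size, grid_size)             # pdf surface, visualized in 3D by Part A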
src/partB.py | 16

@@ -198,7 +198,9 @@ def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
     Parameters
     ----------
     y_true : ndarray
+        actual labels array
     y_pred : ndarray
+        predicted labels array

     Returns
     -------
@@ -257,8 +259,11 @@ def plot_h_vs_error(h_values: np.ndarray, errors: np.ndarray, title: str) -> None:

     Parameters
     ----------
     h_values : ndarray
+        bandwidth values
     errors : ndarray
+        error values
     title : str
+        plot title
     """
     plt.figure(figsize=(8, 5))
     plt.plot(h_values, errors, marker='o')
@@ -274,6 +279,17 @@ def plot_histogram_with_pdf(
 ) -> None:
     """
     Plots a histogram of the data and overlays the true N(mu_true, var_true) pdf.
+
+    Parameters
+    ----------
+    data : ndarray
+        1D data samples.
+    mu_true : float
+        True mean, defaults to 1.0.
+    var_true : float
+        True variance, defaults to 4.0.
+    bins : int
+        Number of bins, defaults to 30.
     """
     plt.figure(figsize=(8, 5))

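For illustration only (not part of the commit): what plot_histogram_with_pdf is documented to draw, sketched on hypothetical synthetic data using the defaults stated in the docstring above (mu_true=1.0, var_true=4.0, bins=30).

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)
mu_true, var_true, bins = 1.0, 4.0, 30
data = rng.normal(mu_true, np.sqrt(var_true), size=1000)    # 1D samples

plt.figure(figsize=(8, 5))
plt.hist(data, bins=bins, density=True, alpha=0.5, label="histogram")

# Overlay the true N(mu_true, var_true) density
xs = np.linspace(data.min(), data.max(), 200)
pdf = np.exp(-(xs - mu_true) ** 2 / (2.0 * var_true)) / np.sqrt(2.0 * np.pi * var_true)
plt.plot(xs, pdf, label="true N(1, 4) pdf")
plt.legend()
plt.show()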
src/partC.py | 325 (new file)

@@ -0,0 +1,325 @@
# ------------------------------------------------------------
# Part C - k-Nearest Neighbors Classifier (k-NN)
# Pattern Recognition – Semester Assignment
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   This module implements Part C of the assignment:
#     - Implementation of a simple k-NN classifier in 2D
#     - Manual computation of Euclidean distances (no ML libraries)
#     - Probability estimation for any number of classes
#     - Accuracy evaluation for k ∈ [1, 30]
#     - Decision boundary visualization for the best k
# ------------------------------------------------------------

from typing import Sequence, Tuple
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from pandas import DataFrame

from toolbox import load_csv, split_dataset_by_class, dataset3, testset


# --------------------------------------------------
# Dataset loading
# --------------------------------------------------
def load_data(dataset: DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """
    Loads a dataset and splits it into features and labels.

    Returns
    -------
    tuple:
        X (ndarray, shape (N, d)):
            Feature vectors.
        y (ndarray, shape (N,)):
            Corresponding class labels.
    """
    df = load_csv(dataset, header=None)
    X, y, _ = split_dataset_by_class(df)
    return X, y


# --------------------------------------------------
# k-NN core functions
# --------------------------------------------------
def eucl(x: np.ndarray, trainData: np.ndarray) -> np.ndarray:
    """
    Computes Euclidean distance of x from all training samples.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Query point.
    trainData : ndarray, shape (N, d)
        Training feature vectors.

    Returns
    -------
    distances : ndarray, shape (N,)
        Euclidean distance from x to each training point.
    """
    diff = trainData - x    # shape (N, d)
    sq_dist = np.sum(diff * diff, axis=1)
    distances = np.sqrt(sq_dist)
    return distances


def neighbors(x: np.ndarray, data: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns the indices and distances of the k nearest neighbors of x.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Query point.
    data : ndarray, shape (N, d)
        Dataset to search for neighbors.
    k : int
        Number of neighbors to consider.

    Returns
    -------
    tuple:
        neighbor_indices : ndarray, shape (k,)
            Indices of the k nearest neighbors.
        neighbor_distances : ndarray, shape (k,)
            Distances of the k nearest neighbors (ascending order).
    """
    distances = eucl(x, data)
    sorted_indices = np.argsort(distances)
    neighbor_indices = sorted_indices[:k]
    neighbor_distances = distances[neighbor_indices]
    return neighbor_indices, neighbor_distances


def predict(
    X_test: np.ndarray, X_train: np.ndarray, y_train: np.ndarray, k: int
):
    """
    Predicts class probabilities and labels for each test sample using k-NN.
    Supports an arbitrary number of classes.

    Parameters
    ----------
    X_test : ndarray, shape (N_test, d)
        Test features.
    X_train : ndarray, shape (N_train, d)
        Training features.
    y_train : ndarray, shape (N_train,)
        Class labels (may be any discrete integers).
    k : int
        Number of neighbors to consider.

    Returns
    -------
    tuple:
        probs (ndarray, shape (N_test, C)):
            probs[i, j] = estimated probability of class classes[j] for sample i.
        y_pred (ndarray, shape (N_test,)):
            Predicted label for each test sample.
    """
    classes = np.unique(y_train)
    C = len(classes)
    N_test = X_test.shape[0]

    probs = np.zeros((N_test, C))
    y_pred = np.zeros(N_test, dtype=classes.dtype)

    for i in range(N_test):
        x = X_test[i]
        neighbor_indices, _ = neighbors(x, X_train, k)
        neighbor_labels = y_train[neighbor_indices]

        # Probabilities per class
        for j, c in enumerate(classes):
            probs[i, j] = np.sum(neighbor_labels == c) / k

        # Winner class
        y_pred[i] = classes[np.argmax(probs[i])]

    return probs, y_pred


# --------------------------------------------------
# Accuracy & model evaluation
# --------------------------------------------------
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Classification accuracy.

    Parameters
    ----------
    y_true : ndarray
        Actual labels.
    y_pred : ndarray
        Predicted labels.

    Returns
    -------
    acc : float
        Fraction of correctly classified samples.
    """
    return float(np.mean(y_true == y_pred))


def evaluate_over_k(
    X_train: np.ndarray, y_train: np.ndarray,
    X_test: np.ndarray, y_test: np.ndarray,
    k_values: Sequence[int],
) -> np.ndarray:
    """
    Evaluates k-NN accuracy for multiple values of k.

    Parameters
    ----------
    X_train, y_train :
        Training set.
    X_test, y_test :
        Test set.
    k_values : Sequence[int]
        Values of k to evaluate.

    Returns
    -------
    accuracies : ndarray, shape (len(k_values),)
        Accuracy for each value of k.
    """
    accuracies = np.zeros(len(k_values))

    for i, k in enumerate(k_values):
        _, y_pred = predict(X_test, X_train, y_train, k)
        accuracies[i] = accuracy(y_test, y_pred)

    return accuracies


def plot_accuracy_vs_k(k_values: np.ndarray, accuracies: np.ndarray) -> None:
    """
    Plots k on the x-axis and accuracy on the y-axis.

    Parameters
    ----------
    k_values : np.ndarray
        Values of k (sequence of int).
    accuracies : np.ndarray
        Accuracy for each k.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(k_values, accuracies, marker="o")
    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.title("k-NN accuracy over k")
    plt.grid(True)
    plt.show()


# --------------------------------------------------
# Decision boundary visualization
# --------------------------------------------------
def plot_decision_boundaries_2d(
    X_train: np.ndarray, y_train: np.ndarray, k: int, grid_size: int = 200
) -> None:
    """
    Plots the decision boundaries of the k-NN classifier in 2D using contourf.
    Supports any number of classes, but requires **exactly 2 features**.

    Parameters
    ----------
    X_train : ndarray, shape (N_train, 2)
        Training features.
    y_train : ndarray, shape (N_train,)
        Training labels.
    k : int
        Number of neighbors.
    grid_size : int
        Grid resolution for the contour.
    """
    # --- Check for 2D features ---
    if X_train.shape[1] != 2:
        raise ValueError(
            f"plot_decision_boundaries_2d supports only 2D features, "
            f"but got X_train with shape {X_train.shape}"
        )

    classes = np.unique(y_train)
    C = len(classes)
    class_to_idx = {c: idx for idx, c in enumerate(classes)}

    # Grid limits
    x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5
    y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_size),
        np.linspace(y_min, y_max, grid_size),
    )

    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    _, y_pred_grid = predict(grid_points, X_train, y_train, k)

    Z_idx = np.vectorize(class_to_idx.get)(y_pred_grid).reshape(xx.shape)

    # Discrete colormap
    cmap = plt.cm.get_cmap("Set2", C)
    levels = np.arange(C + 1) - 0.5

    plt.figure(figsize=(12, 8))

    # Filled boundaries
    plt.contourf(xx, yy, Z_idx, levels=levels, cmap=cmap, alpha=0.3)

    # Plot samples
    for c, idx in class_to_idx.items():
        mask = (y_train == c)
        plt.scatter(
            X_train[mask, 0], X_train[mask, 1],
            c=[cmap(idx)], edgecolors="k", s=30
        )

    # --- Custom legend: Region + Samples per class ---
    legend_elements = []
    for c, idx in class_to_idx.items():
        color = cmap(idx)
        legend_elements.append(Patch(facecolor=color, edgecolor="none",
                                     alpha=0.3, label=f"Region: class {c}"))
        legend_elements.append(Line2D([], [], marker="o", linestyle="",
                                      markerfacecolor=color,
                                      markeredgecolor="k",
                                      label=f"Samples: class {c}"))

    plt.legend(handles=legend_elements, loc="upper right", framealpha=0.9)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title(f"k-NN decision boundaries (k = {k})")
    plt.grid(True)
    plt.show()


# --------------------------------------------------
# Main runner
# --------------------------------------------------
if __name__ == "__main__":
|
||||||
|
# Load training and test sets
|
||||||
|
X_train, y_train = load_data(dataset=dataset3)
|
||||||
|
X_test, y_test = load_data(dataset=testset)
|
||||||
|
|
||||||
|
# Evaluate over k
|
||||||
|
k_values = np.arange(1, 31, 1)
|
||||||
|
accuracies = evaluate_over_k(X_train, y_train, X_test, y_test, k_values)
|
||||||
|
|
||||||
|
# Best k
|
||||||
|
best_idx = np.argmax(accuracies)
|
||||||
|
best_k = int(k_values[best_idx])
|
||||||
|
best_acc = accuracies[best_idx]
|
||||||
|
|
||||||
|
print(f"Best k: {best_k} with accuracy: {best_acc:.4f}")
|
||||||
|
|
||||||
|
# Plots
|
||||||
|
plot_accuracy_vs_k(k_values, accuracies)
|
||||||
|
plot_decision_boundaries_2d(X_train, y_train, best_k, grid_size=200)
|
||||||
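For illustration only (not part of the commit): a minimal usage sketch of the new module, assuming partC.py (and the toolbox module it imports) is on the import path, with hypothetical toy blobs standing in for dataset3/testset.

import numpy as np
from partC import predict, accuracy

rng = np.random.default_rng(42)
# Two well-separated 2-D blobs, labels 0 and 1
X_train = np.vstack([rng.normal(0.0, 1.0, (50, 2)),
                     rng.normal(4.0, 1.0, (50, 2))])
y_train = np.array([0] * 50 + [1] * 50)
X_test = np.array([[0.5, 0.0], [4.2, 3.8]])

probs, y_pred = predict(X_test, X_train, y_train, k=5)
print(probs)        # per-class vote fractions, shape (2, 2)
print(y_pred)       # [0 1] on blobs this well separated
print(accuracy(np.array([0, 1]), y_pred))   # 1.0 here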
src/toolbox.py

@@ -6,7 +6,11 @@
 # cchoutou@ece.auth.gr
 # ------------------------------------------------------------

+from typing import Tuple, Dict
+
+import numpy as np
 import pandas as pd
+from pandas import DataFrame


 def github_raw(user, repo, branch, path):
@@ -23,4 +27,39 @@ def load_csv(path, header=None):
     """
     Loads a CSV file and returns a pandas DataFrame.
     """
     return pd.read_csv(path, header=header)
+
+
+def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
+    """
+    Splits a dataset into features, labels and per-class subsets, assuming that:
+
+    - All columns except the last are feature columns.
+    - The last column is the class label.
+
+    Parameters
+    ----------
+    df: DataFrame
+        Data samples as DataFrame.
+
+    Returns
+    -------
+    tuple:
+        X : ndarray, shape (N, d)
+            Feature matrix.
+        y : ndarray, shape (N,)
+            Labels.
+        classes : dict
+            Dictionary mapping each class label to the subset of X that belongs to that class.
+
+    Example
+    -------
+    X, y, classes = split_dataset_by_class(df)
+    """
+    n_cols = df.shape[1]                # Number of columns
+    X = df.iloc[:, :n_cols - 1].values  # Features = all columns except last
+    y = df.iloc[:, n_cols - 1].values   # Labels = last column
+
+    classes = {c: X[y == c] for c in np.unique(y)}
+
+    return X, y, classes
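For illustration only (not part of the commit): a round trip of the relocated helper on a tiny hand-made DataFrame, assuming toolbox.py is importable.

import pandas as pd
from toolbox import split_dataset_by_class

df = pd.DataFrame([[1.0, 2.0, 0],
                   [3.0, 4.0, 1],
                   [5.0, 6.0, 1]])
X, y, classes = split_dataset_by_class(df)
print(X.shape)      # (3, 2): every column except the last
print(y)            # [0 1 1]: the last column
print(classes[1])   # the two feature rows whose label is 1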