Part A local code added

This commit is contained in:
Christos Choutouridis 2025-12-09 20:08:04 +02:00
parent 8b67205562
commit 49948887c1
5 changed files with 269 additions and 0 deletions

3
.gitignore vendored
View File

@ -1,3 +1,6 @@
# Python
.venv/*
# IDEs
.idea/*

4
requirements.txt Normal file
View File

@ -0,0 +1,4 @@
numpy
pandas
matplotlib

3
src/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Python
__pycache__/*

239
src/partA.py Normal file
View File

@ -0,0 +1,239 @@
import matplotlib.pyplot as plt
import numpy as np
from toolbox import *
from typing import Tuple, Dict
from pandas import DataFrame
# --------------------------------------------------
# Part A: dataset splitting and MLE parameter estimation
# --------------------------------------------------
def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
    """
    Separates a dataset into a feature matrix, a label vector and per-class subsets.

    The layout of `df` is assumed to be:
      - every column except the last one holds a feature,
      - the last column holds the class label.

    Parameters
    ----------
    df : DataFrame
        Data samples, one row per sample.

    Returns
    -------
    X : ndarray, shape (N, d)
        Feature matrix.
    y : ndarray, shape (N,)
        Class labels.
    classes : dict
        Maps each distinct label found in `y` to the rows of `X` carrying that label.

    Example
    -------
        X, y, classes = split_dataset_by_class(df)
    """
    features = df.iloc[:, :-1].values   # everything but the last column
    labels = df.iloc[:, -1].values      # the last column

    # Group the feature rows by their label
    per_class = {}
    for label in np.unique(labels):
        per_class[label] = features[labels == label]

    return features, labels, per_class
def mle_mean(X: np.ndarray) -> np.ndarray:
    """
    Maximum-likelihood estimate of the mean vector (the per-column sample average).

    Parameters
    ----------
    X : ndarray, shape (N, d)
        Data samples, one row per sample.

    Returns
    -------
    mu : ndarray, shape (d,)
        Estimated mean vector.
    """
    n_samples = X.shape[0]
    column_totals = np.sum(X, axis=0)
    return column_totals / n_samples
def mle_covariance(X: np.ndarray, mu: np.ndarray) -> np.ndarray:
    """
    Maximum-likelihood estimate of the covariance matrix.

    The scatter matrix is normalised by N (the biased MLE estimator),
    not by N-1.

    Parameters
    ----------
    X : ndarray, shape (N, d)
        Data samples, one row per sample.
    mu : ndarray, shape (d,)
        Mean vector the samples are centred on.

    Returns
    -------
    cov : ndarray, shape (d, d)
        Covariance matrix.
    """
    centered = X - mu
    scatter = centered.T @ centered   # sum of outer products of the centred samples
    return scatter / X.shape[0]
def estimate_gaussians_mle(classes: Dict[int, np.ndarray]) -> Dict[int, Tuple[np.ndarray, np.ndarray]]:
    """
    Fits one Gaussian per class by computing the MLE mean and covariance
    of that class' samples.

    Parameters
    ----------
    classes : dict
        Dictionary mapping class label -> samples of that class.

    Returns
    -------
    params : dict
        Dictionary mapping class label -> (mu, cov),
        where mu has shape (d,) and cov has shape (d,d).
    """
    fitted: Dict[int, Tuple[np.ndarray, np.ndarray]] = {}
    for label in classes:
        samples = classes[label]
        mean_vec = mle_mean(samples)
        # The covariance is centred on the mean estimated just above
        fitted[label] = (mean_vec, mle_covariance(samples, mean_vec))
    return fitted
# --------------------------------------------------
# Part A: Gaussian pdf and grid computation
# --------------------------------------------------
def gaussian_pdf(point: np.ndarray, mu: np.ndarray, cov: np.ndarray) -> float:
    """
    Multivariate Gaussian pdf at a single point (general dimension).

    The density is evaluated in log-space and exponentiated only at the end.
    Forming (2π)^d * det(Σ) directly overflows or underflows for larger
    dimensions even when the density itself is representable.

    Parameters
    ----------
    point : ndarray, shape (d,)
        feature data of the point
    mu : ndarray, shape (d,)
        mean vector
    cov : ndarray, shape (d,d)
        covariance array

    Returns
    -------
    value : float
        pdf value at `point`. NaN when `cov` has a negative determinant
        (such a matrix is not a valid covariance).

    Raises
    ------
    numpy.linalg.LinAlgError
        If `cov` is singular.
    """
    d = mu.shape[0]  # dimension
    diff = point - mu

    # Squared Mahalanobis distance diff^T Σ^-1 diff; solving the linear system
    # avoids building the explicit inverse.
    mahalanobis_sq = diff @ np.linalg.solve(cov, diff)

    # log(det Σ) without ever forming det Σ itself
    sign, log_det = np.linalg.slogdet(cov)
    if sign < 0:
        # sqrt(det Σ) is undefined for a negative determinant
        return float("nan")

    # log pdf = -1/2 * ( d*log(2π) + log det Σ + diff^T Σ^-1 diff )
    log_pdf = -0.5 * (d * np.log(2.0 * np.pi) + log_det + mahalanobis_sq)
    return float(np.exp(log_pdf))
def compute_gaussian_grid(
    X: np.ndarray, mu: np.ndarray, cov: np.ndarray, grid_size: int = 50
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Creates a 2D grid over the range of the first two dimensions of X
    and computes Gaussian pdf values on it.

    Only dimensions 0 and 1 are plotted, so the density evaluated is the
    marginal of the Gaussian over those two dimensions: its mean is mu[:2]
    and its covariance is cov[:2, :2]. For 2-dimensional data this is the
    full density, i.e. the same value `gaussian_pdf` gives at each grid point.

    Parameters
    ----------
    X : ndarray, shape (N, d), d >= 2
        Data samples (only used to define plotting range for dims 0 and 1).
    mu : ndarray, shape (d,)
        mean vector value
    cov : ndarray, shape (d,d)
        covariance
    grid_size : int
        Number of points per axis.

    Returns
    -------
    Xgrid : ndarray, shape (grid_size, grid_size)
        Meshgrid coordinates along dimension 0.
    Ygrid : ndarray, shape (grid_size, grid_size)
        Meshgrid coordinates along dimension 1.
    Z : ndarray, shape (grid_size, grid_size)
        pdf values at each grid point.
    """
    # Range only on the first two dimensions
    x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size)
    y_vals = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), grid_size)
    Xgrid, Ygrid = np.meshgrid(x_vals, y_vals)

    # Parameters of the 2D marginal; keeps the grid points (2D) and the
    # Gaussian parameters dimensionally consistent when d > 2.
    mu_2d = np.asarray(mu, dtype=float)[:2]
    cov_2d = np.asarray(cov, dtype=float)[:2, :2]

    # These do not depend on the grid point, so compute them once
    precision = np.linalg.inv(cov_2d)
    norm_const = 1.0 / np.sqrt(((2 * np.pi) ** 2) * np.linalg.det(cov_2d))

    # Quadratic form diff^T Σ^-1 diff for every grid point at once
    dx = Xgrid - mu_2d[0]
    dy = Ygrid - mu_2d[1]
    mahalanobis_sq = (
        precision[0, 0] * dx * dx
        + (precision[0, 1] + precision[1, 0]) * dx * dy
        + precision[1, 1] * dy * dy
    )

    Z = norm_const * np.exp(-0.5 * mahalanobis_sq)
    return Xgrid, Ygrid, Z
# --------------------------------------------------
# Part A: 3D plotting for multiple classes
# --------------------------------------------------
def plot_gaussians_3d(
    X: np.ndarray, params: Dict[int, Tuple[np.ndarray, np.ndarray]], grid_size: int = 50
) -> None:
    """
    Draws the estimated Gaussian pdf of every class as a surface on one
    shared 3D figure and shows it.

    Parameters
    ----------
    X : ndarray, shape (N, 2)
        All data samples (used to define the plotting range).
    params : dict
        Dictionary mapping class label -> (mu, cov).
    grid_size : int
        Resolution of the grid for pdf evaluation.
    """
    figure = plt.figure(figsize=(12, 8))
    axes = figure.add_subplot(111, projection='3d')

    # One semi-transparent surface per class, all on the same axes
    for label, (mean_vec, cov_mat) in params.items():
        surface = compute_gaussian_grid(X, mean_vec, cov_mat, grid_size=grid_size)
        axes.plot_surface(*surface, alpha=0.6, label=f"Class {label}")

    axes.set_title("MLE Estimated 2D Gaussians (all classes)")
    axes.set_xlabel("X1")
    axes.set_ylabel("X2")
    axes.set_zlabel("pdf")
    plt.show()
# --------------------------------------------------
# Part A: convenience runner (optional)
# --------------------------------------------------
def main() -> None:
    """
    Runs the whole Part A pipeline:
      - load dataset
      - split by class
      - estimate Gaussian parameters (MLE) per class
      - print the estimated parameters
      - plot 3D pdf surfaces
    """
    df = load_csv(dataset1, header=None)
    X, y, classes = split_dataset_by_class(df)
    params = estimate_gaussians_mle(classes)

    # Optional parameters printing
    for c, (mu_c, cov_c) in params.items():
        print(f"Class {c}:")
        print(" mu =", mu_c)
        print(" cov =\n", cov_c)
        print()

    # Plot 3D surfaces
    plot_gaussians_3d(X, params, grid_size=50)


# Run the pipeline only when executed as a script, not on import
if __name__ == "__main__":
    main()

20
src/toolbox.py Normal file
View File

@ -0,0 +1,20 @@
# -----------------------------
# Toolbox
# -----------------------------
import pandas as pd
def github_raw(user, repo, branch, path):
    """
    Builds the raw.githubusercontent.com URL of a file stored in a GitHub repository.
    """
    url = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"
    return url
# Raw-download URLs of the assignment data files; they all live in the same
# GitHub repository and branch.
_DATASETS_LOCATION = ("hoo2", "PR-Assignment2025_26", "master")

dataset1 = github_raw(*_DATASETS_LOCATION, "datasets/dataset1.csv")
dataset2 = github_raw(*_DATASETS_LOCATION, "datasets/dataset2.csv")
dataset3 = github_raw(*_DATASETS_LOCATION, "datasets/dataset3.csv")
testset = github_raw(*_DATASETS_LOCATION, "datasets/testset.csv")
def load_csv(path, header=None):
    """
    Reads a CSV file into a pandas DataFrame.

    `path` and `header` are forwarded unchanged to `pandas.read_csv`;
    with the default `header=None` no row is treated as column names.
    """
    frame = pd.read_csv(path, header=header)
    return frame