Part A local code added
This commit is contained in:
parent
8b67205562
commit
49948887c1
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,3 +1,6 @@
|
||||
# Python
|
||||
.venv/*
|
||||
|
||||
# IDEs
|
||||
.idea/*
|
||||
|
||||
|
||||
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@ -0,0 +1,4 @@
|
||||
numpy
|
||||
pandas
|
||||
matplotlib
|
||||
|
||||
3
src/.gitignore
vendored
Normal file
3
src/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
# Python
|
||||
__pycache__/*
|
||||
|
||||
239
src/partA.py
Normal file
239
src/partA.py
Normal file
@ -0,0 +1,239 @@
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from toolbox import *
|
||||
|
||||
from typing import Tuple, Dict
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
# --------------------------------------------------
|
||||
# Part A: dataset splitting
|
||||
# --------------------------------------------------
|
||||
def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
    """
    Separate a dataset into its feature matrix, label vector and per-class samples.

    The layout of `df` is assumed to be:
    - every column but the last holds a feature,
    - the last column holds the class label.

    Parameters
    ----------
    df : DataFrame
        Data samples, one row per sample.

    Returns
    -------
    X : ndarray, shape (N, d)
        Feature matrix (all columns except the last).
    y : ndarray, shape (N,)
        Class label of every sample (the last column).
    classes : dict
        Maps each distinct label found in `y` to the rows of `X` that carry
        that label.

    Example
    -------
        X, y, classes = split_dataset_by_class(df)
    """
    # Negative slicing selects "everything but the last" / "the last" column
    features = df.iloc[:, :-1].values
    labels = df.iloc[:, -1].values

    # One boolean mask per distinct label picks that class's rows
    per_class = {}
    for label in np.unique(labels):
        per_class[label] = features[labels == label]

    return features, labels, per_class
|
||||
|
||||
|
||||
def mle_mean(X: np.ndarray) -> np.ndarray:
    """
    Maximum-likelihood estimate of the mean vector.

    Parameters
    ----------
    X : ndarray, shape (N, d)
        Data samples, one per row.

    Returns
    -------
    mu : ndarray, shape (d,)
        Column-wise sum of the samples divided by the sample count N.
    """
    column_totals = X.sum(axis=0)
    n_samples = X.shape[0]
    return column_totals / n_samples
|
||||
|
||||
|
||||
def mle_covariance(X: np.ndarray, mu: np.ndarray) -> np.ndarray:
    """
    Maximum-likelihood estimate of the covariance matrix.

    The scatter matrix is normalised by N (the MLE), not by N-1.

    Parameters
    ----------
    X : ndarray, shape (N, d)
        Data samples, one per row.
    mu : ndarray, shape (d,)
        Mean vector the samples are centred on.

    Returns
    -------
    cov : ndarray, shape (d, d)
        Covariance matrix.
    """
    centered = X - mu
    scatter = centered.T @ centered  # sum of outer products of centred samples
    return scatter / X.shape[0]
|
||||
|
||||
|
||||
def estimate_gaussians_mle(classes: Dict[int, np.ndarray]) -> Dict[int, Tuple[np.ndarray, np.ndarray]]:
    """
    Fit one Gaussian (MLE mean and covariance) to every class.

    Parameters
    ----------
    classes : dict
        Dictionary mapping class label -> samples of that class.

    Returns
    -------
    params : dict
        Dictionary mapping class label -> (mu, cov),
        where mu has shape (d,) and cov has shape (d,d).
    """
    def _fit(samples: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        # The covariance estimate is centred on the mean estimated just before it
        mean = mle_mean(samples)
        return mean, mle_covariance(samples, mean)

    return {label: _fit(samples) for label, samples in classes.items()}
|
||||
|
||||
|
||||
# --------------------------------------------------
|
||||
# Part A: Gaussian pdf and grid computation
|
||||
# --------------------------------------------------
|
||||
def gaussian_pdf(point: np.ndarray, mu: np.ndarray, cov: np.ndarray) -> float:
    """
    Multivariate Gaussian pdf at a single point (general dimension).

    The density is evaluated in the log domain. Forming ``(2*pi)**d * det(cov)``
    directly overflows or underflows once ``d`` reaches a few hundred, and an
    explicit matrix inverse is less accurate than solving the linear system.

    Parameters
    ----------
    point : ndarray, shape (d,)
        feature data of the point
    mu : ndarray, shape (d,)
        mean vector
    cov : ndarray, shape (d,d)
        covariance array

    Returns
    -------
    value : float
        pdf value at `point`. NaN when the determinant of `cov` is not
        positive, because no valid normalisation constant exists then.

    Raises
    ------
    numpy.linalg.LinAlgError
        If `cov` is singular.
    """
    d = mu.shape[0]  # dimension
    diff = point - mu

    # diff^T * cov^-1 * diff without building the inverse;
    # solve() raises LinAlgError on a singular covariance.
    mahalanobis_sq = diff @ np.linalg.solve(cov, diff)

    # log(det cov), computed without overflow or underflow
    sign, logdet = np.linalg.slogdet(cov)
    if sign <= 0:
        return float("nan")

    # log pdf = -1/2 * (d*log(2*pi) + log det(cov) + diff^T cov^-1 diff)
    log_pdf = -0.5 * (d * np.log(2.0 * np.pi) + logdet + mahalanobis_sq)
    return float(np.exp(log_pdf))
|
||||
|
||||
|
||||
def compute_gaussian_grid(
    X: np.ndarray, mu: np.ndarray, cov: np.ndarray, grid_size: int = 50
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Creates a 2D grid over the range of the first two dimensions of X
    and computes Gaussian pdf values on it.

    When `mu` and `cov` describe more than two dimensions, the density shown
    is the marginal over dimensions 0 and 1. The marginal of a Gaussian is
    itself Gaussian, with mean ``mu[:2]`` and covariance ``cov[:2, :2]``.
    For d == 2 this is the full density.

    Parameters
    ----------
    X : ndarray, shape (N, d)
        Data samples (only used to define plotting range for dims 0 and 1).
    mu : ndarray, shape (d,)
        mean vector value
    cov : ndarray, shape (d,d)
        covariance
    grid_size : int
        Number of points per axis.

    Returns
    -------
    Xgrid : ndarray, shape (grid_size, grid_size)
        Meshgrid coordinates along dimension 0.
    Ygrid : ndarray, shape (grid_size, grid_size)
        Meshgrid coordinates along dimension 1.
    Z : ndarray, shape (grid_size, grid_size)
        pdf value at each grid point.

    Raises
    ------
    ValueError
        If `mu` / `cov` describe fewer than two dimensions.
    numpy.linalg.LinAlgError
        If the 2x2 covariance block is singular.
    """
    # Range only on the first two dimensions
    x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size)
    y_vals = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), grid_size)
    Xgrid, Ygrid = np.meshgrid(x_vals, y_vals)

    # Parameters of the (marginal) Gaussian over the two plotted dimensions
    mu2 = np.asarray(mu, dtype=float)[:2]
    cov2 = np.asarray(cov, dtype=float)[:2, :2]
    if mu2.shape != (2,) or cov2.shape != (2, 2):
        raise ValueError("mu and cov must describe at least two dimensions")

    # Loop-invariant quantities, computed once instead of once per grid point
    inv = np.linalg.inv(cov2)
    det = np.linalg.det(cov2)
    norm_const = 1.0 / np.sqrt(((2 * np.pi) ** 2) * det)

    # diff[i, j] is the offset of grid point (i, j) from the mean
    diff = np.stack((Xgrid, Ygrid), axis=-1) - mu2
    # Quadratic form diff^T * inv * diff for every grid point at once
    quad = np.sum((diff @ inv) * diff, axis=-1)
    Z = norm_const * np.exp(-0.5 * quad)

    return Xgrid, Ygrid, Z
|
||||
|
||||
|
||||
# --------------------------------------------------
|
||||
# Part A: 3D plotting for multiple classes
|
||||
# --------------------------------------------------
|
||||
def plot_gaussians_3d(
    X: np.ndarray, params: Dict[int, Tuple[np.ndarray, np.ndarray]], grid_size: int = 50
) -> None:
    """
    Draws the Gaussian pdf surface of every class on one shared 3D figure
    and shows it.

    Parameters
    ----------
    X : ndarray, shape (N, 2)
        All data samples (used to define the plotting range).
    params : dict
        Dictionary mapping class label -> (mu, cov).
    grid_size : int
        Resolution of the grid for pdf evaluation.
    """
    figure = plt.figure(figsize=(12, 8))
    axes = figure.add_subplot(111, projection='3d')

    # One semi-transparent surface per class, all evaluated over the same data range
    for label, (mean, covariance) in params.items():
        surface = compute_gaussian_grid(X, mean, covariance, grid_size=grid_size)
        axes.plot_surface(*surface, alpha=0.6, label=f"Class {label}")

    axes.set_title("MLE Estimated 2D Gaussians (all classes)")
    axes.set_xlabel("X1")
    axes.set_ylabel("X2")
    axes.set_zlabel("pdf")
    plt.show()
|
||||
|
||||
|
||||
|
||||
# --------------------------------------------------
|
||||
# Part A: convenience runner (optional)
|
||||
# --------------------------------------------------
|
||||
if __name__ == "__main__":
    # Convenience runner for the whole Part A pipeline:
    #   - load dataset
    #   - split by class
    #   - estimate Gaussian parameters (MLE) per class
    #   - plot 3D pdf surfaces
    df = load_csv(dataset1, header=None)
    X, y, classes = split_dataset_by_class(df)
    params = estimate_gaussians_mle(classes)

    # Optional: report the fitted parameters of every class
    for c, estimate in params.items():
        mu_c, cov_c = estimate
        print(f"Class {c}:")
        print(" mu =", mu_c)
        print(" cov =\n", cov_c)
        print()

    # Plot 3D surfaces
    plot_gaussians_3d(X, params, grid_size=50)
|
||||
20
src/toolbox.py
Normal file
20
src/toolbox.py
Normal file
@ -0,0 +1,20 @@
|
||||
# -----------------------------
|
||||
# Toolbox
|
||||
# -----------------------------
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def github_raw(user, repo, branch, path):
    """
    Builds the raw.githubusercontent.com URL of a file in a GitHub repository.

    Parameters
    ----------
    user : str
        Account segment of the URL.
    repo : str
        Repository segment of the URL.
    branch : str
        Branch segment of the URL.
    path : str
        Path of the file, appended after the branch.

    Returns
    -------
    url : str
        The assembled URL.
    """
    url = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"
    return url
|
||||
|
||||
# Raw-content URLs of the CSV files in the hoo2/PR-Assignment2025_26 repository (master branch)
dataset1 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/dataset1.csv")
dataset2 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/dataset2.csv")
dataset3 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/dataset3.csv")
testset = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/testset.csv")
|
||||
|
||||
def load_csv(path, header=None):
    """
    Reads a CSV source into a pandas DataFrame.

    Parameters
    ----------
    path :
        Source handed unchanged to `pandas.read_csv`.
    header :
        Forwarded as the `header` argument of `pandas.read_csv`; the default
        None means no row of the file is used as column names.

    Returns
    -------
    frame : DataFrame
        The parsed contents.
    """
    frame = pd.read_csv(path, header=header)
    return frame
|
||||
Loading…
x
Reference in New Issue
Block a user