From 49948887c195dc19a9718072a20b44dddf5f92db Mon Sep 17 00:00:00 2001
From: Christos Choutouridis <hoo2@hoo2.net>
Date: Tue, 9 Dec 2025 20:08:04 +0200
Subject: [PATCH] Part A local code added

---
 .gitignore       |   3 +
 requirements.txt |   4 +
 src/.gitignore   |   3 +
 src/partA.py     | 239 +++++++++++++++++++++++++++++++++++++++++++++++
 src/toolbox.py   |  20 ++++
 5 files changed, 269 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 src/.gitignore
 create mode 100644 src/partA.py
 create mode 100644 src/toolbox.py

diff --git a/.gitignore b/.gitignore
index 0af73dc..b1486cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Python
+.venv/*
+
 # IDEs
 .idea/*
 
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f30f7b3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+pandas
+matplotlib
+
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..18a258a
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,3 @@
+# Python
+__pycache__/*
+
diff --git a/src/partA.py b/src/partA.py
new file mode 100644
index 0000000..68f1961
--- /dev/null
+++ b/src/partA.py
@@ -0,0 +1,239 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from toolbox import *
+
+from typing import Tuple, Dict
+from pandas import DataFrame
+
+
+# --------------------------------------------------
+# Part A: dataset splitting and MLE parameter estimation
+# --------------------------------------------------
+def split_dataset_by_class(df: DataFrame) -> Tuple[np.ndarray, np.ndarray, Dict[int, np.ndarray]]:
+    """
+    Separate a labelled dataset into features, labels and per-class groups.
+
+    The layout of `df` is assumed to be: every column but the last holds a
+    feature, and the last column holds the class label.
+
+    Parameters
+    ----------
+    df : DataFrame
+        Data samples, one row per sample.
+
+    Returns
+    -------
+    X : ndarray, shape (N, d)
+        Feature matrix (all columns except the last).
+    y : ndarray, shape (N,)
+        Labels (the last column).
+    classes : dict
+        Maps each distinct label found in `y` to the rows of X carrying it.
+
+    Example
+    -------
+        X, y, classes = split_dataset_by_class(df)
+    """
+    label_col = df.shape[1] - 1              # Index of the last column
+    X = df.iloc[:, :label_col].values        # Everything left of the label
+    y = df.iloc[:, label_col].values         # The label column itself
+    classes = {label: X[y == label] for label in np.unique(y)}   # label -> samples
+    return X, y, classes
+
+
+def mle_mean(X: np.ndarray) -> np.ndarray:
+    """
+    Maximum-likelihood estimate of the mean: the per-feature sample average.
+
+    Parameters
+    ----------
+    X : ndarray, shape (N, d)
+        Data samples, one row per sample.
+
+    Returns
+    -------
+    mu : ndarray, shape (d,)
+        Column-wise sum of X divided by the number of rows N.
+    """
+    return X.sum(axis=0) / len(X)
+
+
+def mle_covariance(X: np.ndarray, mu: np.ndarray) -> np.ndarray:
+    """
+    Maximum-likelihood estimate of the covariance matrix around `mu`.
+    The scatter matrix is normalised by N (biased estimator), not by N-1.
+
+    Parameters
+    ----------
+    X : ndarray, shape (N, d)
+        Data samples, one row per sample.
+    mu : ndarray, shape (d,)
+        Mean vector subtracted from every sample.
+
+    Returns
+    -------
+    cov : ndarray, shape (d, d)
+        Average outer product of the centred samples.
+    """
+    centred = X - mu                      # Broadcasts mu over the N rows
+    scatter = centred.T @ centred         # (d, d) sum of outer products
+    n_samples = X.shape[0]
+    return scatter / n_samples
+
+
+def estimate_gaussians_mle(classes: Dict[int, np.ndarray]) -> Dict[int, Tuple[np.ndarray, np.ndarray]]:
+    """
+    Fit one Gaussian per class using the MLE mean and covariance.
+
+    Parameters
+    ----------
+    classes : dict
+        Maps class label -> samples of that class, shape (N_c, d).
+
+    Returns
+    -------
+    params : dict
+        Maps class label -> (mu, cov), with mu of shape (d,) and
+        cov of shape (d, d), in the same key order as `classes`.
+    """
+    def fit(samples: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        # The covariance is taken around the mean estimated from the
+        # same samples, so the mean must be computed first.
+        mean = mle_mean(samples)
+        return mean, mle_covariance(samples, mean)
+
+    # One (mu, cov) pair per class label
+    return {label: fit(samples) for label, samples in classes.items()}
+
+
+# --------------------------------------------------
+# Part A: Gaussian pdf and grid computation
+# --------------------------------------------------
+def gaussian_pdf(point: np.ndarray, mu: np.ndarray, cov: np.ndarray) -> float:
+    """
+    Evaluate a d-dimensional Gaussian density N(mu, cov) at one point.
+
+    Parameters
+    ----------
+    point : ndarray, shape (d,)
+        Location at which the density is evaluated.
+    mu : ndarray, shape (d,)
+        Mean vector of the distribution.
+    cov : ndarray, shape (d, d)
+        Covariance matrix; it is inverted, so it must be non-singular.
+
+    Returns
+    -------
+    value : float
+        Density value at `point`.
+    """
+    dim = mu.shape[0]
+    offset = point - mu                   # Displacement from the mean
+    precision = np.linalg.inv(cov)        # Inverse covariance
+
+    # Denominator of the density: sqrt((2*pi)^d * det(cov))
+    scale = np.sqrt(((2 * np.pi) ** dim) * np.linalg.det(cov))
+    # Exponent: -1/2 * offset^T * cov^-1 * offset (squared Mahalanobis distance)
+    log_kernel = -0.5 * offset.T @ precision @ offset
+
+    return float((1.0 / scale) * np.exp(log_kernel))
+
+
+def compute_gaussian_grid(
+    X: np.ndarray, mu: np.ndarray, cov: np.ndarray, grid_size: int = 50
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Evaluate a Gaussian pdf on a 2D grid spanning the first two dimensions of X.
+    For d > 2 the marginal over dimensions 0 and 1 is used (mu[:2], cov[:2, :2]).
+
+    Parameters
+    ----------
+    X : ndarray, shape (N, d), d >= 2
+        Data samples (only used to define the grid range for dims 0 and 1).
+    mu : ndarray, shape (d,)
+        Mean vector.
+    cov : ndarray, shape (d, d)
+        Covariance matrix.
+    grid_size : int
+        Number of points per axis.
+
+    Returns
+    -------
+    Xgrid, Ygrid, Z : ndarray, each of shape (grid_size, grid_size)
+        Meshgrid coordinates along dimension 0,
+        meshgrid coordinates along dimension 1,
+        pdf value at each grid point.
+    """
+    # The grid points are 2D, so evaluate the 2D marginal of the Gaussian
+    mu2, cov2 = np.asarray(mu)[:2], np.asarray(cov)[:2, :2]
+    x_vals = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), grid_size)
+    y_vals = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), grid_size)
+
+    Xgrid, Ygrid = np.meshgrid(x_vals, y_vals)
+    Z = np.zeros_like(Xgrid, dtype=float)
+
+    for i, j in np.ndindex(*Xgrid.shape):
+        point = np.array([Xgrid[i, j], Ygrid[i, j]])
+        Z[i, j] = gaussian_pdf(point, mu2, cov2)
+
+    return Xgrid, Ygrid, Z
+
+
+# --------------------------------------------------
+# Part A: 3D plotting for multiple classes
+# --------------------------------------------------
+def plot_gaussians_3d(
+    X: np.ndarray, params: Dict[int, Tuple[np.ndarray, np.ndarray]], grid_size: int = 50
+) -> None:
+    """
+    Draw the per-class Gaussian pdf surfaces together on one 3D figure and show it.
+
+    Parameters
+    ----------
+    X : ndarray, shape (N, 2)
+        All data samples; their extent defines the plotted range.
+    params : dict
+        Maps class label -> (mu, cov) of the Gaussian fitted to that class.
+    grid_size : int
+        Number of grid points per axis used to evaluate each pdf.
+    """
+    ax = plt.figure(figsize=(12, 8)).add_subplot(111, projection='3d')
+
+    # One semi-transparent surface per class, all evaluated on the same grid range
+    for label, (mean, covariance) in params.items():
+        surface = compute_gaussian_grid(X, mean, covariance, grid_size=grid_size)
+        ax.plot_surface(*surface, alpha=0.6, label=f"Class {label}")
+
+    ax.set_xlabel("X1")
+    ax.set_ylabel("X2")
+    ax.set_zlabel("pdf")
+    ax.set_title("MLE Estimated 2D Gaussians (all classes)")
+    plt.show()
+
+
+
+# --------------------------------------------------
+# Part A: convenience runner (optional)
+# --------------------------------------------------
+if __name__ == "__main__":
+    # Convenience runner for the whole Part A pipeline:
+    #   1. load the dataset
+    #   2. split it by class
+    #   3. estimate the Gaussian parameters (MLE) of every class
+    #   4. plot the 3D pdf surfaces
+
+    # `dataset1` and `load_csv` are provided by the toolbox module
+    data = load_csv(dataset1, header=None)
+    features, labels, per_class = split_dataset_by_class(data)
+
+    gaussians = estimate_gaussians_mle(per_class)
+
+    # Report the estimated parameters of each class
+    for label, (mean, covariance) in gaussians.items():
+        print(f"Class {label}:")
+        print("  mu  =", mean)
+        print("  cov =\n", covariance)
+        print()
+
+    # Surfaces of all classes share one figure
+    plot_gaussians_3d(features, gaussians, grid_size=50)
\ No newline at end of file
diff --git a/src/toolbox.py b/src/toolbox.py
new file mode 100644
index 0000000..2f4eb21
--- /dev/null
+++ b/src/toolbox.py
@@ -0,0 +1,20 @@
+# -----------------------------
+# Toolbox
+# -----------------------------
+
+import pandas as pd
+
+
+def github_raw(user, repo, branch, path):  # Raw-content URL of `path` in a GitHub repo
+    return "https://raw.githubusercontent.com/{}/{}/{}/{}".format(user, repo, branch, path)
+
+dataset1 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/dataset1.csv")  # Raw CSV URLs
+dataset2 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/dataset2.csv")
+dataset3 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/dataset3.csv")
+testset = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/testset.csv")
+
+def load_csv(path, header=None):
+    """Read the CSV at `path` (file path, URL or buffer) into a pandas DataFrame."""
+    # `header` is forwarded unchanged; None means the file has no header row
+    frame = pd.read_csv(path, header=header)
+    return frame
\ No newline at end of file