From d2d715c7cf7d9354f92dcd9d6aef1d21589d2de1 Mon Sep 17 00:00:00 2001
From: Christos Choutouridis <cchoutou@ece.auth.gr>
Date: Thu, 11 Dec 2025 19:11:12 +0200
Subject: [PATCH] A first proof of concept for Part D added

---
 requirements.txt |   2 +-
 src/partD.py     | 250 +++++++++++++++++++++++++++++++++++++++++++++++
 src/toolbox.py   |   3 +-
 3 files changed, 253 insertions(+), 2 deletions(-)
 create mode 100644 src/partD.py

diff --git a/requirements.txt b/requirements.txt
index f30f7b3..7c7275e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 numpy
 pandas
 matplotlib
-
+scikit-learn
diff --git a/src/partD.py b/src/partD.py
new file mode 100644
index 0000000..bdf8c5c
--- /dev/null
+++ b/src/partD.py
@@ -0,0 +1,250 @@
# ------------------------------------------------------------
# Part D - TV Dataset Classifier
# Pattern Recognition – Semester Assignment
#
# Author:
#   Christos Choutouridis (ΑΕΜ 8997)
#   cchoutou@ece.auth.gr
#
# Description:
#   This module implements a complete classification pipeline
#   for the high-dimensional TV dataset (Part D):
#     - Loading training and test data
#     - Basic preprocessing (scaling, optional dimensionality reduction)
#     - Training a supervised classifier
#     - Evaluating on a validation split
#     - Predicting labels for the provided test set
#     - Saving labels to labelsX.npy as required by the assignment
#
# Notes:
#   The exact choice of classifier and preprocessing steps can
#   be modified. The current skeleton uses a RandomForest model
#   as a robust default for high-dimensional data.
# ------------------------------------------------------------

from typing import Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
# from sklearn.decomposition import PCA  # Optional, if you decide to use PCA

from toolbox import load_csv, datasetTV, datasetTest


# --------------------------------------------------
# Data loading
# --------------------------------------------------
def load_tv_training() -> Tuple[np.ndarray, np.ndarray]:
    """
    Loads the TV training dataset (Part D) and splits it into
    features and labels.

    Returns
    -------
    tuple:
        X_train (ndarray, shape (N_train, D)):
            Training feature matrix.
        y_train (ndarray, shape (N_train,)):
            Training class labels (1..5).
    """
    df = load_csv(datasetTV, header=None)
    data = df.values
    X_train = data[:, :-1]
    y_train = data[:, -1].astype(int)
    return X_train, y_train


def load_tv_test() -> np.ndarray:
    """
    Loads the TV test dataset (Part D) without labels.

    Returns
    -------
    X_test (ndarray, shape (N_test, D)):
        Test feature matrix (no labels).
    """
    df = load_csv(datasetTest, header=None)
    X_test = df.values
    return X_test
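

# Usage sketch (illustrative only, not part of the graded pipeline; the
# actual shapes depend on the provided datasetTV/datasetTest CSV files):
#
#   X_train, y_train = load_tv_training()  # X_train: (N_train, D), y_train: (N_train,)
#   X_test = load_tv_test()                # X_test: (N_test, D)
#   assert X_train.shape[1] == X_test.shape[1]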


# --------------------------------------------------
# Preprocessing
# --------------------------------------------------
def preprocess_features(
    X_train: np.ndarray,
    X_test: np.ndarray | None = None,
) -> Tuple[np.ndarray, np.ndarray | None, StandardScaler]:
    """
    Applies basic preprocessing to the feature matrices.
    By default, standardizes features (zero mean, unit variance).

    Parameters
    ----------
    X_train : ndarray, shape (N_train, D)
        Training features.
    X_test : ndarray, shape (N_test, D) or None
        Test features, if available.

    Returns
    -------
    tuple:
        X_train_proc (ndarray):
            Preprocessed training features.
        X_test_proc (ndarray or None):
            Preprocessed test features (if X_test is not None).
        scaler (StandardScaler):
            Fitted scaler object (can be reused later).
    """
    scaler = StandardScaler()
    X_train_proc = scaler.fit_transform(X_train)

    if X_test is not None:
        X_test_proc = scaler.transform(X_test)
    else:
        X_test_proc = None

    # If you later want PCA:
    # pca = PCA(n_components=some_k)
    # X_train_proc = pca.fit_transform(X_train_proc)
    # if X_test_proc is not None:
    #     X_test_proc = pca.transform(X_test_proc)

    return X_train_proc, X_test_proc, scaler


# --------------------------------------------------
# Model training & evaluation
# --------------------------------------------------
def train_classifier(X_train: np.ndarray, y_train: np.ndarray) -> RandomForestClassifier:
    """
    Trains a supervised classifier on the given features and labels.

    Currently uses a RandomForestClassifier as a robust default,
    but this can be replaced with any other model.

    Parameters
    ----------
    X_train : ndarray, shape (N_train, D)
    y_train : ndarray, shape (N_train,)

    Returns
    -------
    model (RandomForestClassifier):
        Trained classifier.
    """
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=0,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    return model


def evaluate_classifier(
    model,
    X_val: np.ndarray,
    y_val: np.ndarray,
) -> float:
    """
    Evaluates a trained classifier on a validation set.

    Parameters
    ----------
    model :
        Any scikit-learn-like classifier with a .predict method.
    X_val : ndarray, shape (N_val, D)
    y_val : ndarray, shape (N_val,)

    Returns
    -------
    acc : float
        Classification accuracy on the validation set.
    """
    y_pred = model.predict(X_val)
    acc = float(np.mean(y_pred == y_val))
    return acc


# --------------------------------------------------
# Prediction & saving labels
# --------------------------------------------------
def predict_labels(
    model,
    X_test: np.ndarray,
) -> np.ndarray:
    """
    Predicts labels for the TV test set.

    Parameters
    ----------
    model :
        Trained classifier.
    X_test : ndarray, shape (N_test, D)

    Returns
    -------
    labels (ndarray, shape (N_test,)):
        Predicted class labels for each test sample.
    """
    labels = model.predict(X_test)
    return labels.astype(int)


def save_labels(labels: np.ndarray, filename: str = "labelsX.npy") -> None:
    """
    Saves predicted labels to a .npy file as required by the assignment.

    Parameters
    ----------
    labels : ndarray, shape (N_test,)
        Predicted class labels.
    filename : str
        Output filename (default: "labelsX.npy").
    """
    np.save(filename, labels)
    print(f"Saved labels to {filename} with shape {labels.shape}")


# --------------------------------------------------
# Main pipeline for Part D
# --------------------------------------------------
if __name__ == "__main__":
    # 1. Load training and test sets
    X_train_raw, y_train = load_tv_training()
    X_test_raw = load_tv_test()

    # 2. Train/validation split on the training data
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train_raw,
        y_train,
        test_size=0.2,
        random_state=0,
        stratify=y_train,
    )

    # 3. Preprocess features (scaling, optional PCA)
    X_tr_proc, X_val_proc, scaler = preprocess_features(X_tr, X_val)

    # 4. Train classifier
    model = train_classifier(X_tr_proc, y_tr)
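
    # Optional sanity check, kept as a sketch rather than required code:
    # k-fold cross-validation on the training split gives a more stable
    # accuracy estimate than the single hold-out split in step 5 below.
    # (The scaler was fitted on the whole training split, so this estimate
    # is slightly optimistic; wrapping the steps in a sklearn Pipeline
    # would avoid that leakage.)
    # from sklearn.model_selection import cross_val_score
    # cv_scores = cross_val_score(model, X_tr_proc, y_tr, cv=5, n_jobs=-1)
    # print(f"5-fold CV accuracy: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")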

    # 5. Evaluate on validation set
    val_acc = evaluate_classifier(model, X_val_proc, y_val)
    print(f"Validation accuracy: {val_acc:.4f}")

    # 6. Retrain on the full training set (optional, but usually beneficial)
    X_full_proc, X_test_proc, _ = preprocess_features(X_train_raw, X_test_raw)
    final_model = train_classifier(X_full_proc, y_train)

    # 7. Predict labels for official test set
    labels = predict_labels(final_model, X_test_proc)

    # 8. Save labels to labelsX.npy
    save_labels(labels, filename="labelsX.npy")
diff --git a/src/toolbox.py b/src/toolbox.py
index f29073a..41e0399 100644
--- a/src/toolbox.py
+++ b/src/toolbox.py
@@ -21,7 +21,8 @@ dataset1 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/datase
 dataset2 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/dataset2.csv")
 dataset3 = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/dataset3.csv")
 testset = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/testset.csv")
-
+datasetTV = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/datasetTV.csv")
+datasetTest = github_raw("hoo2", "PR-Assignment2025_26", "master", "datasets/datasetTest.csv")
 
 def load_csv(path, header=None):
     """