Update code with the latest local version

2026-01-13 23:54:52 +02:00 · 2026-01-13 23:54:52 +02:00 · 7acc63bbe9
commit 7acc63bbe9
parent 7af1d66a8f
17 changed files with 306 additions and 55 deletions
--- a/src/figures/Confusion
+++ b/src/figures/Confusion
--- a/src/figures/Confusion
+++ b/src/figures/Confusion
--- a/src/figures/Confusion
+++ b/src/figures/Confusion
--- a/src/figures/Confusion
+++ b/src/figures/Confusion
--- a/src/figures/tuning_results.csv
+++ b/src/figures/tuning_results.csv
@ -0,0 +1,2 @@
+config,preprocess,model,params,mean_acc,std_acc
+scale + svm,scale,svm,"{'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}",0.8581719138625145,0.013348223889216927
--- a/src/labelsX.npy
+++ b/src/labelsX.npy
--- a/src/labelsX_scale_mlp.npy
+++ b/src/labelsX_scale_mlp.npy
--- a/src/labelsX_scale_pca_85_knn.npy
+++ b/src/labelsX_scale_pca_85_knn.npy
--- a/src/labelsX_scale_rf.npy
+++ b/src/labelsX_scale_rf.npy
--- a/src/labelsX_scale_svm.npy
+++ b/src/labelsX_scale_svm.npy
--- a/src/output.txt
+++ b/src/output.txt
@ -0,0 +1,185 @@
+/home/hoo2/Work/AUTh/PatternRecognition/Assignment_2025-26/.venv/bin/python /home/hoo2/Work/AUTh/PatternRecognition/Assignment_2025-26/src/partD.py all 
+[       scale] [       gnb] val_acc=0.7095
+[       scale] [        rf] val_acc=0.8205
+[       scale] [    logreg] val_acc=0.7730
+[       scale] [linear_svm] val_acc=0.7707
+[       scale] [       svm] val_acc=0.8593
+[       scale] [       mlp] val_acc=0.8382
+[       scale] [       knn] val_acc=0.8342
+[       scale] [  adaboost] val_acc=0.6832
+[scale_pca_66] [       gnb] val_acc=0.7524
+[scale_pca_66] [        rf] val_acc=0.8096
+[scale_pca_66] [    logreg] val_acc=0.7862
+[scale_pca_66] [linear_svm] val_acc=0.7736
+[scale_pca_66] [       svm] val_acc=0.8582
+[scale_pca_66] [       mlp] val_acc=0.8359
+[scale_pca_66] [       knn] val_acc=0.8370
+[scale_pca_66] [  adaboost] val_acc=0.6878
+[scale_pca_75] [       gnb] val_acc=0.7547
+[scale_pca_75] [        rf] val_acc=0.8130
+[scale_pca_75] [    logreg] val_acc=0.7839
+[scale_pca_75] [linear_svm] val_acc=0.7696
+[scale_pca_75] [       svm] val_acc=0.8565
+[scale_pca_75] [       mlp] val_acc=0.8216
+[scale_pca_75] [       knn] val_acc=0.8370
+[scale_pca_75] [  adaboost] val_acc=0.6878
+[scale_pca_85] [       gnb] val_acc=0.7501
+[scale_pca_85] [        rf] val_acc=0.8033
+[scale_pca_85] [    logreg] val_acc=0.7810
+[scale_pca_85] [linear_svm] val_acc=0.7662
+[scale_pca_85] [       svm] val_acc=0.8588
+[scale_pca_85] [       mlp] val_acc=0.8188
+[scale_pca_85] [       knn] val_acc=0.8388
+[scale_pca_85] [  adaboost] val_acc=0.6998
+
+=== Investigation summary ===
+model
+svm           0.859348
+knn           0.838765
+mlp           0.838193
+rf            0.820469
+logreg        0.786164
+linear_svm    0.773585
+gnb           0.754717
+adaboost      0.699828
+
+Selected top-3 models for further analysis: ['svm', 'knn', 'mlp']
+
+Best configuration overall: preprocess=scale, model=svm, val_acc=0.8593
+
+Classification report (best config):
+              precision    recall  f1-score   support
+
+           1       0.94      0.96      0.95       354
+           2       0.76      0.73      0.75       344
+           3       0.92      0.93      0.93       351
+           4       0.91      0.91      0.91       343
+           5       0.75      0.77      0.76       357
+
+    accuracy                           0.86      1749
+   macro avg       0.86      0.86      0.86      1749
+weighted avg       0.86      0.86      0.86      1749
+
+
+[TUNING] scale + rf (cv=5) ...
+[scale | rf] combo   1/1 mean=0.8228 params={'n_estimators': 400, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 1}
+  best mean_acc=0.8228 (std=0.0121) params={'n_estimators': 400, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 1}
+
+[TUNING] scale + mlp (cv=5) ...
+[scale | mlp] combo   1/1 mean=0.8407 params={'hidden_layer_sizes': (128,), 'alpha': 0.001, 'learning_rate_init': 0.01, 'activation': 'relu', 'solver': 'adam'}
+  best mean_acc=0.8407 (std=0.0098) params={'hidden_layer_sizes': (128,), 'alpha': 0.001, 'learning_rate_init': 0.01, 'activation': 'relu', 'solver': 'adam'}
+
+[TUNING] scale_pca_85 + knn (cv=5) ...
+[scale_pca_85 | knn] combo   1/1 mean=0.8313 params={'n_neighbors': 9, 'weights': 'distance', 'p': 2}
+  best mean_acc=0.8313 (std=0.0117) params={'n_neighbors': 9, 'weights': 'distance', 'p': 2}
+
+[TUNING] scale + svm (cv=5) ...
+[scale | svm] combo   1/1 mean=0.8582 params={'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}
+  best mean_acc=0.8582 (std=0.0133) params={'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}
+
+=== Tuning summary (best overall) ===
+{'name': 'scale + svm', 'preprocess_spec': {'type': 'pipeline', 'steps': [{'type': 'scaler', 'params': {}}]}, 'preprocess_name': 'scale', 'model': 'svm', 'params': {'kernel': 'rbf', 'C': 4, 'gamma': 'scale', 'class_weight': None}, 'mean_acc': 0.8581719138625145, 'std_acc': 0.013348223889216927}
+
+============================================================
+[FINAL - VALIDATION] scale + rf
+Confusion matrix:
+[[338   6   5   3   2]
+ [  4 239  11  12  78]
+ [ 11   2 316  21   1]
+ [  3  12  17 299  12]
+ [ 13  71   4   9 260]]
+
+Classification report:
+              precision    recall  f1-score   support
+
+           1       0.92      0.95      0.93       354
+           2       0.72      0.69      0.71       344
+           3       0.90      0.90      0.90       351
+           4       0.87      0.87      0.87       343
+           5       0.74      0.73      0.73       357
+
+    accuracy                           0.83      1749
+   macro avg       0.83      0.83      0.83      1749
+weighted avg       0.83      0.83      0.83      1749
+
+============================================================
+[FINAL] scale_rf: saved labelsX_scale_rf.npy shape=(6955,)
+
+============================================================
+[FINAL - VALIDATION] scale + mlp
+Confusion matrix:
+[[338   1   9   2   4]
+ [  5 244  13   7  75]
+ [ 10   3 320  16   2]
+ [  0  14  16 302  11]
+ [  8  74   1  16 258]]
+
+Classification report:
+              precision    recall  f1-score   support
+
+           1       0.94      0.95      0.95       354
+           2       0.73      0.71      0.72       344
+           3       0.89      0.91      0.90       351
+           4       0.88      0.88      0.88       343
+           5       0.74      0.72      0.73       357
+
+    accuracy                           0.84      1749
+   macro avg       0.83      0.84      0.83      1749
+weighted avg       0.83      0.84      0.84      1749
+
+============================================================
+[FINAL] scale_mlp: saved labelsX_scale_mlp.npy shape=(6955,)
+
+============================================================
+[FINAL - VALIDATION] scale_pca_85 + knn
+Confusion matrix:
+[[346   2   5   0   1]
+ [  5 193   9   7 130]
+ [ 19   1 319  11   1]
+ [  4   9  17 301  12]
+ [  8  33   1   6 309]]
+
+Classification report:
+              precision    recall  f1-score   support
+
+           1       0.91      0.98      0.94       354
+           2       0.81      0.56      0.66       344
+           3       0.91      0.91      0.91       351
+           4       0.93      0.88      0.90       343
+           5       0.68      0.87      0.76       357
+
+    accuracy                           0.84      1749
+   macro avg       0.85      0.84      0.84      1749
+weighted avg       0.85      0.84      0.84      1749
+
+============================================================
+[FINAL] scale_pca_85_knn: saved labelsX_scale_pca_85_knn.npy shape=(6955,)
+
+============================================================
+[FINAL - VALIDATION] scale + svm
+Confusion matrix:
+[[340   2   8   1   3]
+ [  3 251   9   6  75]
+ [  7   1 327  14   2]
+ [  0  12   9 311  11]
+ [ 11  63   1   8 274]]
+
+Classification report:
+              precision    recall  f1-score   support
+
+           1       0.94      0.96      0.95       354
+           2       0.76      0.73      0.75       344
+           3       0.92      0.93      0.93       351
+           4       0.91      0.91      0.91       343
+           5       0.75      0.77      0.76       357
+
+    accuracy                           0.86      1749
+   macro avg       0.86      0.86      0.86      1749
+weighted avg       0.86      0.86      0.86      1749
+
+============================================================
+[FINAL] scale_svm: saved labelsX_scale_svm.npy shape=(6955,)
+Saved labels to labelsX.npy with shape (6955,)
+
+Process finished with exit code 0
+
--- a/src/partA.py
+++ b/src/partA.py
@ -197,6 +197,17 @@ def plot_gaussians_3d(
    ax.set_zlabel("pdf")
    plt.show()

+    # plt.figure(figsize=(6, 5))
+    # plt.scatter(X[:, 0], X[:, 1], s=10, alpha=0.35)
+    # plt.contour(Xgrid, Ygrid, Z, levels=8, linewidths=1.5)
+    #
+    # plt.title("Estimated Gaussian density (ML)")
+    # plt.xlabel("x₁")
+    # plt.ylabel("x₂")
+    #
+    # plt.tight_layout()
+    # plt.show()
+


 # --------------------------------------------------
--- a/src/partB.py
+++ b/src/partB.py
@ -302,7 +302,7 @@ def plot_histogram_with_pdf(
    plt.plot(x_plot, pdf_true, label=f"True N({mu_true}, {var_true}) pdf")
    plt.xlabel("x")
    plt.ylabel("Density")
-    plt.title("Dataset2 histogram vs true N({mu_true}, {var_true}) pdf")
+    plt.title(f"Dataset2 histogram vs true N({mu_true}, {var_true}) pdf")
    plt.legend()
    plt.grid(True)
    plt.show()
--- a/src/partD.py
+++ b/src/partD.py
@ -33,6 +33,8 @@ import matplotlib as mpl
 import matplotlib.pyplot as plt

 from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
@ -389,8 +391,9 @@ def plot_accuracy_bars(df: pd.DataFrame, title: str) -> None:
    plt.grid(True, axis="y", alpha=0.3)
    plt.legend()
    plt.tight_layout()
-    plt.show(block=False)
    plt.savefig(f"figures/" + title + ".png", dpi=300)
+    plt.show(block=False)
+    plt.pause(2)
    plt.close()


@ -404,8 +407,9 @@ def plot_confusion(y_true: np.ndarray, y_pred: np.ndarray, title: str) -> None:
    disp.plot(ax=ax, cmap="Blues", colorbar=True)
    ax.set_title(title)
    plt.tight_layout()
-    plt.show(block=False)
    plt.savefig(f"figures/" + title + ".png", dpi=300)
+    plt.show(block=False)
+    plt.pause(2)
    plt.close()


@ -457,7 +461,7 @@ def plot_pca_scatter_2d(
        bbox_inches="tight",
    )
    plt.show(block=False)
-    plt.pause(0.001)
+    plt.pause(2)
    plt.close()


@ -497,7 +501,7 @@ def plot_feature_separability(
        bbox_inches="tight",
    )
    plt.show(block=False)
-    plt.pause(0.001)
+    plt.pause(2)
    plt.close()

    # Plot worst
@ -514,7 +518,7 @@ def plot_feature_separability(
        bbox_inches="tight",
    )
    plt.show(block=False)
-    plt.pause(0.001)
+    plt.pause(2)
    plt.close()

    return best_idx, worst_idx
@ -554,7 +558,7 @@ def plot_feature_distributions_grid(
        bbox_inches="tight",
    )
    plt.show(block=False)
-    plt.pause(0.001)
+    plt.pause(2)
    plt.close()


@ -711,6 +715,16 @@ def final_training_for_all_best_configs(
        model = train_classifier(X_tr_p, y_tr, model_spec)
        y_val_pred = model.predict(X_val_p).astype(int)

+        # --- console output: confusion matrix + report ---
+        cm = confusion_matrix(y_val, y_val_pred)
+        print("\n" + "=" * 60)
+        print(f"[FINAL - VALIDATION] {preprocess_name} + {model_key}")
+        print("Confusion matrix:")
+        print(cm)
+        print("\nClassification report:")
+        print(classification_report(y_val, y_val_pred))
+        print("=" * 60)
+
        plot_confusion(
            y_val,
            y_val_pred,
@ -776,12 +790,12 @@ def train_final_and_predict(
 # --------------------------------------------------
 # Helpers
 # --------------------------------------------------
-def effect_size_per_feature(X2: np.ndarray, X5: np.ndarray, eps: float = 1e-12) -> np.ndarray:
+def effect_size_per_feature(Xa: np.ndarray, Xb: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """
    Computes a simple per-feature separability score between two classes.

    Score (Cohen-like d):
-        d_j = |mu2 - mu5| / sqrt( (var2 + var5)/2 )
+        d_j = |mu_a - mu_b| / sqrt( (var_a + var_b)/2 )

    Larger d => better separation (less overlap).
    Smaller d => stronger overlap.
@ -791,14 +805,14 @@ def effect_size_per_feature(X2: np.ndarray, X5: np.ndarray, eps: float = 1e-12)
    d : ndarray, shape (D,)
        Per-feature separability scores.
    """
-    mu2 = np.mean(X2, axis=0)
-    mu5 = np.mean(X5, axis=0)
+    mu_a = np.mean(Xa, axis=0)
+    mu_b = np.mean(Xb, axis=0)

-    var2 = np.var(X2, axis=0)
-    var5 = np.var(X5, axis=0)
+    var_a = np.var(Xa, axis=0)
+    var_b = np.var(Xb, axis=0)

-    pooled = np.sqrt(0.5 * (var2 + var5) + eps)
-    d = np.abs(mu2 - mu5) / pooled
+    pooled = np.sqrt(0.5 * (var_a + var_b) + eps)
+    d = np.abs(mu_a - mu_b) / pooled
    return d


@ -814,8 +828,6 @@ def expand_param_grid(param_grid: Dict[str, List[Any]]) -> List[Dict[str, Any]]:
    return combos


-from sklearn.model_selection import StratifiedKFold
-
 def stratified_kfold_indices(y: np.ndarray, n_splits: int, seed: int = 0):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return list(skf.split(np.zeros_like(y), y))
@ -1184,52 +1196,93 @@ TUNING_SPECS = [
    #     "preprocess": PREPROCESS_SPECS["scale"],
    #     "model": "rf",
    #     "param_grid": {
-    #         "n_estimators": [400, 800, 1200, 1400], #[200, 400, 800],
-    #         "max_depth": [None], #[None, 20, 40, 80],
-    #         "max_features": ["sqrt"], #["sqrt", "log2", 0.5],
-    #         "min_samples_split": [2, 4, 8, 10],#[2, 5, 10],
-    #         "min_samples_leaf": [1, 2, 4], #[1, 2, 4],
+    #         # Tuned values
+    #         "n_estimators":      [400],
+    #         "max_depth":         [None],
+    #         "max_features":      ["sqrt"],
+    #         "min_samples_split": [4],
+    #         "min_samples_leaf":  [1],
+    #         #
+    #         # Tuned with the values below
+    #         #   Note:
+    #         #       Uncomment the following if you want to run the entire tuning process again!
+    #         #       ** Takes a LOT of time **
+    #         # "n_estimators":      [200, 400, 800, 1200, 1400],
+    #         # "max_depth":         [None, 20, 40, 80],
+    #         # "max_features":      ["sqrt", "log2", 0.5],
+    #         # "min_samples_split": [2, 4, 5, 8, 10],
+    #         # "min_samples_leaf":  [1, 2, 4],
+    #     },
+    #     "cv": 5,
+    # },
+    # {
+    #     "name": "scale + mlp",
+    #     "preprocess_name": "scale",
+    #     "preprocess": PREPROCESS_SPECS["scale"],
+    #     "model": "mlp",
+    #     "param_grid": {
+    #         # Tuned values
+    #         "hidden_layer_sizes": [(128,)],
+    #         "alpha":              [0.001],
+    #         "learning_rate_init": [0.01],
+    #         "activation":         ["relu"],
+    #         "solver":             ["adam"],
+    #         #
+    #         # Tuned with the values below
+    #         #   Note:
+    #         #       Uncomment the following if you want to run the entire tuning process again!
+    #         #       ** Takes a LOT of time **
+    #         # "hidden_layer_sizes": [(128, ), (128, 64), (256, 128), (128, 64, 32)],
+    #         # "alpha":              [1e-5, 1e-4, 1e-3, 0.01],
+    #         # "learning_rate_init": [1e-4, 1e-3, 0.01, 0.02],
+    #         # "activation":         ["relu", "tanh"],
+    #         # # "max_iter":           [2000],
+    #         # "solver":             ["adam", "sgd"],
+    #     },
+    #     "cv": 5,
+    # },
+    # {
+    #     "name": "scale_pca_85 + knn",
+    #     "preprocess_name": "scale_pca_85",
+    #     "preprocess": PREPROCESS_SPECS["scale_pca_85"],
+    #     "model": "knn",
+    #     "param_grid": {
+    #         # Tuned values
+    #         "n_neighbors": [9],
+    #         "weights": ["distance"],
+    #         "p": [2],
+    #         #
+    #         # Tuned with the values below
+    #         #   Note:
+    #         #       Uncomment the following if you want to run the entire tuning process again!
+    #         #       ** Takes a LOT of time **
+    #         # "n_neighbors": [5, 7, 8, 9, 10, 11, 15, 31, 42],
+    #         # "weights":     ["uniform", "distance"],
+    #         # "p":           [1, 2],
    #     },
    #     "cv": 5,
    # },
-    {
-        "name": "scale + mlp",
-        "preprocess_name": "scale",
-        "preprocess": PREPROCESS_SPECS["scale"],
-        "model": "mlp",
-        "param_grid": {
-            "hidden_layer_sizes": [(128, ), (128, 64), (256, 128), (128, 64, 32)],
-            "alpha": [1e-5, 1e-4, 1e-3],
-            "learning_rate_init": [1e-3, 0.01, 0.02],
-            "activation": ["relu"], #["relu", "tanh"],
-            # "max_iter": [2000],
-            "solver": ["adam"], #["adam", "sgd"],
-        },
-        "cv": 5,
-    },
-    {
-        "name": "scale_pca_85 + knn",
-        "preprocess_name": "scale_pca_85",
-        "preprocess": PREPROCESS_SPECS["scale_pca_85"],
-        "model": "knn",
-        "param_grid": {
-            "n_neighbors": [7, 8, 9, 10, 11, 15, 31, 42],
-            "weights": ["uniform", "distance"],
-            "p": [1, 2],
-        },
-        "cv": 5,
-    },
    {
        "name": "scale + svm",
        "preprocess_name": "scale",
        "preprocess": PREPROCESS_SPECS["scale"],
        "model": "svm",
        "param_grid": {
-            "kernel": ["rbf", "poly"],
-            "C": [3, 4, 5, 5.5, 6, 10],
-            "degree": [2, 3, 5],
-            "gamma": ["scale", "auto"],
+            # Tuned values
+            "kernel":       ["rbf"],
+            "C":            [4],
+            "gamma":        ["scale"],
            "class_weight": [None],
+            #
+            # Tuned with the values below
+            #   Note:
+            #       Uncomment the following if you want to run the entire tuning process again!
+            #       ** Takes a LOT of time **
+            # "kernel":       ["rbf", "poly"],
+            # "C":            [0.1, 0.3, 1, 3, 4, 5, 5.5, 6, 10, 30],
+            # # "degree":       [2, 3, 5],  # only used by the "poly" kernel
+            # "gamma":        ["scale", "auto", 0.1, 0.03, 0.01, 0.003, 0.001],
+            # "class_weight": [None, "balanced"],
        },
        "cv": 5,
    },
@ -1262,7 +1315,7 @@ if __name__ == "__main__":
        # Phase 1.2: visualization
        visualization_phase(results, df)

-        # Phase 1,3: problem demo
+        # Phase 1.3: problem demo
        problem_demonstration_phase(X_train_raw, y_train, class_a=2, class_b=5, top_k=9)

    if param == "phase2" or param == "all":
@ -1284,7 +1337,7 @@ if __name__ == "__main__":
            seed=0,
        )

-        # (Optional) also train/predict only for the best overall and save as the official submission file
+        # Also train/predict only for the best overall and save as the official submission file
        y_test_pred = train_final_and_predict(
            X_train_raw, y_train, X_test_raw, best_overall, labels_path="labelsX.npy"
        )
--- a/src/run1.zip
+++ b/src/run1.zip
--- a/src/run2.zip
+++ b/src/run2.zip
--- a/src/run3.zip
+++ b/src/run3.zip