Spaces:

Bachstelze
/

github_sync

Sleeping

App Files Files Community

Reem committed 19 days ago

Commit

08843cc

1 Parent(s): 216ab79

pca-analysis-&-cluster-visualization

Browse files

Files changed (3) hide show

A6/A6_Classification.ipynb +443 -442
A7/A7_PCA_with_clusters.csv +0 -0
A7/A7_Report.ipynb +0 -0

A6/A6_Classification.ipynb CHANGED Viewed

@@ -1,461 +1,462 @@
 {
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983",
-      "metadata": {
-        "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983"
-      },
-      "outputs": [],
-      "source": [
-        "import os\n",
-        "import pickle\n",
-        "import warnings\n",
-        "import numpy as np\n",
-        "import pandas as pd\n",
-        "import matplotlib.pyplot as plt\n",
-        "import seaborn as sns\n",
-        "from pathlib import Path\n",
-        "from scipy import stats\n",
-        "from sklearn.svm import SVC\n",
-        "from sklearn.model_selection import GridSearchCV\n",
-        "from time import time\n",
-        "\n",
-        "from sklearn.model_selection import (\n",
-        "    StratifiedKFold, cross_validate\n",
-        ")\n",
-        "from sklearn.pipeline import Pipeline\n",
-        "from sklearn.model_selection import cross_val_score\n",
-        "from sklearn.preprocessing import StandardScaler\n",
-        "from sklearn.metrics import (\n",
-        "    accuracy_score, precision_score, recall_score, f1_score,\n",
-        "    classification_report, confusion_matrix\n",
-        ")\n",
-        "from sklearn.linear_model import LogisticRegression\n",
-        "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
-        "from sklearn.neighbors import KNeighborsClassifier\n",
-        "from sklearn.naive_bayes import GaussianNB\n",
-        "from sklearn.ensemble import (\n",
-        "    RandomForestClassifier,\n",
-        "    VotingClassifier,\n",
-        "    BaggingClassifier,\n",
-        "    StackingClassifier,\n",
-        ")\n",
-        "import xgboost as xgb\n",
-        "import lightgbm as lgb\n",
-        "import pickle\n",
-        "warnings.filterwarnings('ignore')\n",
-        "np.random.seed(42)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d",
-      "metadata": {
-        "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d"
-      },
-      "outputs": [],
-      "source": [
-        "REPO_ROOT    = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
-        "DATA_DIR     = os.path.join(REPO_ROOT, 'Datasets_all')\n",
-        "OUT_DIR      = Path('models')\n",
-        "OUT_DIR.mkdir(exist_ok=True)\n",
-        "\n",
-        "RANDOM_STATE = 42\n",
-        "N_SPLITS     = 5\n",
-        "CHAMPION_F1  = 0.6484   # Score from A5b"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 3,
-      "id": "26dc4267-d9d1-4481-90af-7da28143b033",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "26dc4267-d9d1-4481-90af-7da28143b033",
-        "outputId": "494d8880-3d67-4cdc-f9b1-545751653d5a"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Movement features shape: (2094, 43)\n",
-            "Weak link scores shape: (2096, 17)\n",
-            "Shape after duplicate removal: (2094, 38)\n",
-            "Weakest Link class distribution:\n",
-            "WeakestLink\n",
-            "LeftArmFallForward              616\n",
-            "RightArmFallForward             458\n",
-            "RightKneeMovesOutward           274\n",
-            "RightShoulderElevation          245\n",
-            "ExcessiveForwardLean            128\n",
-            "ForwardHead                     109\n",
-            "LeftAsymmetricalWeightShift      80\n",
-            "LeftShoulderElevation            55\n",
-            "LeftKneeMovesOutward             54\n",
-            "RightKneeMovesInward             45\n",
-            "RightAsymmetricalWeightShift     20\n",
-            "LeftHeelRises                     7\n",
-            "LeftKneeMovesInward               3\n",
-            "RightHeelRises                    2\n",
-            "Name: count, dtype: int64\n"
-          ]
-        }
-      ],
-      "source": [
-        "movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))\n",
-        "weaklink_scores_df   = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))\n",
-        "\n",
-        "print('Movement features shape:', movement_features_df.shape)\n",
-        "print('Weak link scores shape:', weaklink_scores_df.shape)\n",
-        "\n",
-        "DUPLICATE_NASM_COLS = [\n",
-        "    'No_1_NASM_Deviation',\n",
-        "    'No_2_NASM_Deviation',\n",
-        "    'No_3_NASM_Deviation',\n",
-        "    'No_4_NASM_Deviation',\n",
-        "    'No_5_NASM_Deviation',\n",
-        "]\n",
-        "\n",
-        "movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)\n",
-        "print('Shape after duplicate removal:', movement_features_df.shape)\n",
-        "\n",
-        "weaklink_categories = [\n",
-        "    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',\n",
-        "    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',\n",
-        "    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',\n",
-        "    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',\n",
-        "    'RightKneeMovesOutward', 'RightShoulderElevation',\n",
-        "]\n",
-        "\n",
-        "weaklink_scores_df['WeakestLink'] = (\n",
-        "    weaklink_scores_df[weaklink_categories].idxmax(axis=1)\n",
-        ")\n",
-        "print('Weakest Link class distribution:')\n",
-        "print(weaklink_scores_df['WeakestLink'].value_counts())"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 4,
-      "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
-        "outputId": "fa4dacb3-82fd-410e-c3b2-942cd53eed8c"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Merged dataset shape: (2094, 39)\n",
-            "Feature matrix shape : (2094, 36)\n",
-            "Number of features   : 36\n",
-            "Number of classes    : 14\n"
-          ]
-        }
-      ],
-      "source": [
-        "# Merge Datasets\n",
-        "target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()\n",
-        "merged_df = movement_features_df.merge(target_df, on='ID', how='inner')\n",
-        "print('Merged dataset shape:', merged_df.shape)\n",
-        "\n",
-        "EXCLUDE_COLS    = ['ID', 'WeakestLink', 'EstimatedScore']\n",
-        "feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]\n",
-        "\n",
-        "X = merged_df[feature_columns].values\n",
-        "y = merged_df['WeakestLink'].values\n",
-        "\n",
-        "print(f'Feature matrix shape : {X.shape}')\n",
-        "print(f'Number of features   : {len(feature_columns)}')\n",
-        "print(f'Number of classes    : {len(np.unique(y))}')"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 9,
-      "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf",
-      "metadata": {
-        "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf"
-      },
-      "outputs": [],
-      "source": [
-        "C_range     = [2**i for i in range(-5, 10, 4)]\n",
-        "gamma_range = [2**i for i in range(-10, 4, 4)]\n",
-        "\n",
-        "svm_param_grid = [\n",
-        "    {'svm__kernel': ['rbf'],    'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__class_weight': ['balanced']},\n",
-        "    {'svm__kernel': ['poly'],   'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__degree': [2, 3], 'svm__class_weight': ['balanced']},\n",
-        "    {'svm__kernel': ['linear'], 'svm__C': C_range, 'svm__class_weight': ['balanced']},\n",
-        "]"
-      ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 10,
-      "id": "qBUGqPVmp-TH",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "qBUGqPVmp-TH",
-        "outputId": "f3b9186e-5f25-4b14-a380-69df6232fc2b"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Per-fold F1 : [0.5938 0.5981 0.5761 0.6399 0.6123]\n",
-            "Mean F1     : 0.6040 +/- 0.0213\n"
-          ]
-        }
-      ],
-      "source": [
-        "outer_cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)\n",
-        "inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)\n",
-        "\n",
-        "# Pipeline keeps scaler inside each fold\n",
-        "svm_pipeline = Pipeline([\n",
-        "    ('scaler', StandardScaler()),\n",
-        "    ('svm',    SVC(probability=True, random_state=RANDOM_STATE)),\n",
-        "])\n",
-        "\n",
-        "nested_svm = GridSearchCV(\n",
-        "    estimator  = svm_pipeline,\n",
-        "    param_grid = svm_param_grid,\n",
-        "    cv         = inner_cv,\n",
-        "    scoring    = 'f1_weighted',\n",
-        "    n_jobs     = -1,\n",
-        "    verbose    = 0,\n",
-        "    refit      = True,\n",
-        ")\n",
-        "nested_svm_scores = cross_val_score(\n",
-        "    nested_svm, X, y,\n",
-        "    cv      = outer_cv,\n",
-        "    scoring = 'f1_weighted',\n",
-        "    n_jobs  = -1,\n",
-        ")\n",
-        "\n",
-        "print(f'Per-fold F1 : {np.round(nested_svm_scores, 4)}')\n",
-        "print(f'Mean F1     : {nested_svm_scores.mean():.4f} +/- {nested_svm_scores.std():.4f}')"
-      ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 11,
-      "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
-        "outputId": "56380093-2371-4284-a3b5-10622ec44adc"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Running CV for Soft Voting champion\n",
-            "Per-fold F1 : [0.6316 0.6433 0.6289 0.7063 0.6331]\n",
-            "Mean F1     : 0.6486 +/- 0.0292\n"
-          ]
-        }
-      ],
-      "source": [
-        "\n",
-        "soft_voting = VotingClassifier(\n",
-        "    estimators=[\n",
-        "        ('rf',  RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',\n",
-        "                                       random_state=RANDOM_STATE, n_jobs=-1)),\n",
-        "        ('lr',  LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),\n",
-        "        ('xgb', xgb.XGBClassifier(  n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,\n",
-        "                                    colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),\n",
-        "        ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,\n",
-        "                                    random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),\n",
-        "        ('knn', KNeighborsClassifier(n_neighbors=7)),\n",
-        "        ('lda', LinearDiscriminantAnalysis()),\n",
-        "    ],\n",
-        "    voting='soft',\n",
-        "    n_jobs=-1,\n",
-        ")\n",
-        "sv_pipeline = Pipeline([\n",
-        "    ('scaler', StandardScaler()),\n",
-        "    ('voting', soft_voting),\n",
-        "])\n",
-        "\n",
-        "print('Running CV for Soft Voting champion')\n",
-        "sv_scores = cross_val_score(sv_pipeline, X, y, cv=outer_cv, scoring='f1_weighted', n_jobs=-1)\n",
-        "print(f'Per-fold F1 : {np.round(sv_scores, 4)}')\n",
-        "print(f'Mean F1     : {sv_scores.mean():.4f} +/- {sv_scores.std():.4f}')"
-      ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 12,
-      "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
-        "outputId": "3b908043-6c47-428c-f434-abcacd15da08"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "                    Model  F1_mean   F1_std vs_A5b\n",
-            "A5 Champion (Soft Voting) 0.648627 0.029224  +0.0%\n",
-            "          SVM (Nested CV) 0.604041 0.021310  -6.8%\n"
-          ]
-        }
-      ],
-      "source": [
-        "CHAMPION_F1 = 0.6484  # A5b reported score\n",
-        "\n",
-        "results = [\n",
-        "    {'Model': 'SVM (Nested CV)',           'F1_mean': nested_svm_scores.mean(), 'F1_std': nested_svm_scores.std(), '_scores': nested_svm_scores},\n",
-        "    {'Model': 'A5 Champion (Soft Voting)', 'F1_mean': sv_scores.mean(),         'F1_std': sv_scores.std(),         '_scores': sv_scores},\n",
-        "]\n",
-        "\n",
-        "results_df = pd.DataFrame([{k:v for k,v in r.items() if k != '_scores'} for r in results])\n",
-        "results_df = results_df.sort_values('F1_mean', ascending=False).reset_index(drop=True)\n",
-        "results_df['vs_A5b'] = results_df['F1_mean'].apply(lambda f: f'{(f - CHAMPION_F1)/CHAMPION_F1*100:+.1f}%')\n",
-        "print(results_df[['Model','F1_mean','F1_std','vs_A5b']].to_string(index=False))"
-      ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 13,
-      "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
-        "outputId": "8beb76a7-854d-4960-8e9d-1c88850792d5"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "SVM (Nested CV) vs A5 Champion: t=-3.913, p=0.0173  -> Significant\n"
-          ]
-        }
-      ],
-      "source": [
-        "from scipy import stats\n",
-        "\n",
-        "def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
-        "    k       = len(scores_a)\n",
-        "    diff    = scores_a - scores_b\n",
-        "    d_bar   = diff.mean()\n",
-        "    s_sq    = diff.var(ddof=1)\n",
-        "    var_corr = (1/k + n_test/n_train) * s_sq\n",
-        "    t_stat  = d_bar / np.sqrt(var_corr)\n",
-        "    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))\n",
-        "    return float(t_stat), float(p_value)\n",
-        "\n",
-        "n_total      = len(X)\n",
-        "n_test_fold  = n_total // N_SPLITS\n",
-        "n_train_fold = n_total - n_test_fold\n",
-        "\n",
-        "score_map = {r['Model']: r['_scores'] for r in results}\n",
-        "sv_f1     = score_map['A5 Champion (Soft Voting)']\n",
-        "svm_f1    = score_map['SVM (Nested CV)']\n",
-        "\n",
-        "t, p = corrected_resampled_ttest(svm_f1, sv_f1, n_train_fold, n_test_fold)\n",
-        "sig  = 'Significant' if p < 0.05 else 'Not significant'\n",
-        "print(f'SVM (Nested CV) vs A5 Champion: t={t:+.3f}, p={p:.4f}  -> {sig}')"
-      ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 14,
-      "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
-        "outputId": "bfd2d8ec-e390-43f5-99bc-bbb517f1935b"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Fitting 5 folds for each of 52 candidates, totalling 260 fits\n",
-            "Best params: {'svm__C': 8, 'svm__class_weight': 'balanced', 'svm__gamma': 0.015625, 'svm__kernel': 'rbf'}\n",
-            "Model saved to champion_svm.pkl\n"
-          ]
-        }
-      ],
-      "source": [
-        "final_pipeline = Pipeline([\n",
-        "    ('scaler', StandardScaler()),\n",
-        "    ('svm',    SVC(probability=True, random_state=RANDOM_STATE)),\n",
-        "])\n",
-        "\n",
-        "final_grid = GridSearchCV(\n",
-        "    final_pipeline, svm_param_grid,\n",
-        "    cv      = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),\n",
-        "    scoring = 'f1_weighted',\n",
-        "    n_jobs  = -1, verbose=1,\n",
-        ")\n",
-        "final_grid.fit(X, y)\n",
-        "print(f'Best params: {final_grid.best_params_}')\n",
-        "\n",
-        "with open(OUT_DIR / 'champion_svm.pkl', 'wb') as f:\n",
-        "    pickle.dump(final_grid.best_estimator_, f)\n",
-        "print('Model saved to champion_svm.pkl')"
-      ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "YLYSUEj82IXQ",
-      "metadata": {
-        "id": "YLYSUEj82IXQ"
-      },
-      "outputs": [],
-      "source": []
     }
-  ],
-  "metadata": {
     "colab": {
-      "provenance": []
     },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.10.11"
     }
   },
-  "nbformat": 4,
-  "nbformat_minor": 5
 }

 {
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983",
+   "metadata": {
+    "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pickle\n",
+    "import warnings\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from pathlib import Path\n",
+    "from scipy import stats\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from time import time\n",
+    "\n",
+    "from sklearn.model_selection import (\n",
+    "    StratifiedKFold, cross_validate\n",
+    ")\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.metrics import (\n",
+    "    accuracy_score, precision_score, recall_score, f1_score,\n",
+    "    classification_report, confusion_matrix\n",
+    ")\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "from sklearn.ensemble import (\n",
+    "    RandomForestClassifier,\n",
+    "    VotingClassifier,\n",
+    "    BaggingClassifier,\n",
+    "    StackingClassifier,\n",
+    ")\n",
+    "import xgboost as xgb\n",
+    "import lightgbm as lgb\n",
+    "import pickle\n",
+    "warnings.filterwarnings('ignore')\n",
+    "np.random.seed(42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d",
+   "metadata": {
+    "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d"
+   },
+   "outputs": [],
+   "source": [
+    "REPO_ROOT    = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
+    "DATA_DIR     = os.path.join(REPO_ROOT, 'Datasets_all')\n",
+    "OUT_DIR      = Path('models')\n",
+    "OUT_DIR.mkdir(exist_ok=True)\n",
+    "\n",
+    "RANDOM_STATE = 42\n",
+    "N_SPLITS     = 5\n",
+    "CHAMPION_F1  = 0.6484   # Score from A5b"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "26dc4267-d9d1-4481-90af-7da28143b033",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "26dc4267-d9d1-4481-90af-7da28143b033",
+    "outputId": "494d8880-3d67-4cdc-f9b1-545751653d5a"
+   },
+   "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Movement features shape: (2094, 43)\n",
+      "Weak link scores shape: (2096, 17)\n",
+      "Shape after duplicate removal: (2094, 38)\n",
+      "Weakest Link class distribution:\n",
+      "WeakestLink\n",
+      "LeftArmFallForward              616\n",
+      "RightArmFallForward             458\n",
+      "RightKneeMovesOutward           274\n",
+      "RightShoulderElevation          245\n",
+      "ExcessiveForwardLean            128\n",
+      "ForwardHead                     109\n",
+      "LeftAsymmetricalWeightShift      80\n",
+      "LeftShoulderElevation            55\n",
+      "LeftKneeMovesOutward             54\n",
+      "RightKneeMovesInward             45\n",
+      "RightAsymmetricalWeightShift     20\n",
+      "LeftHeelRises                     7\n",
+      "LeftKneeMovesInward               3\n",
+      "RightHeelRises                    2\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))\n",
+    "weaklink_scores_df   = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))\n",
+    "\n",
+    "print('Movement features shape:', movement_features_df.shape)\n",
+    "print('Weak link scores shape:', weaklink_scores_df.shape)\n",
+    "\n",
+    "DUPLICATE_NASM_COLS = [\n",
+    "    'No_1_NASM_Deviation',\n",
+    "    'No_2_NASM_Deviation',\n",
+    "    'No_3_NASM_Deviation',\n",
+    "    'No_4_NASM_Deviation',\n",
+    "    'No_5_NASM_Deviation',\n",
+    "]\n",
+    "\n",
+    "movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)\n",
+    "print('Shape after duplicate removal:', movement_features_df.shape)\n",
+    "\n",
+    "weaklink_categories = [\n",
+    "    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',\n",
+    "    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',\n",
+    "    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',\n",
+    "    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',\n",
+    "    'RightKneeMovesOutward', 'RightShoulderElevation',\n",
+    "]\n",
+    "\n",
+    "weaklink_scores_df['WeakestLink'] = (\n",
+    "    weaklink_scores_df[weaklink_categories].idxmax(axis=1)\n",
+    ")\n",
+    "print('Weakest Link class distribution:')\n",
+    "print(weaklink_scores_df['WeakestLink'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
+    "outputId": "fa4dacb3-82fd-410e-c3b2-942cd53eed8c"
+   },
+   "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Merged dataset shape: (2094, 39)\n",
+      "Feature matrix shape : (2094, 36)\n",
+      "Number of features   : 36\n",
+      "Number of classes    : 14\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Merge Datasets\n",
+    "target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()\n",
+    "merged_df = movement_features_df.merge(target_df, on='ID', how='inner')\n",
+    "print('Merged dataset shape:', merged_df.shape)\n",
+    "\n",
+    "EXCLUDE_COLS    = ['ID', 'WeakestLink', 'EstimatedScore']\n",
+    "feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]\n",
+    "\n",
+    "X = merged_df[feature_columns].values\n",
+    "y = merged_df['WeakestLink'].values\n",
+    "\n",
+    "print(f'Feature matrix shape : {X.shape}')\n",
+    "print(f'Number of features   : {len(feature_columns)}')\n",
+    "print(f'Number of classes    : {len(np.unique(y))}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf",
+   "metadata": {
+    "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf"
+   },
+   "outputs": [],
+   "source": [
+    "C_range     = [2**i for i in range(-5, 10, 4)]\n",
+    "gamma_range = [2**i for i in range(-10, 4, 4)]\n",
+    "\n",
+    "svm_param_grid = [\n",
+    "    {'svm__kernel': ['rbf'],    'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__class_weight': ['balanced']},\n",
+    "    {'svm__kernel': ['poly'],   'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__degree': [2, 3], 'svm__class_weight': ['balanced']},\n",
+    "    {'svm__kernel': ['linear'], 'svm__C': C_range, 'svm__class_weight': ['balanced']},\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "qBUGqPVmp-TH",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "qBUGqPVmp-TH",
+    "outputId": "f3b9186e-5f25-4b14-a380-69df6232fc2b"
+   },
+   "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Per-fold F1 : [0.5938 0.5981 0.5761 0.6399 0.6123]\n",
+      "Mean F1     : 0.6040 +/- 0.0213\n"
+     ]
+    }
+   ],
+   "source": [
+    "outer_cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)\n",
+    "inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)\n",
+    "\n",
+    "# Pipeline keeps scaler inside each fold\n",
+    "svm_pipeline = Pipeline([\n",
+    "    ('scaler', StandardScaler()),\n",
+    "    ('svm',    SVC(probability=True, random_state=RANDOM_STATE)),\n",
+    "])\n",
+    "\n",
+    "nested_svm = GridSearchCV(\n",
+    "    estimator  = svm_pipeline,\n",
+    "    param_grid = svm_param_grid,\n",
+    "    cv         = inner_cv,\n",
+    "    scoring    = 'f1_weighted',\n",
+    "    n_jobs     = -1,\n",
+    "    verbose    = 0,\n",
+    "    refit      = True,\n",
+    ")\n",
+    "nested_svm_scores = cross_val_score(\n",
+    "    nested_svm, X, y,\n",
+    "    cv      = outer_cv,\n",
+    "    scoring = 'f1_weighted',\n",
+    "    n_jobs  = -1,\n",
+    ")\n",
+    "\n",
+    "print(f'Per-fold F1 : {np.round(nested_svm_scores, 4)}')\n",
+    "print(f'Mean F1     : {nested_svm_scores.mean():.4f} +/- {nested_svm_scores.std():.4f}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
+    "outputId": "56380093-2371-4284-a3b5-10622ec44adc"
+   },
+   "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running CV for Soft Voting champion\n",
+      "Per-fold F1 : [0.6316 0.6433 0.6289 0.7063 0.6331]\n",
+      "Mean F1     : 0.6486 +/- 0.0292\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "soft_voting = VotingClassifier(\n",
+    "    estimators=[\n",
+    "        ('rf',  RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',\n",
+    "                                       random_state=RANDOM_STATE, n_jobs=-1)),\n",
+    "        ('lr',  LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),\n",
+    "        ('xgb', xgb.XGBClassifier(  n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,\n",
+    "                                    colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),\n",
+    "        ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,\n",
+    "                                    random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),\n",
+    "        ('knn', KNeighborsClassifier(n_neighbors=7)),\n",
+    "        ('lda', LinearDiscriminantAnalysis()),\n",
+    "    ],\n",
+    "    voting='soft',\n",
+    "    n_jobs=-1,\n",
+    ")\n",
+    "sv_pipeline = Pipeline([\n",
+    "    ('scaler', StandardScaler()),\n",
+    "    ('voting', soft_voting),\n",
+    "])\n",
+    "\n",
+    "print('Running CV for Soft Voting champion')\n",
+    "sv_scores = cross_val_score(sv_pipeline, X, y, cv=outer_cv, scoring='f1_weighted', n_jobs=-1)\n",
+    "print(f'Per-fold F1 : {np.round(sv_scores, 4)}')\n",
+    "print(f'Mean F1     : {sv_scores.mean():.4f} +/- {sv_scores.std():.4f}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
+    "outputId": "3b908043-6c47-428c-f434-abcacd15da08"
+   },
+   "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                    Model  F1_mean   F1_std vs_A5b\n",
+      "A5 Champion (Soft Voting) 0.648627 0.029224  +0.0%\n",
+      "          SVM (Nested CV) 0.604041 0.021310  -6.8%\n"
+     ]
+    }
+   ],
+   "source": [
+    "CHAMPION_F1 = 0.6484  # A5b reported score\n",
+    "\n",
+    "results = [\n",
+    "    {'Model': 'SVM (Nested CV)',           'F1_mean': nested_svm_scores.mean(), 'F1_std': nested_svm_scores.std(), '_scores': nested_svm_scores},\n",
+    "    {'Model': 'A5 Champion (Soft Voting)', 'F1_mean': sv_scores.mean(),         'F1_std': sv_scores.std(),         '_scores': sv_scores},\n",
+    "]\n",
+    "\n",
+    "results_df = pd.DataFrame([{k:v for k,v in r.items() if k != '_scores'} for r in results])\n",
+    "results_df = results_df.sort_values('F1_mean', ascending=False).reset_index(drop=True)\n",
+    "results_df['vs_A5b'] = results_df['F1_mean'].apply(lambda f: f'{(f - CHAMPION_F1)/CHAMPION_F1*100:+.1f}%')\n",
+    "print(results_df[['Model','F1_mean','F1_std','vs_A5b']].to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
+    "outputId": "8beb76a7-854d-4960-8e9d-1c88850792d5"
+   },
+   "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SVM (Nested CV) vs A5 Champion: t=-3.913, p=0.0173  -> Significant\n"
+     ]
     }
+   ],
+   "source": [
+    "from scipy import stats\n",
+    "\n",
+    "def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
+    "    k       = len(scores_a)\n",
+    "    diff    = scores_a - scores_b\n",
+    "    d_bar   = diff.mean()\n",
+    "    s_sq    = diff.var(ddof=1)\n",
+    "    var_corr = (1/k + n_test/n_train) * s_sq\n",
+    "    t_stat  = d_bar / np.sqrt(var_corr)\n",
+    "    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))\n",
+    "    return float(t_stat), float(p_value)\n",
+    "\n",
+    "n_total      = len(X)\n",
+    "n_test_fold  = n_total // N_SPLITS\n",
+    "n_train_fold = n_total - n_test_fold\n",
+    "\n",
+    "score_map = {r['Model']: r['_scores'] for r in results}\n",
+    "sv_f1     = score_map['A5 Champion (Soft Voting)']\n",
+    "svm_f1    = score_map['SVM (Nested CV)']\n",
+    "\n",
+    "t, p = corrected_resampled_ttest(svm_f1, sv_f1, n_train_fold, n_test_fold)\n",
+    "sig  = 'Significant' if p < 0.05 else 'Not significant'\n",
+    "print(f'SVM (Nested CV) vs A5 Champion: t={t:+.3f}, p={p:.4f}  -> {sig}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
+   "metadata": {
     "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
+    "outputId": "bfd2d8ec-e390-43f5-99bc-bbb517f1935b"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fitting 5 folds for each of 52 candidates, totalling 260 fits\n",
+      "Best params: {'svm__C': 8, 'svm__class_weight': 'balanced', 'svm__gamma': 0.015625, 'svm__kernel': 'rbf'}\n",
+      "Model saved to champion_svm.pkl\n"
+     ]
     }
+   ],
+   "source": [
+    "final_pipeline = Pipeline([\n",
+    "    ('scaler', StandardScaler()),\n",
+    "    ('svm',    SVC(probability=True, random_state=RANDOM_STATE)),\n",
+    "])\n",
+    "\n",
+    "final_grid = GridSearchCV(\n",
+    "    final_pipeline, svm_param_grid,\n",
+    "    cv      = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),\n",
+    "    scoring = 'f1_weighted',\n",
+    "    n_jobs  = -1, verbose=1,\n",
+    ")\n",
+    "final_grid.fit(X, y)\n",
+    "print(f'Best params: {final_grid.best_params_}')\n",
+    "\n",
+    "with open(OUT_DIR / 'champion_svm.pkl', 'wb') as f:\n",
+    "    pickle.dump(final_grid.best_estimator_, f)\n",
+    "print('Model saved to champion_svm.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "YLYSUEj82IXQ",
+   "metadata": {
+    "id": "YLYSUEj82IXQ"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
   },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
 }

A7/A7_PCA_with_clusters.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

A7/A7_Report.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff