Reem committed on
Commit
08843cc
·
1 Parent(s): 216ab79

pca-analysis-&-cluster-visualization

Browse files
A6/A6_Classification.ipynb CHANGED
@@ -1,461 +1,462 @@
1
  {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983",
7
- "metadata": {
8
- "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983"
9
- },
10
- "outputs": [],
11
- "source": [
12
- "import os\n",
13
- "import pickle\n",
14
- "import warnings\n",
15
- "import numpy as np\n",
16
- "import pandas as pd\n",
17
- "import matplotlib.pyplot as plt\n",
18
- "import seaborn as sns\n",
19
- "from pathlib import Path\n",
20
- "from scipy import stats\n",
21
- "from sklearn.svm import SVC\n",
22
- "from sklearn.model_selection import GridSearchCV\n",
23
- "from time import time\n",
24
- "\n",
25
- "from sklearn.model_selection import (\n",
26
- " StratifiedKFold, cross_validate\n",
27
- ")\n",
28
- "from sklearn.pipeline import Pipeline\n",
29
- "from sklearn.model_selection import cross_val_score\n",
30
- "from sklearn.preprocessing import StandardScaler\n",
31
- "from sklearn.metrics import (\n",
32
- " accuracy_score, precision_score, recall_score, f1_score,\n",
33
- " classification_report, confusion_matrix\n",
34
- ")\n",
35
- "from sklearn.linear_model import LogisticRegression\n",
36
- "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
37
- "from sklearn.neighbors import KNeighborsClassifier\n",
38
- "from sklearn.naive_bayes import GaussianNB\n",
39
- "from sklearn.ensemble import (\n",
40
- " RandomForestClassifier,\n",
41
- " VotingClassifier,\n",
42
- " BaggingClassifier,\n",
43
- " StackingClassifier,\n",
44
- ")\n",
45
- "import xgboost as xgb\n",
46
- "import lightgbm as lgb\n",
47
- "import pickle\n",
48
- "warnings.filterwarnings('ignore')\n",
49
- "np.random.seed(42)"
50
- ]
51
- },
52
- {
53
- "cell_type": "code",
54
- "execution_count": null,
55
- "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d",
56
- "metadata": {
57
- "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d"
58
- },
59
- "outputs": [],
60
- "source": [
61
- "REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
62
- "DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')\n",
63
- "OUT_DIR = Path('models')\n",
64
- "OUT_DIR.mkdir(exist_ok=True)\n",
65
- "\n",
66
- "RANDOM_STATE = 42\n",
67
- "N_SPLITS = 5\n",
68
- "CHAMPION_F1 = 0.6484 # Score from A5b"
69
- ]
70
- },
71
- {
72
- "cell_type": "code",
73
- "execution_count": 3,
74
- "id": "26dc4267-d9d1-4481-90af-7da28143b033",
75
- "metadata": {
76
- "colab": {
77
- "base_uri": "https://localhost:8080/"
78
- },
79
- "id": "26dc4267-d9d1-4481-90af-7da28143b033",
80
- "outputId": "494d8880-3d67-4cdc-f9b1-545751653d5a"
81
- },
82
- "outputs": [
83
- {
84
- "name": "stdout",
85
- "output_type": "stream",
86
- "text": [
87
- "Movement features shape: (2094, 43)\n",
88
- "Weak link scores shape: (2096, 17)\n",
89
- "Shape after duplicate removal: (2094, 38)\n",
90
- "Weakest Link class distribution:\n",
91
- "WeakestLink\n",
92
- "LeftArmFallForward 616\n",
93
- "RightArmFallForward 458\n",
94
- "RightKneeMovesOutward 274\n",
95
- "RightShoulderElevation 245\n",
96
- "ExcessiveForwardLean 128\n",
97
- "ForwardHead 109\n",
98
- "LeftAsymmetricalWeightShift 80\n",
99
- "LeftShoulderElevation 55\n",
100
- "LeftKneeMovesOutward 54\n",
101
- "RightKneeMovesInward 45\n",
102
- "RightAsymmetricalWeightShift 20\n",
103
- "LeftHeelRises 7\n",
104
- "LeftKneeMovesInward 3\n",
105
- "RightHeelRises 2\n",
106
- "Name: count, dtype: int64\n"
107
- ]
108
- }
109
- ],
110
- "source": [
111
- "movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))\n",
112
- "weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))\n",
113
- "\n",
114
- "print('Movement features shape:', movement_features_df.shape)\n",
115
- "print('Weak link scores shape:', weaklink_scores_df.shape)\n",
116
- "\n",
117
- "DUPLICATE_NASM_COLS = [\n",
118
- " 'No_1_NASM_Deviation',\n",
119
- " 'No_2_NASM_Deviation',\n",
120
- " 'No_3_NASM_Deviation',\n",
121
- " 'No_4_NASM_Deviation',\n",
122
- " 'No_5_NASM_Deviation',\n",
123
- "]\n",
124
- "\n",
125
- "movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)\n",
126
- "print('Shape after duplicate removal:', movement_features_df.shape)\n",
127
- "\n",
128
- "weaklink_categories = [\n",
129
- " 'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',\n",
130
- " 'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',\n",
131
- " 'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',\n",
132
- " 'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',\n",
133
- " 'RightKneeMovesOutward', 'RightShoulderElevation',\n",
134
- "]\n",
135
- "\n",
136
- "weaklink_scores_df['WeakestLink'] = (\n",
137
- " weaklink_scores_df[weaklink_categories].idxmax(axis=1)\n",
138
- ")\n",
139
- "print('Weakest Link class distribution:')\n",
140
- "print(weaklink_scores_df['WeakestLink'].value_counts())"
141
- ]
142
- },
143
- {
144
- "cell_type": "code",
145
- "execution_count": 4,
146
- "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
147
- "metadata": {
148
- "colab": {
149
- "base_uri": "https://localhost:8080/"
150
- },
151
- "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
152
- "outputId": "fa4dacb3-82fd-410e-c3b2-942cd53eed8c"
153
- },
154
- "outputs": [
155
- {
156
- "name": "stdout",
157
- "output_type": "stream",
158
- "text": [
159
- "Merged dataset shape: (2094, 39)\n",
160
- "Feature matrix shape : (2094, 36)\n",
161
- "Number of features : 36\n",
162
- "Number of classes : 14\n"
163
- ]
164
- }
165
- ],
166
- "source": [
167
- "# Merge Datasets\n",
168
- "target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()\n",
169
- "merged_df = movement_features_df.merge(target_df, on='ID', how='inner')\n",
170
- "print('Merged dataset shape:', merged_df.shape)\n",
171
- "\n",
172
- "EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']\n",
173
- "feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]\n",
174
- "\n",
175
- "X = merged_df[feature_columns].values\n",
176
- "y = merged_df['WeakestLink'].values\n",
177
- "\n",
178
- "print(f'Feature matrix shape : {X.shape}')\n",
179
- "print(f'Number of features : {len(feature_columns)}')\n",
180
- "print(f'Number of classes : {len(np.unique(y))}')"
181
- ]
182
- },
183
- {
184
- "cell_type": "code",
185
- "execution_count": 9,
186
- "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf",
187
- "metadata": {
188
- "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf"
189
- },
190
- "outputs": [],
191
- "source": [
192
- "C_range = [2**i for i in range(-5, 10, 4)]\n",
193
- "gamma_range = [2**i for i in range(-10, 4, 4)]\n",
194
- "\n",
195
- "svm_param_grid = [\n",
196
- " {'svm__kernel': ['rbf'], 'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__class_weight': ['balanced']},\n",
197
- " {'svm__kernel': ['poly'], 'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__degree': [2, 3], 'svm__class_weight': ['balanced']},\n",
198
- " {'svm__kernel': ['linear'], 'svm__C': C_range, 'svm__class_weight': ['balanced']},\n",
199
- "]"
200
- ]
201
  },
 
 
 
 
202
  {
203
- "cell_type": "code",
204
- "execution_count": 10,
205
- "id": "qBUGqPVmp-TH",
206
- "metadata": {
207
- "colab": {
208
- "base_uri": "https://localhost:8080/"
209
- },
210
- "id": "qBUGqPVmp-TH",
211
- "outputId": "f3b9186e-5f25-4b14-a380-69df6232fc2b"
212
- },
213
- "outputs": [
214
- {
215
- "name": "stdout",
216
- "output_type": "stream",
217
- "text": [
218
- "Per-fold F1 : [0.5938 0.5981 0.5761 0.6399 0.6123]\n",
219
- "Mean F1 : 0.6040 +/- 0.0213\n"
220
- ]
221
- }
222
- ],
223
- "source": [
224
- "outer_cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)\n",
225
- "inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)\n",
226
- "\n",
227
- "# Pipeline keeps scaler inside each fold\n",
228
- "svm_pipeline = Pipeline([\n",
229
- " ('scaler', StandardScaler()),\n",
230
- " ('svm', SVC(probability=True, random_state=RANDOM_STATE)),\n",
231
- "])\n",
232
- "\n",
233
- "nested_svm = GridSearchCV(\n",
234
- " estimator = svm_pipeline,\n",
235
- " param_grid = svm_param_grid,\n",
236
- " cv = inner_cv,\n",
237
- " scoring = 'f1_weighted',\n",
238
- " n_jobs = -1,\n",
239
- " verbose = 0,\n",
240
- " refit = True,\n",
241
- ")\n",
242
- "nested_svm_scores = cross_val_score(\n",
243
- " nested_svm, X, y,\n",
244
- " cv = outer_cv,\n",
245
- " scoring = 'f1_weighted',\n",
246
- " n_jobs = -1,\n",
247
- ")\n",
248
- "\n",
249
- "print(f'Per-fold F1 : {np.round(nested_svm_scores, 4)}')\n",
250
- "print(f'Mean F1 : {nested_svm_scores.mean():.4f} +/- {nested_svm_scores.std():.4f}')"
251
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  },
 
 
 
 
253
  {
254
- "cell_type": "code",
255
- "execution_count": 11,
256
- "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
257
- "metadata": {
258
- "colab": {
259
- "base_uri": "https://localhost:8080/"
260
- },
261
- "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
262
- "outputId": "56380093-2371-4284-a3b5-10622ec44adc"
263
- },
264
- "outputs": [
265
- {
266
- "name": "stdout",
267
- "output_type": "stream",
268
- "text": [
269
- "Running CV for Soft Voting champion\n",
270
- "Per-fold F1 : [0.6316 0.6433 0.6289 0.7063 0.6331]\n",
271
- "Mean F1 : 0.6486 +/- 0.0292\n"
272
- ]
273
- }
274
- ],
275
- "source": [
276
- "\n",
277
- "soft_voting = VotingClassifier(\n",
278
- " estimators=[\n",
279
- " ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',\n",
280
- " random_state=RANDOM_STATE, n_jobs=-1)),\n",
281
- " ('lr', LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),\n",
282
- " ('xgb', xgb.XGBClassifier( n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,\n",
283
- " colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),\n",
284
- " ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,\n",
285
- " random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),\n",
286
- " ('knn', KNeighborsClassifier(n_neighbors=7)),\n",
287
- " ('lda', LinearDiscriminantAnalysis()),\n",
288
- " ],\n",
289
- " voting='soft',\n",
290
- " n_jobs=-1,\n",
291
- ")\n",
292
- "sv_pipeline = Pipeline([\n",
293
- " ('scaler', StandardScaler()),\n",
294
- " ('voting', soft_voting),\n",
295
- "])\n",
296
- "\n",
297
- "print('Running CV for Soft Voting champion')\n",
298
- "sv_scores = cross_val_score(sv_pipeline, X, y, cv=outer_cv, scoring='f1_weighted', n_jobs=-1)\n",
299
- "print(f'Per-fold F1 : {np.round(sv_scores, 4)}')\n",
300
- "print(f'Mean F1 : {sv_scores.mean():.4f} +/- {sv_scores.std():.4f}')"
301
- ]
 
 
 
 
 
302
  },
 
 
 
 
303
  {
304
- "cell_type": "code",
305
- "execution_count": 12,
306
- "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
307
- "metadata": {
308
- "colab": {
309
- "base_uri": "https://localhost:8080/"
310
- },
311
- "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
312
- "outputId": "3b908043-6c47-428c-f434-abcacd15da08"
313
- },
314
- "outputs": [
315
- {
316
- "name": "stdout",
317
- "output_type": "stream",
318
- "text": [
319
- " Model F1_mean F1_std vs_A5b\n",
320
- "A5 Champion (Soft Voting) 0.648627 0.029224 +0.0%\n",
321
- " SVM (Nested CV) 0.604041 0.021310 -6.8%\n"
322
- ]
323
- }
324
- ],
325
- "source": [
326
- "CHAMPION_F1 = 0.6484 # A5b reported score\n",
327
- "\n",
328
- "results = [\n",
329
- " {'Model': 'SVM (Nested CV)', 'F1_mean': nested_svm_scores.mean(), 'F1_std': nested_svm_scores.std(), '_scores': nested_svm_scores},\n",
330
- " {'Model': 'A5 Champion (Soft Voting)', 'F1_mean': sv_scores.mean(), 'F1_std': sv_scores.std(), '_scores': sv_scores},\n",
331
- "]\n",
332
- "\n",
333
- "results_df = pd.DataFrame([{k:v for k,v in r.items() if k != '_scores'} for r in results])\n",
334
- "results_df = results_df.sort_values('F1_mean', ascending=False).reset_index(drop=True)\n",
335
- "results_df['vs_A5b'] = results_df['F1_mean'].apply(lambda f: f'{(f - CHAMPION_F1)/CHAMPION_F1*100:+.1f}%')\n",
336
- "print(results_df[['Model','F1_mean','F1_std','vs_A5b']].to_string(index=False))"
337
- ]
 
 
 
 
 
 
 
 
 
 
 
338
  },
 
 
 
 
339
  {
340
- "cell_type": "code",
341
- "execution_count": 13,
342
- "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
343
- "metadata": {
344
- "colab": {
345
- "base_uri": "https://localhost:8080/"
346
- },
347
- "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
348
- "outputId": "8beb76a7-854d-4960-8e9d-1c88850792d5"
349
- },
350
- "outputs": [
351
- {
352
- "name": "stdout",
353
- "output_type": "stream",
354
- "text": [
355
- "SVM (Nested CV) vs A5 Champion: t=-3.913, p=0.0173 -> Significant\n"
356
- ]
357
- }
358
- ],
359
- "source": [
360
- "from scipy import stats\n",
361
- "\n",
362
- "def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
363
- " k = len(scores_a)\n",
364
- " diff = scores_a - scores_b\n",
365
- " d_bar = diff.mean()\n",
366
- " s_sq = diff.var(ddof=1)\n",
367
- " var_corr = (1/k + n_test/n_train) * s_sq\n",
368
- " t_stat = d_bar / np.sqrt(var_corr)\n",
369
- " p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))\n",
370
- " return float(t_stat), float(p_value)\n",
371
- "\n",
372
- "n_total = len(X)\n",
373
- "n_test_fold = n_total // N_SPLITS\n",
374
- "n_train_fold = n_total - n_test_fold\n",
375
- "\n",
376
- "score_map = {r['Model']: r['_scores'] for r in results}\n",
377
- "sv_f1 = score_map['A5 Champion (Soft Voting)']\n",
378
- "svm_f1 = score_map['SVM (Nested CV)']\n",
379
- "\n",
380
- "t, p = corrected_resampled_ttest(svm_f1, sv_f1, n_train_fold, n_test_fold)\n",
381
- "sig = 'Significant' if p < 0.05 else 'Not significant'\n",
382
- "print(f'SVM (Nested CV) vs A5 Champion: t={t:+.3f}, p={p:.4f} -> {sig}')"
383
- ]
384
  },
 
 
 
 
385
  {
386
- "cell_type": "code",
387
- "execution_count": 14,
388
- "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
389
- "metadata": {
390
- "colab": {
391
- "base_uri": "https://localhost:8080/"
392
- },
393
- "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
394
- "outputId": "bfd2d8ec-e390-43f5-99bc-bbb517f1935b"
395
- },
396
- "outputs": [
397
- {
398
- "name": "stdout",
399
- "output_type": "stream",
400
- "text": [
401
- "Fitting 5 folds for each of 52 candidates, totalling 260 fits\n",
402
- "Best params: {'svm__C': 8, 'svm__class_weight': 'balanced', 'svm__gamma': 0.015625, 'svm__kernel': 'rbf'}\n",
403
- "Model saved to champion_svm.pkl\n"
404
- ]
405
- }
406
- ],
407
- "source": [
408
- "final_pipeline = Pipeline([\n",
409
- " ('scaler', StandardScaler()),\n",
410
- " ('svm', SVC(probability=True, random_state=RANDOM_STATE)),\n",
411
- "])\n",
412
- "\n",
413
- "final_grid = GridSearchCV(\n",
414
- " final_pipeline, svm_param_grid,\n",
415
- " cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),\n",
416
- " scoring = 'f1_weighted',\n",
417
- " n_jobs = -1, verbose=1,\n",
418
- ")\n",
419
- "final_grid.fit(X, y)\n",
420
- "print(f'Best params: {final_grid.best_params_}')\n",
421
- "\n",
422
- "with open(OUT_DIR / 'champion_svm.pkl', 'wb') as f:\n",
423
- " pickle.dump(final_grid.best_estimator_, f)\n",
424
- "print('Model saved to champion_svm.pkl')"
425
- ]
426
  },
 
 
 
 
427
  {
428
- "cell_type": "code",
429
- "execution_count": null,
430
- "id": "YLYSUEj82IXQ",
431
- "metadata": {
432
- "id": "YLYSUEj82IXQ"
433
- },
434
- "outputs": [],
435
- "source": []
436
  }
437
- ],
438
- "metadata": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  "colab": {
440
- "provenance": []
441
  },
442
- "kernelspec": {
443
- "display_name": "Python 3",
444
- "name": "python3"
445
- },
446
- "language_info": {
447
- "codemirror_mode": {
448
- "name": "ipython",
449
- "version": 3
450
- },
451
- "file_extension": ".py",
452
- "mimetype": "text/x-python",
453
- "name": "python",
454
- "nbconvert_exporter": "python",
455
- "pygments_lexer": "ipython3",
456
- "version": "3.10.11"
457
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
  },
459
- "nbformat": 4,
460
- "nbformat_minor": 5
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  }
 
1
  {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983",
7
+ "metadata": {
8
+ "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983"
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "import os\n",
13
+ "import pickle\n",
14
+ "import warnings\n",
15
+ "import numpy as np\n",
16
+ "import pandas as pd\n",
17
+ "import matplotlib.pyplot as plt\n",
18
+ "import seaborn as sns\n",
19
+ "from pathlib import Path\n",
20
+ "from scipy import stats\n",
21
+ "from sklearn.svm import SVC\n",
22
+ "from sklearn.model_selection import GridSearchCV\n",
23
+ "from time import time\n",
24
+ "\n",
25
+ "from sklearn.model_selection import (\n",
26
+ " StratifiedKFold, cross_validate\n",
27
+ ")\n",
28
+ "from sklearn.pipeline import Pipeline\n",
29
+ "from sklearn.model_selection import cross_val_score\n",
30
+ "from sklearn.preprocessing import StandardScaler\n",
31
+ "from sklearn.metrics import (\n",
32
+ " accuracy_score, precision_score, recall_score, f1_score,\n",
33
+ " classification_report, confusion_matrix\n",
34
+ ")\n",
35
+ "from sklearn.linear_model import LogisticRegression\n",
36
+ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
37
+ "from sklearn.neighbors import KNeighborsClassifier\n",
38
+ "from sklearn.naive_bayes import GaussianNB\n",
39
+ "from sklearn.ensemble import (\n",
40
+ " RandomForestClassifier,\n",
41
+ " VotingClassifier,\n",
42
+ " BaggingClassifier,\n",
43
+ " StackingClassifier,\n",
44
+ ")\n",
45
+ "import xgboost as xgb\n",
46
+ "import lightgbm as lgb\n",
47
+ "import pickle\n",
48
+ "warnings.filterwarnings('ignore')\n",
49
+ "np.random.seed(42)"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d",
56
+ "metadata": {
57
+ "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d"
58
+ },
59
+ "outputs": [],
60
+ "source": [
61
+ "REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
62
+ "DATA_DIR = os.path.join(REPO_ROOT, 'Datasets_all')\n",
63
+ "OUT_DIR = Path('models')\n",
64
+ "OUT_DIR.mkdir(exist_ok=True)\n",
65
+ "\n",
66
+ "RANDOM_STATE = 42\n",
67
+ "N_SPLITS = 5\n",
68
+ "CHAMPION_F1 = 0.6484 # Score from A5b"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 3,
74
+ "id": "26dc4267-d9d1-4481-90af-7da28143b033",
75
+ "metadata": {
76
+ "colab": {
77
+ "base_uri": "https://localhost:8080/"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  },
79
+ "id": "26dc4267-d9d1-4481-90af-7da28143b033",
80
+ "outputId": "494d8880-3d67-4cdc-f9b1-545751653d5a"
81
+ },
82
+ "outputs": [
83
  {
84
+ "name": "stdout",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "Movement features shape: (2094, 43)\n",
88
+ "Weak link scores shape: (2096, 17)\n",
89
+ "Shape after duplicate removal: (2094, 38)\n",
90
+ "Weakest Link class distribution:\n",
91
+ "WeakestLink\n",
92
+ "LeftArmFallForward 616\n",
93
+ "RightArmFallForward 458\n",
94
+ "RightKneeMovesOutward 274\n",
95
+ "RightShoulderElevation 245\n",
96
+ "ExcessiveForwardLean 128\n",
97
+ "ForwardHead 109\n",
98
+ "LeftAsymmetricalWeightShift 80\n",
99
+ "LeftShoulderElevation 55\n",
100
+ "LeftKneeMovesOutward 54\n",
101
+ "RightKneeMovesInward 45\n",
102
+ "RightAsymmetricalWeightShift 20\n",
103
+ "LeftHeelRises 7\n",
104
+ "LeftKneeMovesInward 3\n",
105
+ "RightHeelRises 2\n",
106
+ "Name: count, dtype: int64\n"
107
+ ]
108
+ }
109
+ ],
110
+ "source": [
111
+ "movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))\n",
112
+ "weaklink_scores_df = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))\n",
113
+ "\n",
114
+ "print('Movement features shape:', movement_features_df.shape)\n",
115
+ "print('Weak link scores shape:', weaklink_scores_df.shape)\n",
116
+ "\n",
117
+ "DUPLICATE_NASM_COLS = [\n",
118
+ " 'No_1_NASM_Deviation',\n",
119
+ " 'No_2_NASM_Deviation',\n",
120
+ " 'No_3_NASM_Deviation',\n",
121
+ " 'No_4_NASM_Deviation',\n",
122
+ " 'No_5_NASM_Deviation',\n",
123
+ "]\n",
124
+ "\n",
125
+ "movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)\n",
126
+ "print('Shape after duplicate removal:', movement_features_df.shape)\n",
127
+ "\n",
128
+ "weaklink_categories = [\n",
129
+ " 'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',\n",
130
+ " 'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',\n",
131
+ " 'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',\n",
132
+ " 'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',\n",
133
+ " 'RightKneeMovesOutward', 'RightShoulderElevation',\n",
134
+ "]\n",
135
+ "\n",
136
+ "weaklink_scores_df['WeakestLink'] = (\n",
137
+ " weaklink_scores_df[weaklink_categories].idxmax(axis=1)\n",
138
+ ")\n",
139
+ "print('Weakest Link class distribution:')\n",
140
+ "print(weaklink_scores_df['WeakestLink'].value_counts())"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 4,
146
+ "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
147
+ "metadata": {
148
+ "colab": {
149
+ "base_uri": "https://localhost:8080/"
150
  },
151
+ "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
152
+ "outputId": "fa4dacb3-82fd-410e-c3b2-942cd53eed8c"
153
+ },
154
+ "outputs": [
155
  {
156
+ "name": "stdout",
157
+ "output_type": "stream",
158
+ "text": [
159
+ "Merged dataset shape: (2094, 39)\n",
160
+ "Feature matrix shape : (2094, 36)\n",
161
+ "Number of features : 36\n",
162
+ "Number of classes : 14\n"
163
+ ]
164
+ }
165
+ ],
166
+ "source": [
167
+ "# Merge Datasets\n",
168
+ "target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()\n",
169
+ "merged_df = movement_features_df.merge(target_df, on='ID', how='inner')\n",
170
+ "print('Merged dataset shape:', merged_df.shape)\n",
171
+ "\n",
172
+ "EXCLUDE_COLS = ['ID', 'WeakestLink', 'EstimatedScore']\n",
173
+ "feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]\n",
174
+ "\n",
175
+ "X = merged_df[feature_columns].values\n",
176
+ "y = merged_df['WeakestLink'].values\n",
177
+ "\n",
178
+ "print(f'Feature matrix shape : {X.shape}')\n",
179
+ "print(f'Number of features : {len(feature_columns)}')\n",
180
+ "print(f'Number of classes : {len(np.unique(y))}')"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": 9,
186
+ "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf",
187
+ "metadata": {
188
+ "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf"
189
+ },
190
+ "outputs": [],
191
+ "source": [
192
+ "C_range = [2**i for i in range(-5, 10, 4)]\n",
193
+ "gamma_range = [2**i for i in range(-10, 4, 4)]\n",
194
+ "\n",
195
+ "svm_param_grid = [\n",
196
+ " {'svm__kernel': ['rbf'], 'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__class_weight': ['balanced']},\n",
197
+ " {'svm__kernel': ['poly'], 'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__degree': [2, 3], 'svm__class_weight': ['balanced']},\n",
198
+ " {'svm__kernel': ['linear'], 'svm__C': C_range, 'svm__class_weight': ['balanced']},\n",
199
+ "]"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": 10,
205
+ "id": "qBUGqPVmp-TH",
206
+ "metadata": {
207
+ "colab": {
208
+ "base_uri": "https://localhost:8080/"
209
  },
210
+ "id": "qBUGqPVmp-TH",
211
+ "outputId": "f3b9186e-5f25-4b14-a380-69df6232fc2b"
212
+ },
213
+ "outputs": [
214
  {
215
+ "name": "stdout",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "Per-fold F1 : [0.5938 0.5981 0.5761 0.6399 0.6123]\n",
219
+ "Mean F1 : 0.6040 +/- 0.0213\n"
220
+ ]
221
+ }
222
+ ],
223
+ "source": [
224
+ "outer_cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)\n",
225
+ "inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)\n",
226
+ "\n",
227
+ "# Pipeline keeps scaler inside each fold\n",
228
+ "svm_pipeline = Pipeline([\n",
229
+ " ('scaler', StandardScaler()),\n",
230
+ " ('svm', SVC(probability=True, random_state=RANDOM_STATE)),\n",
231
+ "])\n",
232
+ "\n",
233
+ "nested_svm = GridSearchCV(\n",
234
+ " estimator = svm_pipeline,\n",
235
+ " param_grid = svm_param_grid,\n",
236
+ " cv = inner_cv,\n",
237
+ " scoring = 'f1_weighted',\n",
238
+ " n_jobs = -1,\n",
239
+ " verbose = 0,\n",
240
+ " refit = True,\n",
241
+ ")\n",
242
+ "nested_svm_scores = cross_val_score(\n",
243
+ " nested_svm, X, y,\n",
244
+ " cv = outer_cv,\n",
245
+ " scoring = 'f1_weighted',\n",
246
+ " n_jobs = -1,\n",
247
+ ")\n",
248
+ "\n",
249
+ "print(f'Per-fold F1 : {np.round(nested_svm_scores, 4)}')\n",
250
+ "print(f'Mean F1 : {nested_svm_scores.mean():.4f} +/- {nested_svm_scores.std():.4f}')"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": 11,
256
+ "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
257
+ "metadata": {
258
+ "colab": {
259
+ "base_uri": "https://localhost:8080/"
260
  },
261
+ "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
262
+ "outputId": "56380093-2371-4284-a3b5-10622ec44adc"
263
+ },
264
+ "outputs": [
265
  {
266
+ "name": "stdout",
267
+ "output_type": "stream",
268
+ "text": [
269
+ "Running CV for Soft Voting champion\n",
270
+ "Per-fold F1 : [0.6316 0.6433 0.6289 0.7063 0.6331]\n",
271
+ "Mean F1 : 0.6486 +/- 0.0292\n"
272
+ ]
273
+ }
274
+ ],
275
+ "source": [
276
+ "\n",
277
+ "soft_voting = VotingClassifier(\n",
278
+ " estimators=[\n",
279
+ " ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',\n",
280
+ " random_state=RANDOM_STATE, n_jobs=-1)),\n",
281
+ " ('lr', LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),\n",
282
+ " ('xgb', xgb.XGBClassifier( n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,\n",
283
+ " colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),\n",
284
+ " ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,\n",
285
+ " random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),\n",
286
+ " ('knn', KNeighborsClassifier(n_neighbors=7)),\n",
287
+ " ('lda', LinearDiscriminantAnalysis()),\n",
288
+ " ],\n",
289
+ " voting='soft',\n",
290
+ " n_jobs=-1,\n",
291
+ ")\n",
292
+ "sv_pipeline = Pipeline([\n",
293
+ " ('scaler', StandardScaler()),\n",
294
+ " ('voting', soft_voting),\n",
295
+ "])\n",
296
+ "\n",
297
+ "print('Running CV for Soft Voting champion')\n",
298
+ "sv_scores = cross_val_score(sv_pipeline, X, y, cv=outer_cv, scoring='f1_weighted', n_jobs=-1)\n",
299
+ "print(f'Per-fold F1 : {np.round(sv_scores, 4)}')\n",
300
+ "print(f'Mean F1 : {sv_scores.mean():.4f} +/- {sv_scores.std():.4f}')"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": 12,
306
+ "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
307
+ "metadata": {
308
+ "colab": {
309
+ "base_uri": "https://localhost:8080/"
310
  },
311
+ "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
312
+ "outputId": "3b908043-6c47-428c-f434-abcacd15da08"
313
+ },
314
+ "outputs": [
315
  {
316
+ "name": "stdout",
317
+ "output_type": "stream",
318
+ "text": [
319
+ " Model F1_mean F1_std vs_A5b\n",
320
+ "A5 Champion (Soft Voting) 0.648627 0.029224 +0.0%\n",
321
+ " SVM (Nested CV) 0.604041 0.021310 -6.8%\n"
322
+ ]
323
+ }
324
+ ],
325
+ "source": [
326
+ "CHAMPION_F1 = 0.6484 # A5b reported score\n",
327
+ "\n",
328
+ "results = [\n",
329
+ " {'Model': 'SVM (Nested CV)', 'F1_mean': nested_svm_scores.mean(), 'F1_std': nested_svm_scores.std(), '_scores': nested_svm_scores},\n",
330
+ " {'Model': 'A5 Champion (Soft Voting)', 'F1_mean': sv_scores.mean(), 'F1_std': sv_scores.std(), '_scores': sv_scores},\n",
331
+ "]\n",
332
+ "\n",
333
+ "results_df = pd.DataFrame([{k:v for k,v in r.items() if k != '_scores'} for r in results])\n",
334
+ "results_df = results_df.sort_values('F1_mean', ascending=False).reset_index(drop=True)\n",
335
+ "results_df['vs_A5b'] = results_df['F1_mean'].apply(lambda f: f'{(f - CHAMPION_F1)/CHAMPION_F1*100:+.1f}%')\n",
336
+ "print(results_df[['Model','F1_mean','F1_std','vs_A5b']].to_string(index=False))"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "execution_count": 13,
342
+ "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
343
+ "metadata": {
344
+ "colab": {
345
+ "base_uri": "https://localhost:8080/"
 
 
 
 
 
 
 
 
 
 
346
  },
347
+ "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
348
+ "outputId": "8beb76a7-854d-4960-8e9d-1c88850792d5"
349
+ },
350
+ "outputs": [
351
  {
352
+ "name": "stdout",
353
+ "output_type": "stream",
354
+ "text": [
355
+ "SVM (Nested CV) vs A5 Champion: t=-3.913, p=0.0173 -> Significant\n"
356
+ ]
 
 
 
357
  }
358
+ ],
359
+ "source": [
360
+ "from scipy import stats\n",
361
+ "\n",
362
+ "def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
363
+ " k = len(scores_a)\n",
364
+ " diff = scores_a - scores_b\n",
365
+ " d_bar = diff.mean()\n",
366
+ " s_sq = diff.var(ddof=1)\n",
367
+ " var_corr = (1/k + n_test/n_train) * s_sq\n",
368
+ " t_stat = d_bar / np.sqrt(var_corr)\n",
369
+ " p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))\n",
370
+ " return float(t_stat), float(p_value)\n",
371
+ "\n",
372
+ "n_total = len(X)\n",
373
+ "n_test_fold = n_total // N_SPLITS\n",
374
+ "n_train_fold = n_total - n_test_fold\n",
375
+ "\n",
376
+ "score_map = {r['Model']: r['_scores'] for r in results}\n",
377
+ "sv_f1 = score_map['A5 Champion (Soft Voting)']\n",
378
+ "svm_f1 = score_map['SVM (Nested CV)']\n",
379
+ "\n",
380
+ "t, p = corrected_resampled_ttest(svm_f1, sv_f1, n_train_fold, n_test_fold)\n",
381
+ "sig = 'Significant' if p < 0.05 else 'Not significant'\n",
382
+ "print(f'SVM (Nested CV) vs A5 Champion: t={t:+.3f}, p={p:.4f} -> {sig}')"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 14,
388
+ "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
389
+ "metadata": {
390
  "colab": {
391
+ "base_uri": "https://localhost:8080/"
392
  },
393
+ "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
394
+ "outputId": "bfd2d8ec-e390-43f5-99bc-bbb517f1935b"
395
+ },
396
+ "outputs": [
397
+ {
398
+ "name": "stdout",
399
+ "output_type": "stream",
400
+ "text": [
401
+ "Fitting 5 folds for each of 52 candidates, totalling 260 fits\n",
402
+ "Best params: {'svm__C': 8, 'svm__class_weight': 'balanced', 'svm__gamma': 0.015625, 'svm__kernel': 'rbf'}\n",
403
+ "Model saved to champion_svm.pkl\n"
404
+ ]
 
 
 
405
  }
406
+ ],
407
+ "source": [
408
+ "final_pipeline = Pipeline([\n",
409
+ " ('scaler', StandardScaler()),\n",
410
+ " ('svm', SVC(probability=True, random_state=RANDOM_STATE)),\n",
411
+ "])\n",
412
+ "\n",
413
+ "final_grid = GridSearchCV(\n",
414
+ " final_pipeline, svm_param_grid,\n",
415
+ " cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),\n",
416
+ " scoring = 'f1_weighted',\n",
417
+ " n_jobs = -1, verbose=1,\n",
418
+ ")\n",
419
+ "final_grid.fit(X, y)\n",
420
+ "print(f'Best params: {final_grid.best_params_}')\n",
421
+ "\n",
422
+ "with open(OUT_DIR / 'champion_svm.pkl', 'wb') as f:\n",
423
+ " pickle.dump(final_grid.best_estimator_, f)\n",
424
+ "print('Model saved to champion_svm.pkl')"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": null,
430
+ "id": "YLYSUEj82IXQ",
431
+ "metadata": {
432
+ "id": "YLYSUEj82IXQ"
433
+ },
434
+ "outputs": [],
435
+ "source": []
436
+ }
437
+ ],
438
+ "metadata": {
439
+ "colab": {
440
+ "provenance": []
441
+ },
442
+ "kernelspec": {
443
+ "display_name": "Python 3 (ipykernel)",
444
+ "language": "python",
445
+ "name": "python3"
446
  },
447
+ "language_info": {
448
+ "codemirror_mode": {
449
+ "name": "ipython",
450
+ "version": 3
451
+ },
452
+ "file_extension": ".py",
453
+ "mimetype": "text/x-python",
454
+ "name": "python",
455
+ "nbconvert_exporter": "python",
456
+ "pygments_lexer": "ipython3",
457
+ "version": "3.12.8"
458
+ }
459
+ },
460
+ "nbformat": 4,
461
+ "nbformat_minor": 5
462
  }
A7/A7_PCA_with_clusters.csv ADDED
The diff for this file is too large to render. See raw diff
 
A7/A7_Report.ipynb ADDED
The diff for this file is too large to render. See raw diff