# DC-Well-Being-AI / eda_visualizations.py
# CHRISDANIEL145
# Deploy: Clean LF-tracked Release
# d1d1019
# =====================================================================
# MENTAL HEALTH PROJECT - EXPLORATORY DATA ANALYSIS & VISUALIZATIONS
# Generates charts for final-year project report
# =====================================================================
# Analysis / plotting stack used by every analysis function in this script.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Install a blanket "ignore" filter so no warnings are printed while the
# analyses run.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# pandas / numpy / scipy — confirm that silencing everything is intended.
warnings.filterwarnings('ignore')
# Use non-interactive backend for saving figures
# NOTE(review): the backend is selected after pyplot has already been imported
# above; presumably fine for 'Agg', but selecting it before the pyplot import
# is the conventional order — confirm.
import matplotlib
matplotlib.use('Agg')
# Set style for publication-quality figures
plt.style.use('seaborn-v0_8-whitegrid')
# Default seaborn colour palette for all figures produced below.
sns.set_palette("husl")
# =====================================================================
# LOAD ALL DATASETS
# =====================================================================
def load_datasets(data_dir='dataset'):
    """Load all 4 datasets.

    Args:
        data_dir: Directory that contains the four CSV files. Defaults to
            ``'dataset'`` (relative to the current working directory), the
            location that was previously hard-coded.

    Returns:
        dict: Maps ``'students'``, ``'social_media'``, ``'burnout'`` and
        ``'mbsr'`` (in that order) to the DataFrame read from the matching
        CSV file.

    Raises:
        FileNotFoundError: If one or more of the expected CSV files do not
            exist. The message lists every missing path at once.
    """
    from pathlib import Path

    base_dir = Path(data_dir)
    file_names = {
        'students': 'students_sleep_screen.csv',
        'social_media': 'social_media_depression.csv',
        'burnout': 'burnout_medical_post_covid.csv',
        'mbsr': 'mbsr_healthcare.csv',
    }
    csv_paths = {name: base_dir / file_name for name, file_name in file_names.items()}

    # Check up front so the user sees every missing file in a single error
    # instead of fixing them one failed read at a time.
    missing = [str(path) for path in csv_paths.values() if not path.is_file()]
    if missing:
        raise FileNotFoundError("Missing dataset file(s): " + ", ".join(missing))

    datasets = {name: pd.read_csv(path) for name, path in csv_paths.items()}

    print("✓ All datasets loaded successfully")
    for name, df in datasets.items():
        print(f" - {name}: {df.shape[0]} rows, {df.shape[1]} columns")
    return datasets
# =====================================================================
# 1. SCREEN TIME VS SLEEP QUALITY ANALYSIS
# =====================================================================
def analyze_screen_time_sleep(df, sleep_threshold=7.5):
    """
    Problem 1: Correlation Between Screen Time and Sleep Quality

    Draws a 2x2 figure (correlation heatmap, screen-time vs sleep-quality
    scatter with a linear trend line, night screen time by sleep category,
    sleep-quality histogram split by gender), saves it as
    ``fig_screen_time_sleep.png`` in the current working directory, and prints
    Pearson correlations plus an independent-samples t-test.

    Args:
        df: Student DataFrame. Must contain the columns
            ``screen_time_hours``, ``screen_time_night_hours``,
            ``social_media_hours``, ``sleep_duration_hours``,
            ``sleep_quality_score``, ``insomnia_score``, ``stress_score`` and
            ``gender``. The frame is not modified.
        sleep_threshold: ``sleep_quality_score`` value at or above which a
            row is labelled "Good Sleep". Defaults to 7.5.

    Returns:
        pandas.DataFrame: Correlation matrix of the seven numeric columns
        shown in the heatmap.
    """
    # Work on a copy: the derived sleep_label / sleep_category columns are
    # plotting helpers and must not leak into the caller's DataFrame (they
    # would otherwise show up in any later summary of the same frame).
    df = df.copy()

    print("\n" + "="*70)
    print("ANALYSIS 1: SCREEN TIME VS SLEEP QUALITY (Students)")
    print("="*70)

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    # 1a. Correlation Heatmap
    corr_cols = ['screen_time_hours', 'screen_time_night_hours',
                 'social_media_hours', 'sleep_duration_hours',
                 'sleep_quality_score', 'insomnia_score', 'stress_score']
    corr_matrix = df[corr_cols].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlBu_r', center=0,
                fmt='.2f', ax=axes[0, 0], square=True)
    axes[0, 0].set_title('Correlation Heatmap: Screen Time & Sleep Factors', fontsize=12)

    # 1b. Screen Time vs Sleep Quality Scatter (points coloured by stress)
    axes[0, 1].scatter(df['screen_time_hours'], df['sleep_quality_score'],
                       alpha=0.6, c=df['stress_score'], cmap='coolwarm')
    axes[0, 1].set_xlabel('Total Screen Time (hours)')
    axes[0, 1].set_ylabel('Sleep Quality Score')
    axes[0, 1].set_title('Screen Time vs Sleep Quality (color=stress)')

    # Add regression line (degree-1 least-squares fit), drawn over sorted x
    # so the dashed line is traced left to right.
    trend = np.poly1d(np.polyfit(df['screen_time_hours'], df['sleep_quality_score'], 1))
    screen_sorted = df['screen_time_hours'].sort_values()
    axes[0, 1].plot(screen_sorted, trend(screen_sorted),
                    "r--", alpha=0.8, label='Trend line')
    axes[0, 1].legend()

    # 1c. Night Screen Time Distribution by Sleep Label
    df['sleep_label'] = (df['sleep_quality_score'] >= sleep_threshold).astype(int)
    df['sleep_category'] = df['sleep_label'].map({0: 'Poor Sleep', 1: 'Good Sleep'})
    sns.boxplot(data=df, x='sleep_category', y='screen_time_night_hours', ax=axes[1, 0])
    axes[1, 0].set_title('Night Screen Time by Sleep Quality')
    axes[1, 0].set_xlabel('Sleep Category')
    axes[1, 0].set_ylabel('Night Screen Time (hours)')

    # 1d. Sleep Quality Distribution
    hist_ax = axes[1, 1]
    sns.histplot(data=df, x='sleep_quality_score', hue='gender', kde=True, ax=hist_ax)
    hist_ax.set_title('Sleep Quality Distribution by Gender')
    threshold_line = hist_ax.axvline(x=sleep_threshold, color='red', linestyle='--',
                                     label=f'Threshold ({sleep_threshold})')
    # seaborn has already attached the gender legend. A bare ax.legend() call
    # would replace it with a legend holding only the threshold entry, so
    # keep the gender legend as a regular artist and add a second legend for
    # the threshold line.
    gender_legend = hist_ax.get_legend()
    if gender_legend is not None:
        hist_ax.add_artist(gender_legend)
    hist_ax.legend(handles=[threshold_line], loc='upper left')

    fig.tight_layout()
    fig.savefig('fig_screen_time_sleep.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Statistical Analysis
    print("\n--- Statistical Summary ---")
    corr, p_val = stats.pearsonr(df['screen_time_hours'], df['sleep_quality_score'])
    print(f"Correlation (Screen Time vs Sleep Quality): r = {corr:.4f}, p = {p_val:.4f}")
    corr_night, p_night = stats.pearsonr(df['screen_time_night_hours'], df['sleep_quality_score'])
    print(f"Correlation (Night Screen vs Sleep Quality): r = {corr_night:.4f}, p = {p_night:.4f}")

    # T-test: Good vs Poor sleepers
    good_sleep = df[df['sleep_label'] == 1]['screen_time_night_hours']
    poor_sleep = df[df['sleep_label'] == 0]['screen_time_night_hours']
    t_stat, t_pval = stats.ttest_ind(good_sleep, poor_sleep)
    print(f"\nT-test (Night Screen Time: Good vs Poor Sleep):")
    print(f" t-statistic = {t_stat:.4f}, p-value = {t_pval:.4f}")

    print("\n✓ Figure saved: fig_screen_time_sleep.png")
    return corr_matrix
# =====================================================================
# 2. MBSR EFFECTIVENESS ANALYSIS
# =====================================================================
def analyze_mbsr_effectiveness(df):
    """
    Problem 2: Effectiveness of MBSR in Healthcare Workers

    Draws a 2x2 figure (stress and burnout box plots by MBSR group, weeks
    completed vs burnout for participants, mean burnout by COVID period and
    MBSR group), saves it as ``fig_mbsr_effectiveness.png`` in the current
    working directory, and prints an independent-samples t-test on burnout
    score between participants and non-participants.

    Args:
        df: Healthcare-worker DataFrame. Must contain the columns
            ``mbsr_participation``, ``stress_score``, ``burnout_score``,
            ``mbsr_weeks_completed`` and ``post_covid_flag``. The two flag
            columns are mapped from 0/1 to display labels; any other value
            becomes NaN in the derived label. The frame is not modified.

    Returns:
        None
    """
    # Work on a copy so the derived label columns (mbsr_group, covid_period)
    # are not written back into the caller's DataFrame.
    df = df.copy()

    print("\n" + "="*70)
    print("ANALYSIS 2: MBSR EFFECTIVENESS (Healthcare Workers)")
    print("="*70)

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    # 2a. Stress Score: MBSR vs Non-MBSR
    df['mbsr_group'] = df['mbsr_participation'].map({0: 'No MBSR', 1: 'MBSR Participant'})
    sns.boxplot(data=df, x='mbsr_group', y='stress_score', ax=axes[0, 0])
    axes[0, 0].set_title('Stress Score: MBSR vs Non-MBSR Groups')
    axes[0, 0].set_ylabel('Stress Score')

    # 2b. Burnout Score: MBSR vs Non-MBSR
    sns.boxplot(data=df, x='mbsr_group', y='burnout_score', ax=axes[0, 1])
    axes[0, 1].set_title('Burnout Score: MBSR vs Non-MBSR Groups')
    axes[0, 1].set_ylabel('Burnout Score')

    # 2c. MBSR Weeks vs Burnout (for participants only)
    mbsr_participants = df[df['mbsr_participation'] == 1]
    weeks = mbsr_participants['mbsr_weeks_completed']
    axes[1, 0].scatter(weeks, mbsr_participants['burnout_score'], alpha=0.6)
    axes[1, 0].set_xlabel('MBSR Weeks Completed')
    axes[1, 0].set_ylabel('Burnout Score')
    axes[1, 0].set_title('MBSR Duration vs Burnout Score')

    # Add trend line. A straight-line fit needs more than two points and at
    # least two distinct x values; otherwise the slope is undetermined and
    # the "trend" would be meaningless.
    if len(mbsr_participants) > 2 and weeks.nunique() > 1:
        trend = np.poly1d(np.polyfit(weeks, mbsr_participants['burnout_score'], 1))
        x_line = np.linspace(weeks.min(), weeks.max(), 100)
        axes[1, 0].plot(x_line, trend(x_line), "r--", label='Trend')
        axes[1, 0].legend()

    # 2d. Pre vs Post COVID Burnout
    df['covid_period'] = df['post_covid_flag'].map({0: 'Pre-COVID', 1: 'Post-COVID'})
    sns.barplot(data=df, x='covid_period', y='burnout_score', hue='mbsr_group', ax=axes[1, 1])
    axes[1, 1].set_title('Burnout: Pre vs Post COVID (by MBSR Status)')
    axes[1, 1].set_ylabel('Mean Burnout Score')

    fig.tight_layout()
    fig.savefig('fig_mbsr_effectiveness.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Statistical Tests
    print("\n--- Statistical Analysis ---")
    mbsr_yes = df[df['mbsr_participation'] == 1]['burnout_score']
    mbsr_no = df[df['mbsr_participation'] == 0]['burnout_score']
    t_stat, t_pval = stats.ttest_ind(mbsr_yes, mbsr_no)
    print(f"T-test (Burnout: MBSR vs Non-MBSR):")
    print(f" Mean MBSR: {mbsr_yes.mean():.2f}, Mean Non-MBSR: {mbsr_no.mean():.2f}")
    print(f" t-statistic = {t_stat:.4f}, p-value = {t_pval:.4f}")
    if t_pval < 0.05:
        print(" → Statistically significant difference (p < 0.05)")
    else:
        print(" → No statistically significant difference (p >= 0.05)")

    print("\n✓ Figure saved: fig_mbsr_effectiveness.png")
# =====================================================================
# 3. SOCIAL MEDIA & DEPRESSION/ANXIETY ANALYSIS
# =====================================================================
def analyze_social_media_mental_health(df):
    """
    Problems 3 & 4: how social media use relates to depression and anxiety.

    Builds a 2x2 figure (usage vs PHQ-9 scatter, depression rate per dominant
    platform, overlaid PHQ-9 / GAD-7 histograms, depression and anxiety rates
    per role), writes it to ``fig_social_media_mental_health.png`` and prints
    Pearson correlations and the overall label rates.

    Reads the columns ``social_media_hours``, ``phq9_score``, ``gad7_score``,
    ``depression_label``, ``anxiety_label``, ``social_media_type_dominant``
    and ``role`` from ``df``. Returns nothing.
    """
    divider = "="*70
    print("\n" + divider)
    print("ANALYSIS 3: SOCIAL MEDIA & DEPRESSION/ANXIETY")
    print(divider)

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    (usage_ax, platform_ax), (scores_ax, role_ax) = axes

    # Top-left: daily usage against PHQ-9, coloured by the depression label.
    usage_ax.scatter(
        df['social_media_hours'],
        df['phq9_score'],
        alpha=0.6,
        c=df['depression_label'],
        cmap='RdYlGn_r',
    )
    usage_ax.set_xlabel('Social Media Hours (daily)')
    usage_ax.set_ylabel('PHQ-9 Score (Depression)')
    usage_ax.set_title('Social Media Usage vs Depression Score')

    # Top-right: share of depression-labelled rows per dominant platform,
    # lowest rate at the bottom of the horizontal bar chart.
    rate_by_platform = df.groupby('social_media_type_dominant')['depression_label'].mean() * 100
    ordered_rates = rate_by_platform.sort_values(ascending=True)
    ordered_rates.plot(kind='barh', ax=platform_ax, color='coral')
    platform_ax.set_xlabel('Depression Rate (%)')
    platform_ax.set_title('Depression Rate by Social Media Platform')

    # Bottom-left: the two questionnaire score distributions, overlaid.
    histogram_specs = (
        ('phq9_score', 'PHQ-9 (Depression)', 'blue'),
        ('gad7_score', 'GAD-7 (Anxiety)', 'orange'),
    )
    for column, legend_text, shade in histogram_specs:
        scores_ax.hist(df[column], bins=20, alpha=0.7, label=legend_text, color=shade)
    scores_ax.set_xlabel('Score')
    scores_ax.set_ylabel('Frequency')
    scores_ax.set_title('Distribution of PHQ-9 and GAD-7 Scores')
    scores_ax.legend()

    # Bottom-right: depression and anxiety label rates for each role.
    label_columns = ['depression_label', 'anxiety_label']
    rates_by_role = df.groupby('role')[label_columns].mean() * 100
    rates_by_role.plot(kind='bar', ax=role_ax)
    role_ax.set_xlabel('Role')
    role_ax.set_ylabel('Percentage (%)')
    role_ax.set_title('Depression & Anxiety Rates by Role')
    role_ax.legend(['Depression', 'Anxiety'])
    role_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig('fig_social_media_mental_health.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Console summary.
    print("\n--- Statistical Summary ---")
    usage_hours = df['social_media_hours']
    corr_dep, p_dep = stats.pearsonr(usage_hours, df['phq9_score'])
    print(f"Correlation (Social Media Hours vs PHQ-9): r = {corr_dep:.4f}, p = {p_dep:.4f}")
    corr_anx, p_anx = stats.pearsonr(usage_hours, df['gad7_score'])
    print(f"Correlation (Social Media Hours vs GAD-7): r = {corr_anx:.4f}, p = {p_anx:.4f}")
    print(f"\nDepression Rate: {df['depression_label'].mean()*100:.1f}%")
    print(f"Anxiety Rate: {df['anxiety_label'].mean()*100:.1f}%")

    print("\n✓ Figure saved: fig_social_media_mental_health.png")
# =====================================================================
# 4. BURNOUT POST-COVID ANALYSIS
# =====================================================================
def analyze_burnout_post_covid(df, high_burnout_threshold=60):
    """
    Problem 5: Burnout Among Medical Professionals Post-COVID

    Draws a 2x2 figure (burnout-category pie, work hours vs burnout scatter,
    mean burnout per role with standard-deviation error bars, correlation of
    selected factors with burnout score), saves it as
    ``fig_burnout_post_covid.png`` in the current working directory, and
    prints descriptive statistics plus an independent-samples t-test of
    burnout score by MBSR participation.

    Args:
        df: Medical-professional DataFrame. Must contain the columns
            ``burnout_label``, ``burnout_score``, ``work_hours_per_week``,
            ``patient_load_per_week``, ``stress_score``,
            ``mbsr_participation`` and ``role``. ``burnout_label`` is mapped
            from 0/1 to "Low Burnout"/"High Burnout"; other values are left
            out of the pie chart. The frame is not modified.
        high_burnout_threshold: ``burnout_score`` value at which the dashed
            reference line is drawn on the scatter plot. Defaults to 60.

    Returns:
        None
    """
    # Work on a copy so the derived burnout_category column is not written
    # back into the caller's DataFrame.
    df = df.copy()

    print("\n" + "="*70)
    print("ANALYSIS 4: BURNOUT POST-COVID (Medical Professionals)")
    print("="*70)

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    # 4a. Burnout Distribution
    df['burnout_category'] = df['burnout_label'].map({0: 'Low Burnout', 1: 'High Burnout'})
    burnout_counts = df['burnout_category'].value_counts()
    # value_counts() orders by frequency, so colours/explode must be looked up
    # per category rather than assigned by position; otherwise a majority of
    # high-burnout rows would be drawn green and unexploded. Categories with
    # no rows are skipped so the style lists always match the wedge count.
    pie_style = {
        'Low Burnout': ('lightgreen', 0),
        'High Burnout': ('salmon', 0.05),
    }
    present = [category for category in pie_style if burnout_counts.get(category, 0) > 0]
    if present:
        axes[0, 0].pie([burnout_counts[category] for category in present],
                       labels=present, autopct='%1.1f%%',
                       colors=[pie_style[category][0] for category in present],
                       explode=[pie_style[category][1] for category in present])
    axes[0, 0].set_title('Burnout Distribution (Post-COVID)')

    # 4b. Work Hours vs Burnout Score
    axes[0, 1].scatter(df['work_hours_per_week'], df['burnout_score'],
                       alpha=0.6, c=df['burnout_label'], cmap='RdYlGn_r')
    axes[0, 1].set_xlabel('Work Hours per Week')
    axes[0, 1].set_ylabel('Burnout Score')
    axes[0, 1].set_title('Work Hours vs Burnout Score')
    axes[0, 1].axhline(y=high_burnout_threshold, color='red', linestyle='--',
                       alpha=0.5, label='High Burnout Threshold')
    axes[0, 1].legend()

    # 4c. Burnout by Role (bar height = mean, error bar = standard deviation)
    role_burnout = df.groupby('role')['burnout_score'].agg(['mean', 'std'])
    role_burnout['mean'].plot(kind='bar', yerr=role_burnout['std'], ax=axes[1, 0],
                              capsize=5, color=['steelblue', 'coral'])
    axes[1, 0].set_xlabel('Role')
    axes[1, 0].set_ylabel('Mean Burnout Score')
    axes[1, 0].set_title('Burnout Score by Role')
    axes[1, 0].tick_params(axis='x', rotation=0)

    # 4d. Factors Contributing to Burnout (Correlation)
    factors = ['stress_score', 'work_hours_per_week', 'patient_load_per_week', 'mbsr_participation']
    # Drop the self-correlation by label instead of relying on its position.
    correlations = df[factors + ['burnout_score']].corr()['burnout_score'].drop('burnout_score')
    correlations.sort_values().plot(kind='barh', ax=axes[1, 1], color='teal')
    axes[1, 1].set_xlabel('Correlation with Burnout Score')
    axes[1, 1].set_title('Factors Contributing to Burnout')
    axes[1, 1].axvline(x=0, color='black', linestyle='-', linewidth=0.5)

    fig.tight_layout()
    fig.savefig('fig_burnout_post_covid.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Statistics
    print("\n--- Statistical Summary ---")
    print(f"Total Medical Professionals: {len(df)}")
    print(f"High Burnout Rate: {df['burnout_label'].mean()*100:.1f}%")
    print(f"Mean Burnout Score: {df['burnout_score'].mean():.2f} (SD: {df['burnout_score'].std():.2f})")
    print(f"Mean Work Hours: {df['work_hours_per_week'].mean():.1f} hours/week")
    print(f"Mean Patient Load: {df['patient_load_per_week'].mean():.1f} patients/week")

    # MBSR effect
    mbsr_yes = df[df['mbsr_participation'] == 1]['burnout_score']
    mbsr_no = df[df['mbsr_participation'] == 0]['burnout_score']
    t_stat, t_pval = stats.ttest_ind(mbsr_yes, mbsr_no)
    print(f"\nMBSR Effect on Burnout:")
    print(f" With MBSR: {mbsr_yes.mean():.2f}, Without MBSR: {mbsr_no.mean():.2f}")
    print(f" t-statistic = {t_stat:.4f}, p-value = {t_pval:.4f}")

    print("\n✓ Figure saved: fig_burnout_post_covid.png")
# =====================================================================
# 5. COMPREHENSIVE SUMMARY STATISTICS
# =====================================================================
def generate_summary_statistics(datasets):
    """Print shape and ``describe()`` output for each dataset.

    ``datasets`` maps a dataset name to its DataFrame. For every entry the
    upper-cased name, the frame's shape and its ``describe()`` table rounded
    to two decimals are written to stdout, followed by a dashed separator.
    Nothing is returned.
    """
    heading_rule = "="*70
    print("\n" + heading_rule)
    print("COMPREHENSIVE SUMMARY STATISTICS")
    print(heading_rule)

    for name in datasets:
        df = datasets[name]
        rounded_overview = df.describe().round(2)
        print(f"\n--- {name.upper()} DATASET ---")
        print(f"Shape: {df.shape}")
        print(f"\nNumerical Summary:")
        print(rounded_overview.to_string())
        print("\n" + "-"*50)
# =====================================================================
# MAIN EXECUTION
# =====================================================================
def run_all_analyses():
    """Run the whole EDA: load the data, run each analysis, print a recap.

    The datasets are loaded once, each analysis function is then called with
    its dataset in a fixed order, summary statistics are printed for all
    datasets, and finally the names of the generated figure files are listed.
    """
    rule = "="*70
    print("\n" + rule)
    print("MENTAL HEALTH PROJECT - COMPLETE EDA")
    print(rule)

    datasets = load_datasets()

    # (analysis function, dataset key) pairs, executed top to bottom.
    pipeline = (
        (analyze_screen_time_sleep, 'students'),
        (analyze_mbsr_effectiveness, 'mbsr'),
        (analyze_social_media_mental_health, 'social_media'),
        (analyze_burnout_post_covid, 'burnout'),
    )
    for analysis, dataset_key in pipeline:
        analysis(datasets[dataset_key])

    generate_summary_statistics(datasets)

    closing_lines = (
        "\n" + rule,
        "✅ ALL ANALYSES COMPLETE",
        "Generated Figures:",
        " 1. fig_screen_time_sleep.png",
        " 2. fig_mbsr_effectiveness.png",
        " 3. fig_social_media_mental_health.png",
        " 4. fig_burnout_post_covid.png",
        rule,
    )
    for line in closing_lines:
        print(line)
# Script entry point: run the complete EDA only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_all_analyses()