Spaces:

Danielchris145
/

DC-Well-Being-AI

Sleeping

DC-Well-Being-AI / eda_visualizations.py

CHRISDANIEL145

Deploy: Clean LF-tracked Release

d1d1019 5 months ago

15.7 kB

	# =====================================================================
	# MENTAL HEALTH PROJECT - EXPLORATORY DATA ANALYSIS & VISUALIZATIONS
	# Generates charts for final-year project report
	# =====================================================================

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from scipy import stats
	import warnings
	warnings.filterwarnings('ignore')

	# Use non-interactive backend for saving figures
	import matplotlib
	matplotlib.use('Agg')

	# Set style for publication-quality figures
	plt.style.use('seaborn-v0_8-whitegrid')
	sns.set_palette("husl")

	# =====================================================================
	# LOAD ALL DATASETS
	# =====================================================================
	def load_datasets():
	"""Load all 4 datasets"""
	datasets = {
	'students': pd.read_csv('dataset/students_sleep_screen.csv'),
	'social_media': pd.read_csv('dataset/social_media_depression.csv'),
	'burnout': pd.read_csv('dataset/burnout_medical_post_covid.csv'),
	'mbsr': pd.read_csv('dataset/mbsr_healthcare.csv')
	}
	print("✓ All datasets loaded successfully")
	for name, df in datasets.items():
	print(f" - {name}: {df.shape[0]} rows, {df.shape[1]} columns")
	return datasets

	# =====================================================================
	# 1. SCREEN TIME VS SLEEP QUALITY ANALYSIS
	# =====================================================================
	def analyze_screen_time_sleep(df):
	"""
	Problem 1: Correlation Between Screen Time and Sleep Quality
	"""
	print("\n" + "="*70)
	print("ANALYSIS 1: SCREEN TIME VS SLEEP QUALITY (Students)")
	print("="*70)

	fig, axes = plt.subplots(2, 2, figsize=(14, 12))

	# 1a. Correlation Heatmap
	corr_cols = ['screen_time_hours', 'screen_time_night_hours',
	'social_media_hours', 'sleep_duration_hours',
	'sleep_quality_score', 'insomnia_score', 'stress_score']
	corr_matrix = df[corr_cols].corr()

	sns.heatmap(corr_matrix, annot=True, cmap='RdYlBu_r', center=0,
	fmt='.2f', ax=axes[0, 0], square=True)
	axes[0, 0].set_title('Correlation Heatmap: Screen Time & Sleep Factors', fontsize=12)

	# 1b. Screen Time vs Sleep Quality Scatter
	axes[0, 1].scatter(df['screen_time_hours'], df['sleep_quality_score'],
	alpha=0.6, c=df['stress_score'], cmap='coolwarm')
	axes[0, 1].set_xlabel('Total Screen Time (hours)')
	axes[0, 1].set_ylabel('Sleep Quality Score')
	axes[0, 1].set_title('Screen Time vs Sleep Quality (color=stress)')

	# Add regression line
	z = np.polyfit(df['screen_time_hours'], df['sleep_quality_score'], 1)
	p = np.poly1d(z)
	axes[0, 1].plot(df['screen_time_hours'].sort_values(),
	p(df['screen_time_hours'].sort_values()),
	"r--", alpha=0.8, label=f'Trend line')
	axes[0, 1].legend()

	# 1c. Night Screen Time Distribution by Sleep Label
	df['sleep_label'] = (df['sleep_quality_score'] >= 7.5).astype(int)
	df['sleep_category'] = df['sleep_label'].map({0: 'Poor Sleep', 1: 'Good Sleep'})

	sns.boxplot(data=df, x='sleep_category', y='screen_time_night_hours', ax=axes[1, 0])
	axes[1, 0].set_title('Night Screen Time by Sleep Quality')
	axes[1, 0].set_xlabel('Sleep Category')
	axes[1, 0].set_ylabel('Night Screen Time (hours)')

	# 1d. Sleep Quality Distribution
	sns.histplot(data=df, x='sleep_quality_score', hue='gender', kde=True, ax=axes[1, 1])
	axes[1, 1].set_title('Sleep Quality Distribution by Gender')
	axes[1, 1].axvline(x=7.5, color='red', linestyle='--', label='Threshold (7.5)')
	axes[1, 1].legend()

	plt.tight_layout()
	plt.savefig('fig_screen_time_sleep.png', dpi=300, bbox_inches='tight')
	plt.close()

	# Statistical Analysis
	print("\n--- Statistical Summary ---")
	corr, p_val = stats.pearsonr(df['screen_time_hours'], df['sleep_quality_score'])
	print(f"Correlation (Screen Time vs Sleep Quality): r = {corr:.4f}, p = {p_val:.4f}")

	corr_night, p_night = stats.pearsonr(df['screen_time_night_hours'], df['sleep_quality_score'])
	print(f"Correlation (Night Screen vs Sleep Quality): r = {corr_night:.4f}, p = {p_night:.4f}")

	# T-test: Good vs Poor sleepers
	good_sleep = df[df['sleep_label'] == 1]['screen_time_night_hours']
	poor_sleep = df[df['sleep_label'] == 0]['screen_time_night_hours']
	t_stat, t_pval = stats.ttest_ind(good_sleep, poor_sleep)
	print(f"\nT-test (Night Screen Time: Good vs Poor Sleep):")
	print(f" t-statistic = {t_stat:.4f}, p-value = {t_pval:.4f}")

	print("\n✓ Figure saved: fig_screen_time_sleep.png")
	return corr_matrix

	# =====================================================================
	# 2. MBSR EFFECTIVENESS ANALYSIS
	# =====================================================================
	def analyze_mbsr_effectiveness(df):
	"""
	Problem 2: Effectiveness of MBSR in Healthcare Workers
	"""
	print("\n" + "="*70)
	print("ANALYSIS 2: MBSR EFFECTIVENESS (Healthcare Workers)")
	print("="*70)

	fig, axes = plt.subplots(2, 2, figsize=(14, 12))

	# 2a. Stress Score: MBSR vs Non-MBSR
	df['mbsr_group'] = df['mbsr_participation'].map({0: 'No MBSR', 1: 'MBSR Participant'})
	sns.boxplot(data=df, x='mbsr_group', y='stress_score', ax=axes[0, 0])
	axes[0, 0].set_title('Stress Score: MBSR vs Non-MBSR Groups')
	axes[0, 0].set_ylabel('Stress Score')

	# 2b. Burnout Score: MBSR vs Non-MBSR
	sns.boxplot(data=df, x='mbsr_group', y='burnout_score', ax=axes[0, 1])
	axes[0, 1].set_title('Burnout Score: MBSR vs Non-MBSR Groups')
	axes[0, 1].set_ylabel('Burnout Score')

	# 2c. MBSR Weeks vs Burnout (for participants only)
	mbsr_participants = df[df['mbsr_participation'] == 1]
	axes[1, 0].scatter(mbsr_participants['mbsr_weeks_completed'],
	mbsr_participants['burnout_score'], alpha=0.6)
	axes[1, 0].set_xlabel('MBSR Weeks Completed')
	axes[1, 0].set_ylabel('Burnout Score')
	axes[1, 0].set_title('MBSR Duration vs Burnout Score')

	# Add trend line
	if len(mbsr_participants) > 2:
	z = np.polyfit(mbsr_participants['mbsr_weeks_completed'],
	mbsr_participants['burnout_score'], 1)
	p = np.poly1d(z)
	x_line = np.linspace(mbsr_participants['mbsr_weeks_completed'].min(),
	mbsr_participants['mbsr_weeks_completed'].max(), 100)
	axes[1, 0].plot(x_line, p(x_line), "r--", label='Trend')
	axes[1, 0].legend()

	# 2d. Pre vs Post COVID Burnout
	df['covid_period'] = df['post_covid_flag'].map({0: 'Pre-COVID', 1: 'Post-COVID'})
	sns.barplot(data=df, x='covid_period', y='burnout_score', hue='mbsr_group', ax=axes[1, 1])
	axes[1, 1].set_title('Burnout: Pre vs Post COVID (by MBSR Status)')
	axes[1, 1].set_ylabel('Mean Burnout Score')

	plt.tight_layout()
	plt.savefig('fig_mbsr_effectiveness.png', dpi=300, bbox_inches='tight')
	plt.close()

	# Statistical Tests
	print("\n--- Statistical Analysis ---")
	mbsr_yes = df[df['mbsr_participation'] == 1]['burnout_score']
	mbsr_no = df[df['mbsr_participation'] == 0]['burnout_score']

	t_stat, t_pval = stats.ttest_ind(mbsr_yes, mbsr_no)
	print(f"T-test (Burnout: MBSR vs Non-MBSR):")
	print(f" Mean MBSR: {mbsr_yes.mean():.2f}, Mean Non-MBSR: {mbsr_no.mean():.2f}")
	print(f" t-statistic = {t_stat:.4f}, p-value = {t_pval:.4f}")

	if t_pval < 0.05:
	print(" → Statistically significant difference (p < 0.05)")
	else:
	print(" → No statistically significant difference (p >= 0.05)")

	print("\n✓ Figure saved: fig_mbsr_effectiveness.png")

	# =====================================================================
	# 3. SOCIAL MEDIA & DEPRESSION/ANXIETY ANALYSIS
	# =====================================================================
	def analyze_social_media_mental_health(df):
	"""
	Problem 3 & 4: Social Media Impact on Depression/Anxiety
	"""
	print("\n" + "="*70)
	print("ANALYSIS 3: SOCIAL MEDIA & DEPRESSION/ANXIETY")
	print("="*70)

	fig, axes = plt.subplots(2, 2, figsize=(14, 12))

	# 3a. Social Media Hours vs PHQ-9 Score
	axes[0, 0].scatter(df['social_media_hours'], df['phq9_score'],
	alpha=0.6, c=df['depression_label'], cmap='RdYlGn_r')
	axes[0, 0].set_xlabel('Social Media Hours (daily)')
	axes[0, 0].set_ylabel('PHQ-9 Score (Depression)')
	axes[0, 0].set_title('Social Media Usage vs Depression Score')

	# 3b. Depression by Platform
	platform_depression = df.groupby('social_media_type_dominant')['depression_label'].mean() * 100
	platform_depression.sort_values(ascending=True).plot(kind='barh', ax=axes[0, 1], color='coral')
	axes[0, 1].set_xlabel('Depression Rate (%)')
	axes[0, 1].set_title('Depression Rate by Social Media Platform')

	# 3c. PHQ-9 and GAD-7 Distribution
	axes[1, 0].hist(df['phq9_score'], bins=20, alpha=0.7, label='PHQ-9 (Depression)', color='blue')
	axes[1, 0].hist(df['gad7_score'], bins=20, alpha=0.7, label='GAD-7 (Anxiety)', color='orange')
	axes[1, 0].set_xlabel('Score')
	axes[1, 0].set_ylabel('Frequency')
	axes[1, 0].set_title('Distribution of PHQ-9 and GAD-7 Scores')
	axes[1, 0].legend()

	# 3d. Depression & Anxiety by Role
	role_stats = df.groupby('role')[['depression_label', 'anxiety_label']].mean() * 100
	role_stats.plot(kind='bar', ax=axes[1, 1])
	axes[1, 1].set_xlabel('Role')
	axes[1, 1].set_ylabel('Percentage (%)')
	axes[1, 1].set_title('Depression & Anxiety Rates by Role')
	axes[1, 1].legend(['Depression', 'Anxiety'])
	axes[1, 1].tick_params(axis='x', rotation=45)

	plt.tight_layout()
	plt.savefig('fig_social_media_mental_health.png', dpi=300, bbox_inches='tight')
	plt.close()

	# Statistics
	print("\n--- Statistical Summary ---")
	corr_dep, p_dep = stats.pearsonr(df['social_media_hours'], df['phq9_score'])
	print(f"Correlation (Social Media Hours vs PHQ-9): r = {corr_dep:.4f}, p = {p_dep:.4f}")

	corr_anx, p_anx = stats.pearsonr(df['social_media_hours'], df['gad7_score'])
	print(f"Correlation (Social Media Hours vs GAD-7): r = {corr_anx:.4f}, p = {p_anx:.4f}")

	print(f"\nDepression Rate: {df['depression_label'].mean()*100:.1f}%")
	print(f"Anxiety Rate: {df['anxiety_label'].mean()*100:.1f}%")

	print("\n✓ Figure saved: fig_social_media_mental_health.png")


	# =====================================================================
	# 4. BURNOUT POST-COVID ANALYSIS
	# =====================================================================
	def analyze_burnout_post_covid(df):
	"""
	Problem 5: Burnout Among Medical Professionals Post-COVID
	"""
	print("\n" + "="*70)
	print("ANALYSIS 4: BURNOUT POST-COVID (Medical Professionals)")
	print("="*70)

	fig, axes = plt.subplots(2, 2, figsize=(14, 12))

	# 4a. Burnout Distribution
	df['burnout_category'] = df['burnout_label'].map({0: 'Low Burnout', 1: 'High Burnout'})
	burnout_counts = df['burnout_category'].value_counts()
	axes[0, 0].pie(burnout_counts, labels=burnout_counts.index, autopct='%1.1f%%',
	colors=['lightgreen', 'salmon'], explode=[0, 0.05])
	axes[0, 0].set_title('Burnout Distribution (Post-COVID)')

	# 4b. Work Hours vs Burnout Score
	axes[0, 1].scatter(df['work_hours_per_week'], df['burnout_score'],
	alpha=0.6, c=df['burnout_label'], cmap='RdYlGn_r')
	axes[0, 1].set_xlabel('Work Hours per Week')
	axes[0, 1].set_ylabel('Burnout Score')
	axes[0, 1].set_title('Work Hours vs Burnout Score')
	axes[0, 1].axhline(y=60, color='red', linestyle='--', alpha=0.5, label='High Burnout Threshold')
	axes[0, 1].legend()

	# 4c. Burnout by Role
	role_burnout = df.groupby('role')['burnout_score'].agg(['mean', 'std'])
	role_burnout['mean'].plot(kind='bar', yerr=role_burnout['std'], ax=axes[1, 0],
	capsize=5, color=['steelblue', 'coral'])
	axes[1, 0].set_xlabel('Role')
	axes[1, 0].set_ylabel('Mean Burnout Score')
	axes[1, 0].set_title('Burnout Score by Role')
	axes[1, 0].tick_params(axis='x', rotation=0)

	# 4d. Factors Contributing to Burnout (Correlation)
	factors = ['stress_score', 'work_hours_per_week', 'patient_load_per_week', 'mbsr_participation']
	correlations = df[factors + ['burnout_score']].corr()['burnout_score'][:-1]
	correlations.sort_values().plot(kind='barh', ax=axes[1, 1], color='teal')
	axes[1, 1].set_xlabel('Correlation with Burnout Score')
	axes[1, 1].set_title('Factors Contributing to Burnout')
	axes[1, 1].axvline(x=0, color='black', linestyle='-', linewidth=0.5)

	plt.tight_layout()
	plt.savefig('fig_burnout_post_covid.png', dpi=300, bbox_inches='tight')
	plt.close()

	# Statistics
	print("\n--- Statistical Summary ---")
	print(f"Total Medical Professionals: {len(df)}")
	print(f"High Burnout Rate: {df['burnout_label'].mean()*100:.1f}%")
	print(f"Mean Burnout Score: {df['burnout_score'].mean():.2f} (SD: {df['burnout_score'].std():.2f})")
	print(f"Mean Work Hours: {df['work_hours_per_week'].mean():.1f} hours/week")
	print(f"Mean Patient Load: {df['patient_load_per_week'].mean():.1f} patients/week")

	# MBSR effect
	mbsr_yes = df[df['mbsr_participation'] == 1]['burnout_score']
	mbsr_no = df[df['mbsr_participation'] == 0]['burnout_score']
	t_stat, t_pval = stats.ttest_ind(mbsr_yes, mbsr_no)
	print(f"\nMBSR Effect on Burnout:")
	print(f" With MBSR: {mbsr_yes.mean():.2f}, Without MBSR: {mbsr_no.mean():.2f}")
	print(f" t-statistic = {t_stat:.4f}, p-value = {t_pval:.4f}")

	print("\n✓ Figure saved: fig_burnout_post_covid.png")

	# =====================================================================
	# 5. COMPREHENSIVE SUMMARY STATISTICS
	# =====================================================================
	def generate_summary_statistics(datasets):
	"""Generate summary statistics for all datasets"""
	print("\n" + "="*70)
	print("COMPREHENSIVE SUMMARY STATISTICS")
	print("="*70)

	for name, df in datasets.items():
	print(f"\n--- {name.upper()} DATASET ---")
	print(f"Shape: {df.shape}")
	print(f"\nNumerical Summary:")
	print(df.describe().round(2).to_string())
	print("\n" + "-"*50)

	# =====================================================================
	# MAIN EXECUTION
	# =====================================================================
	def run_all_analyses():
	"""Run complete EDA for all problem statements"""

	print("\n" + "="*70)
	print("MENTAL HEALTH PROJECT - COMPLETE EDA")
	print("="*70)

	# Load data
	datasets = load_datasets()

	# Run analyses
	analyze_screen_time_sleep(datasets['students'])
	analyze_mbsr_effectiveness(datasets['mbsr'])
	analyze_social_media_mental_health(datasets['social_media'])
	analyze_burnout_post_covid(datasets['burnout'])

	# Summary
	generate_summary_statistics(datasets)

	print("\n" + "="*70)
	print("✅ ALL ANALYSES COMPLETE")
	print("Generated Figures:")
	print(" 1. fig_screen_time_sleep.png")
	print(" 2. fig_mbsr_effectiveness.png")
	print(" 3. fig_social_media_mental_health.png")
	print(" 4. fig_burnout_post_covid.png")
	print("="*70)

	if __name__ == "__main__":
	run_all_analyses()