heart disease classification

import%20marimo%0A%0A__generated_with%20%3D%20%220.19.6%22%0Aapp%20%3D%20marimo.App(width%3D%22medium%22)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20%23%20Data%20Processing%20and%20Visualization%0A%20%20%20%20import%20marimo%20as%20mo%0A%20%20%20%20import%20matplotlib.pyplot%20as%20plt%0A%20%20%20%20import%20numpy%20as%20np%0A%20%20%20%20import%20pandas%20as%20pd%0A%20%20%20%20import%20seaborn%20as%20sns%0A%0A%20%20%20%20%23%20Sklearn%3A%20Model%20Selection%20and%20Preprocessing%0A%20%20%20%20from%20sklearn.compose%20import%20ColumnTransformer%0A%20%20%20%20from%20sklearn.model_selection%20import%20GridSearchCV%2C%20RepeatedStratifiedKFold%2C%20train_test_split%0A%20%20%20%20from%20sklearn.pipeline%20import%20Pipeline%0A%20%20%20%20from%20sklearn.preprocessing%20import%20OneHotEncoder%2C%20StandardScaler%0A%0A%20%20%20%20%23%20Sklearn%3A%20Machine%20Learning%20Models%0A%20%20%20%20from%20sklearn.ensemble%20import%20GradientBoostingClassifier%2C%20RandomForestClassifier%0A%20%20%20%20from%20sklearn.linear_model%20import%20LogisticRegression%0A%20%20%20%20from%20sklearn.neighbors%20import%20KNeighborsClassifier%0A%20%20%20%20from%20sklearn.svm%20import%20SVC%0A%0A%20%20%20%20%23%20Sklearn%3A%20Metrics%20and%20Evaluation%0A%20%20%20%20from%20sklearn.metrics%20import%20(%0A%20%20%20%20%20%20%20%20ConfusionMatrixDisplay%2C%0A%20%20%20%20%20%20%20%20classification_report%2C%0A%20%20%20%20%20%20%20%20confusion_matrix%2C%0A%20%20%20%20%20%20%20%20precision_recall_curve%2C%0A%20%20%20%20%20%20%20%20roc_auc_score%2C%0A%20%20%20%20%20%20%20%20roc_curve%2C%0A%20%20%20%20%20%20%20%20RocCurveDisplay%0A%20%20%20%20)%0A%20%20%20%20return%20(%0A%20%20%20%20%20%20%20%20ColumnTransformer%2C%0A%20%20%20%20%20%20%20%20GradientBoostingClassifier%2C%0A%20%20%20%20%20%20%20%20GridSearchCV%2C%0A%20%20%20%20%20%20%20%20LogisticRegression%2C%0A%20%20%20%20%20%20%20%20OneHotEncoder%2C%0A%20%20%20%20%20%20%20%20RandomForestClassifier%2C%0A%20%20%20%20%20%20%20%20RocCurveDisplay%2C%0A%20%20%20%20%20%20%20%20SVC%2C%0A%20%20%20%20%20%20%20%20StandardScaler%2C%0A%20%20%20%20%20%20%20%20classification_report%2C%0A%20%20%20%20%20%20%20%20confusion_matrix%2C%0A%20%20%20%20%20%20%20%20mo%2C%0A%20%20%20%20%20%20%20%20np%2C%0A%20%20%20%20%20%20%20%20pd%2C%0A%20%20%20%20%20%20%20%20plt%2C%0A%20%20%20%20%20%20%20%20roc_auc_score%2C%0A%20%20%20%20%20%20%20%20sns%2C%0A%20%20%20%20%20%20%20%20train_test_split%2C%0A%20%20%20%20)%0A%0A%0A%40app.cell%0Adef%20_(pd)%3A%0A%20%20%20%20%23%20IMPORTING%20DATA%0A%20%20%20%20heart_df%20%3D%20pd.read_csv(%22data%2Fheart_disease.csv%22)%0A%20%20%20%20return%20(heart_df%2C)%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20%23%20DATA%20DICTIONARY%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%7C%20Column%20%7C%20Description%20%7C%20Details%20%7C%0A%20%20%20%20%7C%20%3A---%20%7C%20%3A---%20%7C%20%3A---%20%7C%0A%20%20%20%20%7C%20**age**%20%7C%20Age%20%7C%20In%20years%20%7C%0A%20%20%20%20%7C%20**sex**%20%7C%20Sex%20%7C%201%20%3D%20male%3B%200%20%3D%20female%20%7C%0A%20%20%20%20%7C%20**cp**%20%7C%20Chest%20Pain%20Type%20%7C%20Types%200-3%20%7C%0A%20%20%20%20%7C%20**trestbps**%20%7C%20Resting%20Blood%20Pressure%20%7C%20In%20mmHg%20(on%20admission%20to%20the%20hospital)%20%7C%0A%20%20%20%20%7C%20**chol**%20%7C%20Serum%20Cholesterol%20%7C%20In%20mg%2Fdl%20%7C%0A%20%20%20%20%7C%20**fbs**%20%7C%20Fasting%20Blood%20Sugar%20%7C%20%3E%20120%20mg%2Fdl%20(1%20%3D%20true%3B%200%20%3D%20false)%20%7C%0A%20%20%20%20%7C%20**restecg**%20%7C%20Resting%20ECG%20Results%20%7C%20Values%200-2%20%7C%0A%20%20%20%20%7C%20**thalach**%20%7C%20Max%20Heart%20Rate%20%7C%20Maximum%20heart%20rate%20achieved%20%7C%0A%20%20%20%20%7C%20**exang**%20%7C%20Exercise%20Induced%20Angina%20%7C%201%20%3D%20yes%3B%200%20%3D%20no%20%7C%0A%20%20%20%20%7C%20**oldpeak**%20%7C%20ST%20Depression%20%7C%20ST%20depression%20induced%20by%20exercise%20relative%20to%20rest%20%7C%0A%20%20%20%20%7C%20**slope**%20%7C%20ST%20Slope%20%7C%20The%20slope%20of%20the%20peak%20exercise%20ST%20segment%20%7C%0A%20%20%20%20%7C%20**ca**%20%7C%20Major%20Vessels%20%7C%20Number%20of%20major%20vessels%20colored%20by%20flourosopy%20(0-3)%20%7C%0A%20%20%20%20%7C%20**thal**%20%7C%20Thalassemia%20%7C%201%2C3%20%3D%20normal%3B%206%20%3D%20fixed%20defect%3B%207%20%3D%20reversable%20defect%20%7C%0A%20%20%20%20%7C%20**target**%20%7C%20Diagnosis%20%7C%201%20%3D%20disease%3B%200%20%3D%20no%20disease%20%7C%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df%2C%20mo)%3A%0A%20%20%20%20%23%20LOADING%20%26%20SHOWING%20DATA%0A%20%20%20%20rows%2C%20columns%20%3D%20heart_df.shape%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%AB%80%20Heart%20Disease%20Prediction%20Project%22)%2C%0A%20%20%20%20%20%20%20%20mo.md(f%22%23%23%23%20Total%20Records%3A%20%7Brows%7D%20%7C%20Total%20Columns%3A%20%7Bcolumns%7D%22)%2C%0A%20%20%20%20%20%20%20%20mo.ui.table(%0A%20%20%20%20%20%20%20%20%20%20%20%20heart_df%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20label%3D%22Anonymous%20Patient%20Data%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20selection%3DNone%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20pagination%3DTrue%2C%0A%20%20%20%20%20%20%20%20)%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df%2C%20mo%2C%20pd)%3A%0A%20%20%20%20%23%20DATA%20QUALITY%20%26%20PROFILING%0A%20%20%20%20%23%20Check%20for%20missing%20values%20and%20duplicates%2C%20unique%20values%0A%20%20%20%20missing_values%20%3D%20heart_df.isna().sum()%0A%20%20%20%20duplicate_count%20%3D%20heart_df.duplicated().sum()%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%94%8D%20Check%20Data%20Quality%22)%2C%0A%20%20%20%20%20%20%20%20mo.md(f%22%22%22%20%23%23%23%20Duplicates%3A%20%7Bduplicate_count%7D%22%22%22)%2C%0A%20%20%20%20%20%20%20%20mo.md(f%22%22%22%20%23%23%23%20Missing%20Values%3A%20%7Bmissing_values.sum()%7D%22%22%22)%2C%0A%20%20%20%20%20%20%20%20mo.ui.table(pd.DataFrame(%7B%22Data%20Type%22%3A%20heart_df.dtypes.astype(str)%2C%20%22Unique%20Values%22%3A%20heart_df.nunique()%20%0A%20%20%20%20%7D)%2C%20selection%3DNone)%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df)%3A%0A%20%20%20%20%23%20Drop%20duplicate%20rows%0A%20%20%20%20heart_df_clean%20%3D%20heart_df.drop_duplicates()%0A%20%20%20%20return%20(heart_df_clean%2C)%0A%0A%0A%40app.cell%0Adef%20_(heart_df_clean%2C%20mo%2C%20pd%2C%20plt%2C%20sns)%3A%0A%20%20%20%20%23%20TARGET%20DISTRIBUTION%0A%20%20%20%20target_counts%20%3D%20heart_df_clean%5B'target'%5D.value_counts()%0A%20%20%20%20target_percent%20%3D%20(heart_df_clean%5B'target'%5D.value_counts(normalize%3DTrue)%20*%20100).round(2).astype(str)%20%2B%20'%25'%0A%0A%20%20%20%20target_summary%20%3D%20pd.DataFrame(%7B%0A%20%20%20%20%20%20%20%20%22Count%22%3A%20target_counts%2C%0A%20%20%20%20%20%20%20%20%22Percentage%22%3A%20target_percent%0A%20%20%20%20%7D)%0A%0A%20%20%20%20%23%20PLOT%0A%20%20%20%20fig%2C%20ax%20%3D%20plt.subplots(figsize%3D(5%2C%206))%0A%0A%20%20%20%20sns.countplot(x%3D'target'%2C%20data%3Dheart_df_clean%2C%20palette%3D%5B'%233498db'%2C%20'%23e74c3c'%5D%2C%20hue%3D'target'%2C%20ax%3Dax)%0A%0A%20%20%20%20ax.set_title(%22Visual%20Balance%20(0%20vs%201)%22)%0A%20%20%20%20ax.set_xlabel(%22Diagnosis%20(0%3DHealthy%2C%201%3DDisease)%22)%0A%20%20%20%20ax.set_ylabel(%22Count%22)%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%8E%AF%20Target%20Variable%20Distribution%22)%2C%0A%20%20%20%20%20%20%20%20mo.hstack(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20target_summary%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20fig%0A%20%20%20%20%20%20%20%20%5D%2C%20justify%3D%22start%22%2C%20gap%3D2)%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20%F0%9F%8E%AF%20Target%20Variable%20Conclusion%20%26%20Strategy%0A%20%20%20%20*%20**Balanced%20Dataset%3A**%20The%20dataset%20is%20well-balanced%20(~54%25%20Disease%20vs%20~46%25%20Healthy).%20This%20is%20excellent%20for%20model%20training.%0A%20%20%20%20*%20**No%20Bias%3A**%20We%20don't%20need%20to%20apply%20complex%20resampling%20techniques%20(like%20SMOTE).%0A%20%20%20%20*%20**Evaluation%20Metric%3A**%20Our%20primary%20focus%20is%20**Recall**.%20In%20medical%20diagnostics%2C%20minimizing%20False%20Negatives%20(missing%20a%20sick%20patient)%20is%20far%20more%20critical%20than%20overall%20Accuracy.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df_clean%2C%20mo)%3A%0A%20%20%20%20%23%20STATISTICAL%20SUMMARY%0A%20%20%20%20stats%20%3D%20heart_df_clean.describe().T.round(2)%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%93%8A%20Statistical%20Overview%22)%2C%0A%20%20%20%20%20%20%20%20mo.ui.table(stats%2C%20selection%3DNone)%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20%F0%9F%93%8A%20Stats%0A%20%20%20%20*%20**Demographics%3A**%20The%20average%20patient%20age%20is%20**~54%20years**%2C%20ranging%20from%2029%20to%2077.%0A%20%20%20%20*%20**Gender%20Imbalance%3A**%20The%20mean%20of%20%60sex%60%20is%20**0.68**%2C%20indicating%20that%20approximately%20**68%25**%20of%20the%20dataset%20consists%20of%20male%20patients%20(assuming%201%20%3D%20Male).%0A%20%20%20%20*%20**Potential%20Outliers%3A**%0A%20%20%20%20%20%20*%20**Cholesterol%20(%60chol%60)%3A**%20The%20max%20value%20is%20**564%20mg%2Fdl**%2C%20which%20is%20extremely%20high%20compared%20to%20the%20mean%20(246).%0A%20%20%20%20%20%20*%20**Blood%20Pressure%20(%60trestbps%60)%3A**%20The%20max%20value%20reaches%20**200%20mm%20Hg**%2C%20indicating%20hypertensive%20crisis%20cases.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df_clean%2C%20mo%2C%20plt%2C%20sns)%3A%0A%20%20%20%20%23%20Correlation%20Matrix%0A%20%20%20%20corr_matrix%20%3D%20heart_df_clean.corr()%0A%0A%20%20%20%20_fig%2C%20_ax%20%3D%20plt.subplots(figsize%3D(10%2C%207))%0A%0A%20%20%20%20sns.heatmap(%0A%20%20%20%20%20%20%20%20corr_matrix%2C%20%0A%20%20%20%20%20%20%20%20annot%3DTrue%2C%20%20%20%20%20%20%20%20%20%0A%20%20%20%20%20%20%20%20fmt%3D%22.2f%22%2C%20%20%20%20%20%20%20%20%20%20%20%0A%20%20%20%20%20%20%20%20cmap%3D%22coolwarm%22%2C%20%20%20%20%20%0A%20%20%20%20%20%20%20%20linewidths%3D0.5%2C%0A%20%20%20%20%20%20%20%20ax%3D_ax%20%0A%20%20%20%20)%0A%0A%20%20%20%20_ax.set_title(%22Correlation%20Matrix%20of%20Heart%20Disease%20Features%22%2C%20pad%3D20%2C%20fontsize%3D14%2C%20weight%3D'bold')%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%8C%A1%EF%B8%8F%20Feature%20Correlation%20Analysis%22)%2C%0A%20%20%20%20%20%20%20%20_fig%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20%F0%9F%8C%A1%EF%B8%8F%20Correlation%20Matrix%0A%20%20%20%20*%20**Strongest%20Positive%20Features%3A**%20%60cp%60%20(Chest%20Pain%2C%20**0.43**)%20and%20%60thalach%60%20(Max%20Heart%20Rate%2C%20**0.42**)%20show%20the%20highest%20positive%20correlation%20with%20the%20target.%20As%20these%20values%20increase%2C%20the%20likelihood%20of%20heart%20disease%20increases.%0A%20%20%20%20*%20**Strongest%20Negative%20Features%3A**%20%60exang%60%20(Exercise%20Induced%20Angina%2C%20**-0.44**)%20and%20%60oldpeak%60%20(ST%20Depression%2C%20**-0.43**)%20have%20the%20strongest%20inverse%20relationship.%0A%20%20%20%20*%20**Multicollinearity%3A**%20Notice%20the%20strong%20correlation%20between%20%60slope%60%20and%20%60oldpeak%60%20(**-0.58**).%20This%20indicates%20some%20redundancy%20between%20these%20features%2C%20but%20generally%2C%20the%20features%20are%20well-distributed.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df_clean%2C%20mo%2C%20pd%2C%20plt%2C%20sns)%3A%0A%20%20%20%20%23%20Target%20By%20Sex%0A%20%20%20%20_counts%20%3D%20pd.crosstab(heart_df_clean%5B'sex'%5D%2C%20heart_df_clean%5B'target'%5D)%0A%20%20%20%20_percs%20%3D%20pd.crosstab(heart_df_clean%5B'sex'%5D%2C%20heart_df_clean%5B'target'%5D%2C%20normalize%3D'index')%20*%20100%0A%0A%20%20%20%20sex_target_summary%20%3D%20_counts.astype(str)%20%2B%20%22%20(%22%20%2B%20_percs.round(2).astype(str)%20%2B%20%22%25)%22%0A%20%20%20%20sex_target_summary%5B'Total%20(N)'%5D%20%3D%20_counts.sum(axis%3D1)%0A%0A%20%20%20%20sex_target_summary.index%20%3D%20%5B'Female%20(0)'%2C%20'Male%20(1)'%5D%0A%20%20%20%20sex_target_summary.columns%20%3D%20%5B'Healthy%20(0)'%2C%20'Disease%20(1)'%2C%20'Total%20(N)'%5D%0A%0A%20%20%20%20_fig%2C%20_ax%20%3D%20plt.subplots(figsize%3D(7%2C%206))%0A%0A%20%20%20%20sns.countplot(%0A%20%20%20%20%20%20%20%20x%3D'sex'%2C%20%0A%20%20%20%20%20%20%20%20hue%3D'target'%2C%20%0A%20%20%20%20%20%20%20%20data%3Dheart_df_clean%2C%0A%20%20%20%20%20%20%20%20palette%3D%5B'%233498db'%2C%20'%23e74c3c'%5D%2C%20%0A%20%20%20%20%20%20%20%20ax%3D_ax%20%0A%20%20%20%20)%0A%0A%20%20%20%20_ax.bar_label(_ax.containers%5B0%5D%2C%20labels%3Dsex_target_summary%5B'Healthy%20(0)'%5D%2C%20padding%3D3)%0A%20%20%20%20_ax.bar_label(_ax.containers%5B1%5D%2C%20labels%3Dsex_target_summary%5B'Disease%20(1)'%5D%2C%20padding%3D3)%0A%0A%20%20%20%20_ax.set_title(%22Heart%20Disease%20Frequency%20by%20Sex%22%2C%20pad%3D15%2C%20weight%3D'bold')%0A%20%20%20%20_ax.set_xlabel(%22Sex%20(0%20%3D%20Female%2C%201%20%3D%20Male)%22)%0A%20%20%20%20_ax.set_ylabel(%22Amount%20of%20Patients%22)%0A%20%20%20%20_ax.legend(%5B%22Healthy%22%2C%20%22Disease%22%5D%2C%20title%3D%22Target%22)%0A%20%20%20%20sns.despine(ax%3D_ax)%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%9A%BB%20Heart%20Disease%20vs%20Sex%22)%2C%0A%20%20%20%20%20%20%20%20mo.hstack(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(sex_target_summary%2C%20selection%3DNone)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20_fig%0A%20%20%20%20%20%20%20%20%5D%2C%20justify%3D%22start%22%2C%20gap%3D4)%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20%F0%9F%9A%BB%20Heart%20Disease%20vs%20Sex%0A%20%20%20%20*%20**Demographic%20Imbalance%3A**%20The%20dataset%20is%20heavily%20skewed%20towards%20males%20(**206%20males**%20vs%20**96%20females**).%0A%20%20%20%20*%20**High%20Risk%20in%20Females%3A**%20Interestingly%2C%20**75%25%20of%20females**%20in%20this%20dataset%20have%20heart%20disease%20(72%20out%20of%2096).%20This%20suggests%20that%20if%20a%20patient%20is%20female%20in%20this%20specific%20dataset%2C%20the%20probability%20of%20diagnosis%20is%20very%20high.%0A%20%20%20%20*%20**Male%20Distribution%3A**%20Males%20are%20more%20evenly%20distributed%2C%20with%20a%20slight%20lean%20towards%20being%20healthy%20(**~55%25%20healthy**%20vs%20**~45%25%20disease**).%0A%20%20%20%20*%20**Conclusion%3A**%20Sex%20is%20a%20crucial%20feature.%20The%20model%20will%20likely%20learn%20that%20being%20female%20increases%20the%20probability%20of%20a%20positive%20diagnosis%20in%20this%20specific%20context.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df_clean%2C%20mo%2C%20pd%2C%20plt%2C%20sns)%3A%0A%20%20%20%20%23%20CHEST%20PAIN%20(CP)%20vs%20TARGET%0A%20%20%20%20_cp_counts%20%3D%20pd.crosstab(heart_df_clean%5B'cp'%5D%2C%20heart_df_clean%5B'target'%5D)%0A%20%20%20%20_cp_percs%20%3D%20pd.crosstab(heart_df_clean%5B'cp'%5D%2C%20heart_df_clean%5B'target'%5D%2C%20normalize%3D'index')%20*%20100%0A%0A%20%20%20%20_label_healthy%20%3D%20_cp_counts%5B0%5D.astype(str)%20%2B%20%22%20(%22%20%2B%20_cp_percs%5B0%5D.round(1).astype(str)%20%2B%20%22%25)%22%0A%20%20%20%20_label_disease%20%3D%20_cp_counts%5B1%5D.astype(str)%20%2B%20%22%20(%22%20%2B%20_cp_percs%5B1%5D.round(1).astype(str)%20%2B%20%22%25)%22%0A%0A%20%20%20%20cp_summary_df%20%3D%20pd.DataFrame(%7B%0A%20%20%20%20%20%20%20%20%22Healthy%20(0)%22%3A%20_label_healthy%2C%0A%20%20%20%20%20%20%20%20%22Disease%20(1)%22%3A%20_label_disease%2C%0A%20%20%20%20%20%20%20%20%22Total%20(N)%22%3A%20_cp_counts.sum(axis%3D1)%0A%20%20%20%20%7D)%0A%20%20%20%20cp_summary_df.index%20%3D%20%5B'Typical%20Angina%20(0)'%2C%20'Atypical%20Angina%20(1)'%2C%20'Non-anginal%20(2)'%2C%20'Asymptomatic%20(3)'%5D%0A%0A%20%20%20%20_fig%2C%20_ax%20%3D%20plt.subplots(figsize%3D(9%2C%208))%0A%20%20%20%20sns.countplot(%0A%20%20%20%20%20%20%20%20x%3D'cp'%2C%20%0A%20%20%20%20%20%20%20%20hue%3D'target'%2C%20%0A%20%20%20%20%20%20%20%20data%3Dheart_df_clean%2C%0A%20%20%20%20%20%20%20%20palette%3D%5B'%239b59b6'%2C%20'%23e74c3c'%5D%2C%0A%20%20%20%20%20%20%20%20ax%3D_ax%20%20%20%20%20%20%20%20%20%20%20%20%20%20%0A%20%20%20%20)%0A%0A%20%20%20%20_ax.bar_label(_ax.containers%5B0%5D%2C%20labels%3D_label_healthy%2C%20padding%3D3)%0A%20%20%20%20_ax.bar_label(_ax.containers%5B1%5D%2C%20labels%3D_label_disease%2C%20padding%3D3)%0A%0A%20%20%20%20_ax.set_title(%22Heart%20Disease%20Rate%20by%20Chest%20Pain%20Type%22%2C%20pad%3D15%2C%20weight%3D'bold')%0A%20%20%20%20_ax.set_xlabel(%22Chest%20Pain%20Type%22)%0A%20%20%20%20_ax.set_ylabel(%22Count%20of%20Patients%22)%0A%20%20%20%20_ax.legend(%5B%22Healthy%22%2C%20%22Disease%22%5D%2C%20title%3D%22Target%22%2C%20loc%3D'upper%20left')%0A%20%20%20%20_ax.set_ylim(0%2C%20130)%20%0A%20%20%20%20sns.despine(ax%3D_ax)%20%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%AB%80%20Chest%20Pain%20vs%20Target%20(Diagnosis)%22)%2C%0A%20%20%20%20%20%20%20%20mo.hstack(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(cp_summary_df%2C%20selection%3DNone)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20_fig%0A%20%20%20%20%20%20%20%20%5D%2C%20justify%3D%22start%22%2C%20gap%3D4)%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20%F0%9F%AB%80%20Chest%20Pain%20Type%0A%20%20%20%20*%20**The%20%22Typical%22%20Paradox%3A**%20Surprisingly%2C%20**Type%200%20(Typical%20Angina)**%20is%20the%20safest%20category.%20**72.7%25**%20of%20patients%20with%20this%20pain%20type%20are%20healthy.%20This%20is%20a%20crucial%20insight%3A%20having%20%22typical%22%20pain%20doesn't%20guarantee%20heart%20disease%20in%20this%20dataset.%0A%20%20%20%20*%20**The%20Danger%20Zone%20(Type%202)%3A**%20**Type%202%20(Non-anginal%20pain)**%20is%20a%20massive%20red%20flag.%20Out%20of%2086%20patients%2C%20**79.1%25**%20have%20heart%20disease.%20This%20will%20be%20a%20dominant%20predictor%20for%20the%20model.%0A%20%20%20%20*%20**High%20Risk%20in%20Types%201%20%26%203%3A**%20Types%201%20and%203%20also%20show%20very%20high%20disease%20rates%20(82%25%20and%2069.6%25%20respectively)%2C%20making%20any%20pain%20type%20*other%20than%200*%20a%20strong%20indicator%20of%20risk.%0A%20%20%20%20*%20**Conclusion%3A**%20This%20variable%20provides%20excellent%20%22Separability%22.%20If%20%24cp%20%3E%200%24%2C%20the%20risk%20skyrockets.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df_clean%2C%20mo%2C%20plt%2C%20sns)%3A%0A%20%20%20%20%23%20THALACH%20vs%20Target%0A%20%20%20%20_thalach_stats%20%3D%20heart_df_clean.groupby('target')%5B'thalach'%5D.describe()%5B%5B'count'%2C%20'mean'%2C%20'50%25'%2C%20'max'%5D%5D%0A%0A%20%20%20%20_thalach_stats.columns%20%3D%20%5B'Count'%2C%20'Mean%20(Average)'%2C%20'Median'%2C%20'Max%20Rate'%5D%0A%20%20%20%20_thalach_stats.index%20%3D%20%5B'Healthy%20(0)'%2C%20'Disease%20(1)'%5D%0A%20%20%20%20thalach_stats_df%20%3D%20_thalach_stats.round(1)%0A%0A%20%20%20%20_fig%2C%20_ax%20%3D%20plt.subplots(figsize%3D(10%2C%206))%0A%0A%20%20%20%20sns.kdeplot(%0A%20%20%20%20%20%20%20%20x%3D'thalach'%2C%20%0A%20%20%20%20%20%20%20%20hue%3D'target'%2C%20%0A%20%20%20%20%20%20%20%20data%3Dheart_df_clean%2C%0A%20%20%20%20%20%20%20%20fill%3DTrue%2C%20%0A%20%20%20%20%20%20%20%20palette%3D%5B'%232ecc71'%2C%20'%23e74c3c'%5D%2C%0A%20%20%20%20%20%20%20%20common_norm%3DFalse%2C%20%0A%20%20%20%20%20%20%20%20alpha%3D0.4%2C%0A%20%20%20%20%20%20%20%20ax%3D_ax%20%20%20%20%20%20%20%20%20%20%20%20%20%20%0A%20%20%20%20)%0A%0A%20%20%20%20_ax.axvline(thalach_stats_df.loc%5B'Healthy%20(0)'%2C%20'Mean%20(Average)'%5D%2C%20color%3D'%2327ae60'%2C%20linestyle%3D'--'%2C%20label%3D'Healthy%20Mean'%2C%20linewidth%3D2)%0A%20%20%20%20_ax.axvline(thalach_stats_df.loc%5B'Disease%20(1)'%2C%20'Mean%20(Average)'%5D%2C%20color%3D'%23c0392b'%2C%20linestyle%3D'--'%2C%20label%3D'Disease%20Mean'%2C%20linewidth%3D2)%0A%0A%20%20%20%20_ax.set_title(%22Distribution%20of%20Max%20Heart%20Rate%20(thalach)%20by%20Diagnosis%22%2C%20pad%3D15%2C%20weight%3D'bold')%0A%20%20%20%20_ax.set_xlabel(%22Maximum%20Heart%20Rate%20Achieved%22)%0A%20%20%20%20_ax.set_ylabel(%22Density%20(Probability)%22)%0A%20%20%20%20_ax.legend()%0A%20%20%20%20sns.despine(ax%3D_ax)%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%92%93%20Max%20Heart%20Rate%20(thalach)%20Analysis%22)%2C%0A%20%20%20%20%20%20%20%20mo.hstack(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(thalach_stats_df%2C%20selection%3DNone)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20_fig%0A%20%20%20%20%20%20%20%20%5D%2C%20justify%3D%22start%22%2C%20gap%3D4)%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20%F0%9F%92%93%20Max%20Heart%20Rate%20(thalach)%0A%20%20%20%20*%20**Clear%20Separation%3A**%20There%20is%20a%20distinct%20difference%20between%20the%20two%20groups.%20The%20distributions%20(bumps)%20are%20far%20apart%2C%20which%20makes%20%60thalach%60%20an%20excellent%20predictor.%0A%20%20%20%20*%20**The%20Trend%3A**%20Patients%20in%20the%20**Disease%20(1)**%20group%20tend%20to%20have%20a%20**significantly%20higher**%20maximum%20heart%20rate%20(Mean%3A%20**158.4**)%20compared%20to%20the%20**Healthy%20(0)**%20group%20(Mean%3A%20**139.1**).%0A%20%20%20%20*%20**Correlation%20Confirmation%3A**%20This%20aligns%20with%20the%20correlation%20matrix%20(%24r%3D0.42%24)%2C%20showing%20that%20a%20higher%20heart%20rate%20is%20positively%20associated%20with%20the%20target%20diagnosis%20in%20this%20specific%20dataset.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df_clean%2C%20mo%2C%20np%2C%20plt%2C%20sns)%3A%0A%20%20%20%20%23%20AGE%20vs%20THALACH%0A%20%20%20%20_fig%2C%20_ax%20%3D%20plt.subplots(figsize%3D(10%2C%206))%0A%0A%20%20%20%20sns.scatterplot(%0A%20%20%20%20%20%20%20%20x%3D'age'%2C%20%0A%20%20%20%20%20%20%20%20y%3D'thalach'%2C%20%0A%20%20%20%20%20%20%20%20data%3Dheart_df_clean%2C%0A%20%20%20%20%20%20%20%20hue%3D'target'%2C%20%0A%20%20%20%20%20%20%20%20palette%3D%5B'%233498db'%2C%20'%23e74c3c'%5D%2C%0A%20%20%20%20%20%20%20%20alpha%3D0.7%2C%20%0A%20%20%20%20%20%20%20%20s%3D70%2C%0A%20%20%20%20%20%20%20%20ax%3D_ax%20%20%20%20%20%20%20%20%20%20%20%0A%20%20%20%20)%0A%0A%20%20%20%20_x_points%20%3D%20np.linspace(heart_df_clean%5B'age'%5D.min()%2C%20heart_df_clean%5B'age'%5D.max()%2C%20100)%0A%20%20%20%20_y_points%20%3D%20220%20-%20_x_points%0A%20%20%20%20_ax.plot(_x_points%2C%20_y_points%2C%20color%3D'grey'%2C%20linestyle%3D'--'%2C%20label%3D'Theoretical%20Max%20(220-Age)'%2C%20alpha%3D0.6%2C%20linewidth%3D2)%0A%0A%20%20%20%20_ax.set_title(%22Age%20vs%20Max%20Heart%20Rate%3A%20The%20Impact%20of%20Disease%22%2C%20pad%3D15%2C%20weight%3D'bold')%0A%20%20%20%20_ax.set_xlabel(%22Age%20(Years)%22)%0A%20%20%20%20_ax.set_ylabel(%22Max%20Heart%20Rate%20(thalach)%22)%0A%0A%20%20%20%20_handles%2C%20_labels%20%3D%20_ax.get_legend_handles_labels()%0A%0A%20%20%20%20_clean_labels%20%3D%20%5B'Healthy%20(0)'%20if%20l%20%3D%3D%20'0'%20else%20'Disease%20(1)'%20if%20l%20%3D%3D%20'1'%20else%20l%20for%20l%20in%20_labels%5D%0A%0A%20%20%20%20_ax.legend(_handles%2C%20_clean_labels%2C%20bbox_to_anchor%3D(1.02%2C%200.5)%2C%20loc%3D'center%20left'%2C%20borderaxespad%3D0.)%0A%0A%20%20%20%20sns.despine(ax%3D_ax)%0A%20%20%20%20_fig.tight_layout()%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%93%89%20Age%20vs%20Heart%20Rate%20(Domain%20Knowledge)%22)%2C%0A%20%20%20%20%20%20%20%20_fig%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20%F0%9F%93%89%20Age%20vs%20Max%20Heart%20Rate%0A%20%20%20%20*%20**Natural%20Decline%3A**%20The%20plot%20clearly%20shows%20that%20as%20**Age%20increases**%2C%20the%20**Max%20Heart%20Rate%20decreases**.%20This%20follows%20the%20natural%20physiological%20trend%20(roughly%20%24220%20-%20Age%24).%0A%20%20%20%20*%20**The%20%22Risk%20Layer%22%3A**%20Notice%20the%20vertical%20separation.%20The%20**Disease%20(Red)**%20points%20tend%20to%20be%20positioned%20**higher**%20than%20the%20Healthy%20(Blue)%20points%20across%20most%20ages.%0A%20%20%20%20*%20**Combined%20Power%3A**%20While%20Age%20alone%20had%20a%20lot%20of%20overlap%20(as%20seen%20in%20the%20KDE%20plot)%2C%20combining%20it%20with%20Heart%20Rate%20reveals%20a%20clearer%20pattern.%20A%2060-year-old%20with%20a%20heart%20rate%20of%20170%20is%20much%20more%20likely%20to%20be%20in%20the%20%22Disease%22%20group%20than%20a%2060-year-old%20with%20a%20heart%20rate%20of%20130.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(heart_df_clean%2C%20mo%2C%20plt%2C%20sns)%3A%0A%20%20%20%20%23%20AGE%20vs%20target%0A%20%20%20%20_age_stats%20%3D%20heart_df_clean.groupby('target')%5B'age'%5D.describe()%5B%5B'count'%2C%20'mean'%2C%20'50%25'%2C%20'max'%5D%5D%0A%0A%20%20%20%20_age_stats.columns%20%3D%20%5B'Count'%2C%20'Mean%20Age'%2C%20'Median'%2C%20'Oldest'%5D%0A%20%20%20%20_age_stats.index%20%3D%20%5B'Healthy%20(0)'%2C%20'Disease%20(1)'%5D%0A%20%20%20%20age_stats_df%20%3D%20_age_stats.round(1)%0A%0A%20%20%20%20_fig%2C%20_ax%20%3D%20plt.subplots(figsize%3D(10%2C%206))%0A%0A%20%20%20%20sns.kdeplot(%0A%20%20%20%20%20%20%20%20x%3D'age'%2C%20%0A%20%20%20%20%20%20%20%20hue%3D'target'%2C%20%0A%20%20%20%20%20%20%20%20data%3Dheart_df_clean%2C%0A%20%20%20%20%20%20%20%20fill%3DTrue%2C%20%0A%20%20%20%20%20%20%20%20palette%3D%5B'%232ecc71'%2C%20'%23e74c3c'%5D%2C%0A%20%20%20%20%20%20%20%20common_norm%3DFalse%2C%20%0A%20%20%20%20%20%20%20%20alpha%3D0.4%2C%0A%20%20%20%20%20%20%20%20ax%3D_ax%20%20%20%20%20%20%20%20%20%0A%20%20%20%20)%0A%0A%20%20%20%20_ax.axvline(age_stats_df.loc%5B'Healthy%20(0)'%2C%20'Mean%20Age'%5D%2C%20color%3D'%2327ae60'%2C%20linestyle%3D'--'%2C%20label%3D'Healthy%20Mean'%2C%20linewidth%3D2)%0A%20%20%20%20_ax.axvline(age_stats_df.loc%5B'Disease%20(1)'%2C%20'Mean%20Age'%5D%2C%20color%3D'%23c0392b'%2C%20linestyle%3D'--'%2C%20label%3D'Disease%20Mean'%2C%20linewidth%3D2)%0A%0A%20%20%20%20_ax.set_title(%22Age%20Distribution%20by%20Diagnosis%22%2C%20pad%3D15%2C%20weight%3D'bold')%0A%20%20%20%20_ax.set_xlabel(%22Age%20(Years)%22)%0A%20%20%20%20_ax.set_ylabel(%22Density%22)%0A%20%20%20%20_ax.legend()%20%0A%20%20%20%20sns.despine(ax%3D_ax)%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20%F0%9F%8E%82%20Age%20Distribution%20Analysis%22)%2C%0A%20%20%20%20%20%20%20%20mo.hstack(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(age_stats_df%2C%20selection%3DNone)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20_fig%0A%20%20%20%20%20%20%20%20%5D%2C%20justify%3D%22start%22%2C%20gap%3D4)%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20%F0%9F%8E%82%20Age%20Distribution%20vs%20Diagnosis%0A%20%20%20%20*%20**Significant%20Overlap%3A**%20Unlike%20%22Chest%20Pain%22%20or%20%22Heart%20Rate%22%2C%20the%20age%20distributions%20for%20Healthy%20and%20Disease%20groups%20overlap%20significantly.%20This%20means%20**Age%20alone%20is%20not%20a%20strong%20separator**.%0A%20%20%20%20*%20**The%20Shift%3A**%20However%2C%20there%20is%20a%20visible%20trend%3A%20the%20**Disease%20(Red)**%20curve%20is%20shifted%20to%20the%20**right**.%20The%20peak%20risk%20appears%20around%20**58-60%20years%20old**%2C%20whereas%20the%20healthy%20group%20peaks%20younger%20(~52-54).%0A%20%20%20%20*%20**The%20%22Confusion%20Zone%22%3A**%20Between%20ages%20**50%20and%2065**%2C%20the%20probability%20is%20quite%20mixed.%20The%20model%20will%20need%20other%20features%20(like%20%60thalach%60%20or%20%60cp%60)%20to%20make%20a%20confident%20decision%20in%20this%20age%20range.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20ColumnTransformer%2C%0A%20%20%20%20OneHotEncoder%2C%0A%20%20%20%20StandardScaler%2C%0A%20%20%20%20heart_df_clean%2C%0A%20%20%20%20mo%2C%0A%20%20%20%20train_test_split%2C%0A)%3A%0A%20%20%20%20%23%20Data%20Preprocessing%20Pipeline%3A%20Train%2FTest%20Split%2C%20Scaling%2C%20One-Hot%20Encoding%20(Preventing%20Data%20Leakage)%0A%20%20%20%20X%20%3D%20heart_df_clean.drop('target'%2C%20axis%3D1)%0A%20%20%20%20y%20%3D%20heart_df_clean%5B'target'%5D%0A%0A%20%20%20%20X_train%2C%20X_test%2C%20y_train%2C%20y_test%20%3D%20train_test_split(%0A%20%20%20%20%20%20%20%20X%2C%20y%2C%20%0A%20%20%20%20%20%20%20%20test_size%3D0.2%2C%20%0A%20%20%20%20%20%20%20%20random_state%3D42%2C%20%0A%20%20%20%20%20%20%20%20stratify%3Dy%20%0A%20%20%20%20)%0A%0A%20%20%20%20numeric_features%20%3D%20%5B'age'%2C%20'trestbps'%2C%20'chol'%2C%20'thalach'%2C%20'oldpeak'%5D%0A%20%20%20%20categorical_features%20%3D%20%5B'cp'%2C%20'restecg'%2C%20'slope'%2C%20'ca'%2C%20'thal'%5D%0A%20%20%20%20binary_features%20%3D%20%5B'sex'%2C%20'fbs'%2C%20'exang'%5D%20%0A%0A%20%20%20%20%23%20ColumnTransformer%0A%20%20%20%20preprocessor%20%3D%20ColumnTransformer(%0A%20%20%20%20%20%20%20%20transformers%3D%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20('num'%2C%20StandardScaler()%2C%20numeric_features)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20('cat'%2C%20OneHotEncoder(drop%3D'first'%2C%20handle_unknown%3D'ignore')%2C%20categorical_features)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20('bin'%2C%20'passthrough'%2C%20binary_features)%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%0A%20%20%20%20%23%20Fit%20only%20on%20Train!%0A%20%20%20%20X_train_scaled%20%3D%20preprocessor.fit_transform(X_train)%0A%20%20%20%20X_test_scaled%20%3D%20preprocessor.transform(X_test)%0A%0A%20%20%20%20mo.md(f%22%22%22%0A%20%20%20%20%23%23%23%20%E2%9A%99%EF%B8%8F%20Data%20Preprocessing%20Pipeline!%0A%20%20%20%20*%20**Steps%3A**%20Train%2FTest%20Split%20-%3E%20StandardScaler%20-%3E%20OneHotEncoder%0A%20%20%20%20*%20**Training%20Data%3A**%20%7BX_train_scaled.shape%7D%0A%20%20%20%20*%20**Testing%20Data%3A**%20%7BX_test_scaled.shape%7D%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%20X_test_scaled%2C%20X_train_scaled%2C%20preprocessor%2C%20y_test%2C%20y_train%0A%0A%0A%40app.cell%0Adef%20_(classification_report%2C%20confusion_matrix%2C%20mo%2C%20plt%2C%20roc_auc_score%2C%20sns)%3A%0A%20%20%20%20%23%20Create%20a%20universal%20function%20to%20evaluate%20any%20model!%0A%20%20%20%20def%20evaluate_model(model%2C%20name%2C%20X_train%2C%20X_test%2C%20y_train%2C%20y_test)%3A%0A%20%20%20%20%20%20%20%20%23%20Training%20(Fit)%0A%20%20%20%20%20%20%20%20model.fit(X_train%2C%20y_train)%0A%0A%20%20%20%20%20%20%20%20%23%20Predict%20on%20Test%20set%0A%20%20%20%20%20%20%20%20y_pred%20%3D%20model.predict(X_test)%0A%20%20%20%20%20%20%20%20y_prob%20%3D%20model.predict_proba(X_test)%5B%3A%2C%201%5D%0A%0A%20%20%20%20%20%20%20%20%23%20Calculate%20statistics%0A%20%20%20%20%20%20%20%20roc_auc%20%3D%20roc_auc_score(y_test%2C%20y_prob)%0A%20%20%20%20%20%20%20%20report%20%3D%20classification_report(y_test%2C%20y_pred)%0A%0A%20%20%20%20%20%20%20%20%23%20Confusion%20Matrix%0A%20%20%20%20%20%20%20%20cm%20%3D%20confusion_matrix(y_test%2C%20y_pred)%0A%20%20%20%20%20%20%20%20_fig%2C%20_ax%20%3D%20plt.subplots(figsize%3D(8%2C%206))%0A%20%20%20%20%20%20%20%20sns.heatmap(cm%2C%20annot%3DTrue%2C%20fmt%3D'd'%2C%20cmap%3D'Blues'%2C%20cbar%3DFalse%2C%20ax%3D_ax%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20xticklabels%3D%5B'Predicted%20Healthy%20(0)'%2C%20'Predicted%20Disease%20(1)'%5D%2C%20%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20yticklabels%3D%5B'Actual%20Healthy%20(0)'%2C%20'Actual%20Disease%20(1)'%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20annot_kws%3D%7B%22size%22%3A%2014%2C%20%22weight%22%3A%20%22bold%22%7D)%0A%20%20%20%20%20%20%20%20_ax.set_title(f'Confusion%20Matrix%3A%20%7Bname%7D'%2C%20pad%3D15%2C%20weight%3D'bold')%0A%20%20%20%20%20%20%20%20plt.tight_layout()%0A%0A%20%20%20%20%20%20%20%20%23%20Marimo%20UI%0A%20%20%20%20%20%20%20%20return%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(f%22%23%23%20Model%20Evaluation%3A%20%7Bname%7D%22)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(f%22**ROC-AUC%20Score%3A**%20%60%7Broc_auc%3A.3f%7D%60%20Closer%20to%201%20is%20better%22)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(f%22%23%23%23%20%F0%9F%93%8A%20Classification%20Report%3A%5Cn%60%60%60text%5Cn%7Breport%7D%5Cn%60%60%60%22)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20_fig%0A%20%20%20%20%20%20%20%20%5D)%0A%20%20%20%20return%20(evaluate_model%2C)%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20LogisticRegression%2C%0A%20%20%20%20X_test_scaled%2C%0A%20%20%20%20X_train_scaled%2C%0A%20%20%20%20evaluate_model%2C%0A%20%20%20%20y_test%2C%0A%20%20%20%20y_train%2C%0A)%3A%0A%20%20%20%20%23%20Logistic%20Regression%0A%20%20%20%20log_reg%20%3D%20LogisticRegression(random_state%3D42%2C%20class_weight%3D'balanced')%0A%0A%20%20%20%20%23%20Call%20the%20func%0A%20%20%20%20log_reg_results%20%3D%20evaluate_model(%0A%20%20%20%20%20%20%20%20model%3Dlog_reg%2C%20%0A%20%20%20%20%20%20%20%20name%3D%22Logistic%20Regression%20(Baseline)%22%2C%20%0A%20%20%20%20%20%20%20%20X_train%3DX_train_scaled%2C%20%0A%20%20%20%20%20%20%20%20X_test%3DX_test_scaled%2C%20%0A%20%20%20%20%20%20%20%20y_train%3Dy_train%2C%20%0A%20%20%20%20%20%20%20%20y_test%3Dy_test%0A%20%20%20%20)%0A%0A%20%20%20%20log_reg_results%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20GradientBoostingClassifier%2C%0A%20%20%20%20RandomForestClassifier%2C%0A%20%20%20%20SVC%2C%0A%20%20%20%20X_test_scaled%2C%0A%20%20%20%20X_train_scaled%2C%0A%20%20%20%20evaluate_model%2C%0A%20%20%20%20mo%2C%0A%20%20%20%20y_test%2C%0A%20%20%20%20y_train%2C%0A)%3A%0A%20%20%20%20%23%20Random%20Forest%0A%20%20%20%20rf_model%20%3D%20RandomForestClassifier(random_state%3D42%2C%20class_weight%3D'balanced')%0A%20%20%20%20rf_results%20%3D%20evaluate_model(%0A%20%20%20%20%20%20%20%20model%3Drf_model%2C%20%0A%20%20%20%20%20%20%20%20name%3D%22Random%20Forest%22%2C%20%0A%20%20%20%20%20%20%20%20X_train%3DX_train_scaled%2C%20X_test%3DX_test_scaled%2C%20y_train%3Dy_train%2C%20y_test%3Dy_test%0A%20%20%20%20)%0A%0A%20%20%20%20%23%20Support%20Vector%20Machine%20(SVC)%0A%20%20%20%20svm_model%20%3D%20SVC(probability%3DTrue%2C%20random_state%3D42%2C%20class_weight%3D'balanced')%0A%20%20%20%20svm_results%20%3D%20evaluate_model(%0A%20%20%20%20%20%20%20%20model%3Dsvm_model%2C%20%0A%20%20%20%20%20%20%20%20name%3D%22Support%20Vector%20Machine%20(SVC)%22%2C%20%0A%20%20%20%20%20%20%20%20X_train%3DX_train_scaled%2C%20X_test%3DX_test_scaled%2C%20y_train%3Dy_train%2C%20y_test%3Dy_test%0A%20%20%20%20)%0A%0A%20%20%20%20%23%20Gradient%20Boosting%20%0A%20%20%20%20gb_model%20%3D%20GradientBoostingClassifier(random_state%3D42)%0A%20%20%20%20gb_results%20%3D%20evaluate_model(%0A%20%20%20%20%20%20%20%20model%3Dgb_model%2C%20%0A%20%20%20%20%20%20%20%20name%3D%22Gradient%20Boosting%22%2C%20%0A%20%20%20%20%20%20%20%20X_train%3DX_train_scaled%2C%20X_test%3DX_test_scaled%2C%20y_train%3Dy_train%2C%20y_test%3Dy_test%0A%20%20%20%20)%0A%0A%20%20%20%20%23%20All%20three%20model%20results%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20Baseline%20Comparison%22)%2C%0A%20%20%20%20%20%20%20%20rf_results%2C%0A%20%20%20%20%20%20%20%20mo.md(%22---%22)%2C%0A%20%20%20%20%20%20%20%20svm_results%2C%0A%20%20%20%20%20%20%20%20mo.md(%22---%22)%2C%0A%20%20%20%20%20%20%20%20gb_results%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20Baseline%20Conclusion%3A%20The%20Power%20of%20Simplicity%0A%20%20%20%20*%20**Observation%3A**%20Against%20expectations%2C%20the%20simplest%20model%20(Logistic%20Regression)%20outperformed%20complex%20ensemble%20methods%20(Random%20Forest%2C%20Gradient%20Boosting)%20and%20SVC%20out-of-the-box%2C%20achieving%20the%20highest%20Recall%20(82%25)%20and%20ROC-AUC%20(0.887).%0A%20%20%20%20*%20**The%20%22Why%22%20(Occam's%20Razor)%3A**%20Our%20dataset%20is%20relatively%20small%20(~300%20records).%20In%20such%20cases%2C%20highly%20complex%20models%20tend%20to%20overfit%20the%20training%20data%20and%20generalize%20poorly%20on%20unseen%20data%20when%20using%20default%20parameters.%20Linear%20models%20like%20Logistic%20Regression%20are%20much%20more%20robust%20and%20stable%20here.%0A%20%20%20%20*%20**Next%20Step%20(Hypothesis)%3A**%20Can%20hyperparameter%20tuning%20%22wake%20up%22%20the%20complex%20models%3F%20We%20will%20now%20use%20%60GridSearchCV%60%20to%20optimize%20both%20Logistic%20Regression%20and%20Random%20Forest%20specifically%20for%20**Recall**%2C%20to%20see%20if%20we%20can%20catch%20those%20remaining%206%20false%20negative%20patients.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(%0A%20%20%20%20GridSearchCV%2C%0A%20%20%20%20LogisticRegression%2C%0A%20%20%20%20RandomForestClassifier%2C%0A%20%20%20%20X_test_scaled%2C%0A%20%20%20%20X_train_scaled%2C%0A%20%20%20%20evaluate_model%2C%0A%20%20%20%20mo%2C%0A%20%20%20%20y_test%2C%0A%20%20%20%20y_train%2C%0A)%3A%0A%20%20%20%20%23%20Hyperparameter%20Tuning%20via%20GridSearchCV%20(Optimizing%20for%20Recall)%0A%0A%20%20%20%20%23%20Logistic%20Regression%0A%20%20%20%20log_param_grid%20%3D%20%7B%0A%20%20%20%20%20%20%20%20'C'%3A%20%5B0.01%2C%200.1%2C%201%2C%2010%2C%20100%5D%2C%0A%20%20%20%20%20%20%20%20'class_weight'%3A%20%5B'balanced'%5D%0A%20%20%20%20%7D%0A%0A%20%20%20%20%23%20Init%20GridSearch%20(Focus%20on%20Recall!)%0A%20%20%20%20log_grid%20%3D%20GridSearchCV(%0A%20%20%20%20%20%20%20%20LogisticRegression(random_state%3D42)%2C%20%0A%20%20%20%20%20%20%20%20log_param_grid%2C%20%0A%20%20%20%20%20%20%20%20cv%3D5%2C%0A%20%20%20%20%20%20%20%20scoring%3D'recall'%2C%0A%20%20%20%20%20%20%20%20n_jobs%3D-1%0A%20%20%20%20)%0A%0A%20%20%20%20%23%20Training%0A%20%20%20%20log_grid.fit(X_train_scaled%2C%20y_train)%0A%20%20%20%20best_log_model%20%3D%20log_grid.best_estimator_%0A%0A%20%20%20%20%23%20Random%20Forest%0A%20%20%20%20rf_param_grid%20%3D%20%7B%0A%20%20%20%20%20%20%20%20'n_estimators'%3A%20%5B50%2C%20100%2C%20200%5D%2C%0A%20%20%20%20%20%20%20%20'max_depth'%3A%20%5B3%2C%205%2C%2010%2C%20None%5D%2C%0A%20%20%20%20%20%20%20%20'min_samples_split'%3A%20%5B2%2C%205%2C%2010%5D%2C%0A%20%20%20%20%20%20%20%20'class_weight'%3A%20%5B'balanced'%2C%20'balanced_subsample'%5D%0A%20%20%20%20%7D%0A%0A%20%20%20%20rf_grid%20%3D%20GridSearchCV(%0A%20%20%20%20%20%20%20%20RandomForestClassifier(random_state%3D42)%2C%20%0A%20%20%20%20%20%20%20%20rf_param_grid%2C%20%0A%20%20%20%20%20%20%20%20cv%3D5%2C%20%0A%20%20%20%20%20%20%20%20scoring%3D'recall'%2C%0A%20%20%20%20%20%20%20%20n_jobs%3D-1%0A%20%20%20%20)%0A%0A%20%20%20%20rf_grid.fit(X_train_scaled%2C%20y_train)%0A%20%20%20%20best_rf_model%20%3D%20rf_grid.best_estimator_%0A%0A%20%20%20%20%23%20Evaluating%20the%20Tuned%20models%0A%20%20%20%20tuned_log_results%20%3D%20evaluate_model(%0A%20%20%20%20%20%20%20%20model%3Dbest_log_model%2C%20%0A%20%20%20%20%20%20%20%20name%3Df%22Tuned%20Logistic%20Regression%20(Best%20C%3A%20%7Blog_grid.best_params_%5B'C'%5D%7D)%22%2C%20%0A%20%20%20%20%20%20%20%20X_train%3DX_train_scaled%2C%20X_test%3DX_test_scaled%2C%20y_train%3Dy_train%2C%20y_test%3Dy_test%0A%20%20%20%20)%0A%0A%20%20%20%20tuned_rf_results%20%3D%20evaluate_model(%0A%20%20%20%20%20%20%20%20model%3Dbest_rf_model%2C%20%0A%20%20%20%20%20%20%20%20name%3Df%22Tuned%20Random%20Forest%20(Best%20Depth%3A%20%7Brf_grid.best_params_%5B'max_depth'%5D%7D)%22%2C%20%0A%20%20%20%20%20%20%20%20X_train%3DX_train_scaled%2C%20X_test%3DX_test_scaled%2C%20y_train%3Dy_train%2C%20y_test%3Dy_test%0A%20%20%20%20)%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%23%23%20Hyperparameter%20Tuning%20Results%20(Optimized%20for%20Recall)%22)%2C%0A%20%20%20%20%20%20%20%20tuned_log_results%2C%0A%20%20%20%20%20%20%20%20mo.md(%22---%22)%2C%0A%20%20%20%20%20%20%20%20tuned_rf_results%0A%20%20%20%20%5D)%0A%20%20%20%20return%20(best_log_model%2C)%0A%0A%0A%40app.cell%0Adef%20_(RocCurveDisplay%2C%20X_test_scaled%2C%20best_log_model%2C%20mo%2C%20plt%2C%20y_test)%3A%0A%20%20%20%20%23%20Plotting%20the%20ROC%20Curve%20to%20visualize%20model%20dynamics%0A%20%20%20%20_fig_roc%2C%20_ax_roc%20%3D%20plt.subplots(figsize%3D(8%2C%206))%0A%0A%20%20%20%20RocCurveDisplay.from_estimator(%0A%20%20%20%20%20%20%20%20best_log_model%2C%20%0A%20%20%20%20%20%20%20%20X_test_scaled%2C%20%0A%20%20%20%20%20%20%20%20y_test%2C%20%0A%20%20%20%20%20%20%20%20name%3D%22Tuned%20Logistic%20Regression%22%2C%0A%20%20%20%20%20%20%20%20curve_kwargs%3D%7B%22color%22%3A%20%22darkorange%22%2C%20%22linewidth%22%3A%202%7D%2C%0A%20%20%20%20%20%20%20%20ax%3D_ax_roc%0A%20%20%20%20)%0A%0A%20%20%20%20_ax_roc.plot(%5B0%2C%201%5D%2C%20%5B0%2C%201%5D%2C%20color%3D%22navy%22%2C%20lw%3D2%2C%20linestyle%3D%22--%22%2C%20label%3D%22Random%20Guess%20(AUC%20%3D%200.5)%22)%0A%0A%20%20%20%20_ax_roc.set_title(%22(ROC)%20Curve%22%2C%20weight%3D%22bold%22%2C%20size%3D14%2C%20pad%3D15)%0A%20%20%20%20_ax_roc.set_xlabel(%22FP%20Rate%20(1%20-%20Specificity)%22%2C%20weight%3D%22bold%22)%0A%20%20%20%20_ax_roc.set_ylabel(%22TP%20Rate%20(Recall%20%2F%20Sensitivity)%22%2C%20weight%3D%22bold%22)%0A%20%20%20%20_ax_roc.legend(loc%3D%22lower%20right%22)%0A%20%20%20%20_ax_roc.grid(alpha%3D0.3)%0A%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%20%20%20%20%23%23%23%20%F0%9F%93%88%20ROC%20Curve%0A%20%20%20%20%20%20%20%20%22%22%22)%2C%0A%20%20%20%20%20%20%20%20_fig_roc%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20Final%20Conclusion%3A%20Winning%20with%20GridSearchCV%0A%0A%20%20%20%20*%20**Tuned%20Logistic%20Regression**%20(with%20%60C%3D10%60)%20is%20the%20absolute%20winner.%20By%20optimizing%20for%20Recall%2C%20we%20successfully%20increased%20our%20primary%20metric%20from%2082%25%20to%20**85%25**%20and%20pushed%20the%20ROC-AUC%20score%20to%20an%20excellent%20**0.900**.%0A%20%20%20%20*%20**Clinical%20Impact%3A**%20In%20a%20medical%20context%2C%20this%20improvement%20is%20critical.%20We%20successfully%20identified%20an%20additional%20complex%20disease%20case%2C%20reducing%20our%20False%20Negatives%20from%206%20down%20to%205.%0A%20%20%20%20*%20**The%20%22Small%20Data%22%20Reality%3A**%20Despite%20hyperparameter%20tuning%2C%20the%20Random%20Forest%20classifier%20could%20not%20surpass%20the%20baseline%20Recall%20(staying%20at%2082%25)%20and%20performed%20worse%20on%20the%20ROC-AUC%20metric%20(0.852).%20This%20perfectly%20validates%20our%20earlier%20hypothesis%3A%20on%20small%20tabular%20datasets%20(~300%20records)%2C%20well-tuned%20linear%20models%20often%20outperform%20complex%20tree-based%20ensembles%20by%20preventing%20overfitting.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(best_log_model%2C%20mo%2C%20pd%2C%20plt%2C%20preprocessor%2C%20sns)%3A%0A%20%20%20%20%23%20Extracting%20feature%20names%20from%20the%20preprocessor%0A%20%20%20%20feature_names%20%3D%20preprocessor.get_feature_names_out()%0A%0A%20%20%20%20%23%20Extracting%20coefficients%20from%20our%20champion%20model%0A%20%20%20%20coefficients%20%3D%20best_log_model.coef_%5B0%5D%0A%0A%20%20%20%20%23%20Creating%20a%20DataFrame%20and%20sorting%20descending%0A%20%20%20%20feature_importance_df%20%3D%20pd.DataFrame(%7B%0A%20%20%20%20%20%20%20%20'Feature'%3A%20feature_names%2C%0A%20%20%20%20%20%20%20%20'Importance'%3A%20coefficients%0A%20%20%20%20%7D).sort_values(by%3D'Importance'%2C%20ascending%3DFalse)%0A%0A%20%20%20%20%23%20Plotting%0A%20%20%20%20_fig%2C%20_ax%20%3D%20plt.subplots(figsize%3D(8%2C%206))%0A%0A%20%20%20%20%23%20Color%20logic%3A%20Positive%20(Red%2FDanger)%2C%20Negative%20(Blue%2FSafe)%0A%20%20%20%20colors%20%3D%20%5B'%23e74c3c'%20if%20x%20%3E%200%20else%20'%233498db'%20for%20x%20in%20feature_importance_df%5B'Importance'%5D%5D%0A%0A%20%20%20%20sns.barplot(x%3D'Importance'%2C%20y%3D'Feature'%2C%20data%3Dfeature_importance_df%2C%20palette%3Dcolors%2C%20hue%3D'Feature'%2C%20legend%3DFalse%2C%20ax%3D_ax)%0A%20%20%20%20_ax.set_title('Feature%20Importance%20(Tuned%20Logistic%20Regression)'%2C%20weight%3D'bold'%2C%20size%3D14%2C%20pad%3D15)%0A%20%20%20%20_ax.set_xlabel('Coefficient%20Value%20(Impact%20on%20Disease%20Probability)'%2C%20weight%3D'bold')%0A%20%20%20%20_ax.set_ylabel('Patient%20Features'%2C%20weight%3D'bold')%0A%20%20%20%20_ax.axvline(0%2C%20color%3D'black'%2C%20linestyle%3D'--'%2C%20linewidth%3D1)%0A%20%20%20%20plt.tight_layout()%0A%0A%20%20%20%20%23%20Output%20in%20Marimo%0A%20%20%20%20mo.vstack(%5B%0A%20%20%20%20%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%20%20%20%20%23%23%20%F0%9F%A7%A0%20How%20Does%20the%20Model%20Think%3F%0A%0A%0A%20%20%20%20%20%20%20%20*%20**Red%20Bars%20(Positive%20%3E%200)%3A**%20These%20features%20**increase**%20the%20probability%20of%20heart%20disease.%20The%20longer%20the%20bar%2C%20the%20more%20dangerous%20the%20symptom.%0A%20%20%20%20%20%20%20%20*%20**Blue%20Bars%20(Negative%20%3C%200)%3A**%20These%20features%20**decrease**%20the%20probability%20(indicate%20a%20healthier%20patient).%0A%20%20%20%20%20%20%20%20%22%22%22)%2C%0A%20%20%20%20%20%20%20%20_fig%0A%20%20%20%20%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20mo.md(%22%22%22%0A%20%20%20%20%23%23%23%20Project%20Conclusion%20%26%20Business%20Value%0A%0A%20%20%20%20We%20extracted%20the%20underlying%20logic%20of%20our%20champion%20model%20(Tuned%20Logistic%20Regression).%20The%20model%20aligns%20perfectly%20with%20medical%20intuition%3A%0A%0A%20%20%20%20*%20**Primary%20Risk%20Factors%3A**%20Different%20types%20of%20chest%20pain%20(%60cp_3%60%2C%20%60cp_2%60%2C%20%60cp_1%60)%20are%20the%20strongest%20indicators%20pushing%20the%20model%20to%20predict%20the%20presence%20of%20heart%20disease.%0A%20%20%20%20*%20**Health%20Indicators%3A**%20Certain%20fluoroscopy%20results%20(e.g.%2C%20%60ca_2%60)%20act%20as%20the%20strongest%20negative%20coefficients%2C%20heavily%20reducing%20the%20probability%20of%20disease.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0Aif%20__name__%20%3D%3D%20%22__main__%22%3A%0A%20%20%20%20app.run()%0A