The content for this site is available on GitHub. If you want to launch the notebooks interactively, click on the binder stamp below.
Contents | AutoML using FLAML >
Auto-EDAs using ydata-profiling¶
ydata-profiling automates much of the EDA process and can be a quick way to get a feel for a new dataset. I don't think it's a replacement for manual EDA (especially if it's a dataset you plan to use extensively), but it can be a great way to get a quick overview of the data while you are still familiarizing yourself with new datasets.
In [1]:
%reload_ext autoreload
%autoreload 2
In [9]:
from ucimlrepo import fetch_ucirepo
from sklearn.datasets import load_digits, load_breast_cancer
from ydata_profiling import ProfileReport
from helpers.base_imports import *
Gather datasets¶
In [10]:
# Dictionary of datasets: name -> single DataFrame holding the feature
# columns together with the target column(s)
datasets = {}

# UCI datasets, looked up by (name, UCI repository id).
# The loop variable is called `dataset_id` rather than `id` so that the
# builtin id() is not shadowed for the rest of the notebook session.
for name, dataset_id in [("abalone", 1), ("iris", 53), ("wine", 109)]:
    data = fetch_ucirepo(id=dataset_id).data
    # Features and targets arrive separately; join them column-wise
    df = pd.concat([data.features, data.targets], axis=1)
    datasets[name] = df

# sklearn.datasets: Digits
digits = load_digits(as_frame=True)
df_digits = digits.frame  # Already includes 'target' column
datasets["digits"] = df_digits

# sklearn.datasets: Breast Cancer
bc = load_breast_cancer(as_frame=True)
df_bc = bc.frame  # Already includes 'target' column
datasets["breast_cancer"] = df_bc
Generate reports for each dataset¶
In [ ]:
# Generate annotated profile reports for each dataset and write each one
# to data/autoeda_<name>.html
from pathlib import Path

# Create the output directory up front so that writing the first report
# does not fail with FileNotFoundError when data/ is missing.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

for name, df in datasets.items():
    profile = ProfileReport(
        df,  # Combined DataFrame with features and target
        title=f"{name.replace('_', ' ').title()} EDA",  # Title for report
        explorative=True,  # Enables deeper analyses (interactions, clustering)
        minimal=False,  # Set True for very large datasets
        samples={
            "head": 10,  # Show top 10 rows
            "tail": 10,  # Show bottom 10 rows
        },
        correlations={  # Enable correlation metrics
            "pearson": {"calculate": True},  # Linear correlation (numeric)
            "spearman": {"calculate": True},  # Monotonic relationship
            "kendall": {"calculate": True},  # Rank-based, noise-resistant
            "phi_k": {"calculate": True},  # Mixed types
            "cramers": {"calculate": True},  # For categorical associations
        },
        missing_diagrams={  # Visualize missing data patterns
            "heatmap": True,
            "dendrogram": True,
        },
        duplicates={"head": 10},  # Show top 10 duplicate rows
        interactions={"continuous": True},  # Pairwise scatterplots for numerics
        progress_bar=True,
        sort=None,
    )
    # Same relative path as before: data/autoeda_<name>.html
    profile.to_file(output_dir / f"autoeda_{name}.html")
See results