EDA: Wine Quality Prediction (binary)¶
In [ ]:
%reload_ext autoreload
%autoreload 2
In [ ]:
from ucimlrepo import fetch_ucirepo
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from helpers.base_imports import *
Create new EDA¶
In [ ]:
eda = EDA(name="wineq")
eda
Get raw dataset from remote source¶
In [ ]:
# fetch the Wine Quality dataset from the UCI ML repository (id 186)
data = fetch_ucirepo(id=186)
In [ ]:
X = data.data.features
y = data.data.targets
df = pd.concat([X, y], axis=1)
disp_df(df)
TODO: binarize the quality rating, choosing a threshold that splits the classes roughly evenly; a sketch of one option follows.
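A minimal sketch of the planned binarization, assuming the target column is named quality (as used in the pairplot below) and an assumed cutoff of >= 6 for "good" wine; the threshold should be checked against the class counts it produces before committing to it.

In [ ]:
# Sketch only: binarize quality at an assumed threshold of 6 ("good" vs "not good").
# Verify the resulting split is roughly even before adopting this cutoff.
threshold = 6
y_binary = (y["quality"] >= threshold).astype(int)
y_binary.value_counts(normalize=True)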
In [ ]:
y.columns
In [ ]:
X.shape, y.shape
In [ ]:
eda.update_param(
    "description", "Predict the quality of wine based on physicochemical tests"
)
eda.update_param("n features", X.shape[1])
eda.update_param("n samples", X.shape[0])
eda.update_param("f/n ratio", X.shape[1] / X.shape[0])
In [ ]:
eda.summary_df
Noise¶
In [ ]:
# check for missing values
X.isna().sum().sum()
In [ ]:
eda.update_param("noise", "None, no missing vals")
Stats¶
In [ ]:
# skewness for each feature (all columns are numeric, so no filtering is needed)
skewness = X.skew()
summary_stats = X.describe().T[["min", "max", "mean", "std"]]
summary_stats["skewness"] = skewness
summary_stats
In [ ]:
fig, ax = plot_feature_statistics(X, X.columns, line=False)
fig.savefig(f"{FIGS_DIR}/{eda.name}_feature-statistics.png")
In [ ]:
eda.update_param("skewness", "lots of skweness in the data")
eda.update_param("stats", "strangeness")
eda.update_param("outliers", "many outliers")
In [ ]:
# class distribution of whole dataset
ax = sns.countplot(x=y.iloc[:, 0])
plt.title(f"Target Class Distribution ({eda.name})")
plt.xlabel("Class")
plt.ylabel("Count")
# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height:.0f}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )
plt.savefig(f"{FIGS_DIR}/{eda.name}_target-class-distribution.png")
plt.show()
In [ ]:
eda.update_param("class balance", "~Normal")
Feature Correlations¶
In [ ]:
sns.pairplot(data=df, hue="quality", palette="bright")
plt.savefig(f"{FIGS_DIR}/{eda.name}_pairplot.png")
In [ ]:
# Create a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(X.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title(f"Correlation Matrix ({eda.name})")
plt.savefig(f"{FIGS_DIR}/{eda.name}_correlation-matrix.png")
plt.show()
In [ ]:
eda.update_param("correlations", "several moderate/low correlations, 1-2 higher")
Dimensionality Reduction Potential¶
In [ ]:
# PCA - number of components to explain 95% variance
pca_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA()),
    ]
)
pca_pipe.fit(X)
In [ ]:
explained_variance_ratio = pca_pipe.named_steps["pca"].explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
plt.figure(figsize=(8, 6))
plt.plot(cumulative_explained_variance, marker="o", linestyle="--")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA - Cumulative Explained Variance")
plt.axhline(y=0.95, color="r", linestyle="--") # Threshold for 95% explained variance
plt.show()
# Number of components to explain 95% variance
num_components_95 = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f"Number of original features: {X.shape[1]}")
print(f"Number of components to explain 95% of the variance: {num_components_95}")
In [ ]:
# ICA - number of independent components
# Note: with the default n_components=None, FastICA returns one component per
# feature, so this reports the feature count rather than estimating an
# intrinsic dimensionality.
ica_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("ica", FastICA(random_state=0)),
    ]
)
components = ica_pipe.fit_transform(X)
# Number of independent components
num_independent_components = components.shape[1]
print(f"Number of original features: {X.shape[1]}")
print(f"Number of independent components found: {num_independent_components}")
In [ ]:
eda.update_param(
    "DR potential",
    "PCA: 9 components explain 95% of the variance; ICA: 11 components (one per feature under default settings)",
)
Save EDA results¶
In [ ]:
eda.summary_df
In [ ]:
eda.save(overwrite_existing=False)
Create and save a shuffled 80/20 train/test split¶
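This step was left unimplemented; a minimal sketch using scikit-learn's train_test_split, assuming the binarized target (y_binary) from the sketch above and stratifying so both splits keep the same class ratio:

In [ ]:
from sklearn.model_selection import train_test_split

# Sketch: shuffled, stratified 80/20 split (random_state is an arbitrary choice
# for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, shuffle=True, stratify=y_binary, random_state=42
)
X_train.shape, X_test.shape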