

EDA: Wine Quality Prediction (binary)

In [ ]:
%reload_ext autoreload
%autoreload 2
In [ ]:
from ucimlrepo import fetch_ucirepo
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from helpers.base_imports import *

Create new EDA

In [ ]:
eda = EDA(name="wineq")
eda

Get raw dataset from remote source

In [ ]:
# fetch dataset
data = fetch_ucirepo(id=186)
In [ ]:
X = data.data.features
y = data.data.targets

df = pd.concat([X, y], axis=1)
disp_df(df)

Work in progress: stopped here. The quality rating still needs to be converted to a binary target; try to choose a threshold that splits the data roughly evenly between the two classes.
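As a starting point, here is a minimal binarization sketch. The cutoff of 6 below is a hypothetical placeholder, not a final choice; inspect the printed class counts before committing to a threshold.

In [ ]:
# Sketch: binarize quality at a threshold chosen so the two classes end up
# roughly balanced. The cutoff below is an assumption to validate against
# the printed class counts.
quality = y.iloc[:, 0]
print(quality.value_counts().sort_index())  # inspect before picking a cutoff
y_binary = (quality >= 6).astype(int)  # hypothetical cutoff: 1 = "good", 0 = otherwise
print(y_binary.value_counts(normalize=True))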

In [ ]:
y.columns
In [ ]:
X.shape, y.shape
In [ ]:
eda.update_param(
    "description", "Predict the quality of wine based on physicochemical tests"
)
eda.update_param("n features", X.shape[1])
eda.update_param("n samples", X.shape[0])
eda.update_param("f/n ratio", X.shape[1] / X.shape[0])
In [ ]:
eda.summary_df

Noise

In [ ]:
# check for missing values
X.isna().sum().sum()
In [ ]:
eda.update_param("noise", "None; no missing values")

Stats

In [ ]:
# All features here are numeric, so skewness can be computed directly.
# (For mixed dtypes, select numeric columns first:
#  X.select_dtypes(include=[np.number]).skew())
skewness = X.skew()
summary_stats = X.describe().T[["min", "max", "mean", "std"]]
summary_stats["skewness"] = skewness
summary_stats
In [ ]:
fig, ax = plot_feature_statistics(X, X.columns, line=False)
fig.savefig(f"{FIGS_DIR}/{eda.name}_feature-statistics.png")
In [ ]:
eda.update_param("skewness", "several features are strongly right-skewed")
eda.update_param("stats", "feature scales vary widely across columns")
eda.update_param("outliers", "many outliers across several features")
In [ ]:
# class distribution of whole dataset
ax = sns.countplot(x=y.iloc[:, 0])
plt.title(f"Target Class Distribution ({eda.name})")
plt.xlabel("Class")
plt.ylabel("Count")

# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )

plt.savefig(f"{FIGS_DIR}/{eda.name}_target-class-distribution.png")
plt.show()
In [ ]:
eda.update_param("class balance", "roughly bell-shaped; most samples fall in the middle quality ratings")

Feature Correlations

In [ ]:
sns.pairplot(data=df, hue="quality", palette="bright")
plt.savefig(f"{FIGS_DIR}/{eda.name}_pairplot.png")
In [ ]:
# Create a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(X.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title(f"Correlation Matrix ({eda.name})")
plt.savefig(f"{FIGS_DIR}/{eda.name}_correlation-matrix.png")
plt.show()
In [ ]:
eda.update_param("correlations", "mostly low-to-moderate pairwise correlations, with one or two stronger pairs")
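To make the note above concrete, a small sketch that ranks feature pairs by absolute correlation (upper triangle only, so each pair appears once):

In [ ]:
# Sketch: list the strongest absolute pairwise correlations.
corr = X.corr().abs()
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)  # upper triangle, no diagonal
top_pairs = corr.where(mask).stack().sort_values(ascending=False)
print(top_pairs.head(5))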

Dimensionality Reduction Potential

In [ ]:
# PCA - number of components to explain 95% variance
pca_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA()),
    ]
)
pca_pipe.fit(X)
In [ ]:
explained_variance_ratio = pca_pipe.named_steps["pca"].explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

plt.figure(figsize=(8, 6))
plt.plot(cumulative_explained_variance, marker="o", linestyle="--")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA - Cumulative Explained Variance")
plt.axhline(y=0.95, color="r", linestyle="--")  # Threshold for 95% explained variance
plt.show()

# Number of components to explain 95% variance
num_components_95 = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f"Number of original features: {X.shape[1]}")
print(f"Number of components to explain 95% of the variance: {num_components_95}")
In [ ]:
# ICA - fit independent components. With n_components unset, FastICA keeps
# one component per feature, so the count below simply matches X.shape[1].
# random_state and a higher max_iter make the run reproducible and help
# convergence.
ica_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("ica", FastICA(random_state=0, max_iter=1000)),
    ]
)
components = ica_pipe.fit_transform(X)

# Number of independent components
num_independent_components = components.shape[1]
print(f"Number of original features: {X.shape[1]}")
print(f"Number of independent components found: {num_independent_components}")
In [ ]:
eda.update_param(
    "DR potential",
    "PCA: 9 components to explain 95% variance, ICA: 11 independent components",
)

Save EDA results

In [ ]:
eda.summary_df
In [ ]:
eda.save(overwrite_existing=False)

Create and save a shuffled 80/20 train/test split
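The notebook stops before this step. A minimal sketch of what it might look like, assuming the binarized target (`y_binary`) from the sketch above; the output filenames are placeholders, not the project's actual paths.

In [ ]:
from sklearn.model_selection import train_test_split

# Sketch: stratified, shuffled 80/20 split on the binarized target,
# saved to placeholder CSV paths.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, shuffle=True, stratify=y_binary, random_state=42
)
X_train.assign(quality_binary=y_train).to_csv(f"{eda.name}_train.csv", index=False)
X_test.assign(quality_binary=y_test).to_csv(f"{eda.name}_test.csv", index=False)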


