EDA: Wine Quality Prediction (binary)¶
In [ ]:
%reload_ext autoreload
%autoreload 2
In [ ]:
from ucimlrepo import fetch_ucirepo
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from helpers.base_imports import *
Create new EDA¶
In [ ]:
eda = EDA(name="wineq")
eda
Get raw dataset from remote source¶
In [ ]:
# fetch the Wine Quality dataset from the UCI ML repository (id 186)
data = fetch_ucirepo(id=186)
In [ ]:
X = data.data.features
y = data.data.targets
df = pd.concat([X, y], axis=1)
disp_df(df)
TODO: binarize the quality rating, choosing a threshold that splits the classes roughly evenly; a sketch of one option follows.
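A minimal sketch of the planned binarization, assuming the target column is named quality (as used in the pairplot below) and an assumed cutoff of >= 6 for "good" wine; the threshold should be checked against the class counts it produces before committing to it.

In [ ]:
# Sketch only: binarize quality at an assumed threshold of 6 ("good" vs "not good").
# Verify the resulting split is roughly even before adopting this cutoff.
threshold = 6
y_binary = (y["quality"] >= threshold).astype(int)
y_binary.value_counts(normalize=True)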
In [ ]:
y.columns
In [ ]:
X.shape, y.shape
In [ ]:
eda.update_param(
    "description", "Predict the quality of wine based on physicochemical tests"
)
eda.update_param("n features", X.shape[1])
eda.update_param("n samples", X.shape[0])
eda.update_param("f/n ratio", X.shape[1] / X.shape[0])
In [ ]:
eda.summary_df
Noise¶
In [ ]:
# check for missing values
X.isna().sum().sum()
In [ ]:
eda.update_param("noise", "None, no missing vals")
Stats¶
In [ ]:
# skewness for each feature (all columns are numeric, so no filtering is needed)
skewness = X.skew()
summary_stats = X.describe().T[["min", "max", "mean", "std"]]
summary_stats["skewness"] = skewness
summary_stats
In [ ]:
fig, ax = plot_feature_statistics(X, X.columns, line=False)
fig.savefig(f"{FIGS_DIR}/{eda.name}_feature-statistics.png")
In [ ]:
eda.update_param("skewness", "lots of skweness in the data")
eda.update_param("stats", "strangeness")
eda.update_param("outliers", "many outliers")
In [ ]:
# class distribution of whole dataset
ax = sns.countplot(x=y.iloc[:, 0])
plt.title(f"Target Class Distribution ({eda.name})")
plt.xlabel("Class")
plt.ylabel("Count")
# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height:.0f}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )
plt.savefig(f"{FIGS_DIR}/{eda.name}_target-class-distribution.png")
plt.show()
In [ ]:
eda.update_param("class balance", "~Normal")
Feature Correlations¶
In [ ]:
sns.pairplot(data=df, hue="quality", palette="bright")
plt.savefig(f"{FIGS_DIR}/{eda.name}_pairplot.png")
In [ ]:
# Create a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(X.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title(f"Correlation Matrix ({eda.name})")
plt.savefig(f"{FIGS_DIR}/{eda.name}_correlation-matrix.png")
plt.show()
In [ ]:
eda.update_param("correlations", "several moderate/low correlations, 1-2 higher")
Dimensionality Reduction Potential¶
In [ ]:
# PCA - number of components to explain 95% variance
pca_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA()),
    ]
)
pca_pipe.fit(X)
In [ ]:
explained_variance_ratio = pca_pipe.named_steps["pca"].explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
plt.figure(figsize=(8, 6))
plt.plot(cumulative_explained_variance, marker="o", linestyle="--")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA - Cumulative Explained Variance")
plt.axhline(y=0.95, color="r", linestyle="--") # Threshold for 95% explained variance
plt.show()
# Number of components to explain 95% variance
num_components_95 = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f"Number of original features: {X.shape[1]}")
print(f"Number of components to explain 95% of the variance: {num_components_95}")
In [ ]:
# ICA - number of independent components
# Note: with the default n_components=None, FastICA returns one component per
# feature, so this reports the feature count rather than estimating an
# intrinsic dimensionality.
ica_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("ica", FastICA(random_state=0)),
    ]
)
components = ica_pipe.fit_transform(X)
# Number of independent components
num_independent_components = components.shape[1]
print(f"Number of original features: {X.shape[1]}")
print(f"Number of independent components found: {num_independent_components}")
In [ ]:
eda.update_param(
    "DR potential",
    "PCA: 9 components explain 95% of the variance; ICA: 11 components (one per feature under default settings)",
)
Save EDA results¶
In [ ]:
eda.summary_df
In [ ]:
eda.save(overwrite_existing=False)
Create and save a shuffled 80/20 train/test split¶
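This step was left unimplemented; a minimal sketch using scikit-learn's train_test_split, assuming the binarized target (y_binary) from the sketch above and stratifying so both splits keep the same class ratio:

In [ ]:
from sklearn.model_selection import train_test_split

# Sketch: shuffled, stratified 80/20 split (random_state is an arbitrary choice
# for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, shuffle=True, stratify=y_binary, random_state=42
)
X_train.shape, X_test.shape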