The content for this site is available on GitHub. If you want to launch the notebooks interactively, click on the Binder badge below.


EDA: Iris

In [1]:
%reload_ext autoreload
%autoreload 2
In [2]:
from sklearn.datasets import load_iris
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from helpers.base_imports import *

Create new EDA

In [3]:
eda = EDA(name="iris")
eda
Loading 'edas.csv'
Loading 'iris' eda
Out[3]:
EDA: iris
Columns: Index(['description', 'n features', 'n samples', 'f/n ratio', 'noise', 'stats',
       'class balance', 'outliers', 'skewness', 'correlations', 'DR potential',
       'dataset.1', 'dataset.2'],
      dtype='object')
Datasets: Index(['iris'], dtype='object', name='dataset')

Get raw dataset from remote source

In [4]:
# fetch dataset
data = load_iris(as_frame=True)
data.frame.head(5)
Out[4]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
In [5]:
list(data.target_names)
Out[5]:
['setosa', 'versicolor', 'virginica']
In [6]:
disp_df(data.frame)
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
5 5.4 3.9 1.7 0.4 0
6 4.6 3.4 1.4 0.3 0
7 5.0 3.4 1.5 0.2 0
8 4.4 2.9 1.4 0.2 0
9 4.9 3.1 1.5 0.1 0
10 5.4 3.7 1.5 0.2 0
11 4.8 3.4 1.6 0.2 0
12 4.8 3.0 1.4 0.1 0
13 4.3 3.0 1.1 0.1 0
14 5.8 4.0 1.2 0.2 0
15 5.7 4.4 1.5 0.4 0
16 5.4 3.9 1.3 0.4 0
17 5.1 3.5 1.4 0.3 0
18 5.7 3.8 1.7 0.3 0
19 5.1 3.8 1.5 0.3 0
20 5.4 3.4 1.7 0.2 0
21 5.1 3.7 1.5 0.4 0
22 4.6 3.6 1.0 0.2 0
23 5.1 3.3 1.7 0.5 0
24 4.8 3.4 1.9 0.2 0
25 5.0 3.0 1.6 0.2 0
26 5.0 3.4 1.6 0.4 0
27 5.2 3.5 1.5 0.2 0
28 5.2 3.4 1.4 0.2 0
29 4.7 3.2 1.6 0.2 0
30 4.8 3.1 1.6 0.2 0
31 5.4 3.4 1.5 0.4 0
32 5.2 4.1 1.5 0.1 0
33 5.5 4.2 1.4 0.2 0
34 4.9 3.1 1.5 0.2 0
35 5.0 3.2 1.2 0.2 0
36 5.5 3.5 1.3 0.2 0
37 4.9 3.6 1.4 0.1 0
38 4.4 3.0 1.3 0.2 0
39 5.1 3.4 1.5 0.2 0
40 5.0 3.5 1.3 0.3 0
41 4.5 2.3 1.3 0.3 0
42 4.4 3.2 1.3 0.2 0
43 5.0 3.5 1.6 0.6 0
44 5.1 3.8 1.9 0.4 0
45 4.8 3.0 1.4 0.3 0
46 5.1 3.8 1.6 0.2 0
47 4.6 3.2 1.4 0.2 0
48 5.3 3.7 1.5 0.2 0
49 5.0 3.3 1.4 0.2 0
50 7.0 3.2 4.7 1.4 1
51 6.4 3.2 4.5 1.5 1
52 6.9 3.1 4.9 1.5 1
53 5.5 2.3 4.0 1.3 1
54 6.5 2.8 4.6 1.5 1
55 5.7 2.8 4.5 1.3 1
56 6.3 3.3 4.7 1.6 1
57 4.9 2.4 3.3 1.0 1
58 6.6 2.9 4.6 1.3 1
59 5.2 2.7 3.9 1.4 1
60 5.0 2.0 3.5 1.0 1
61 5.9 3.0 4.2 1.5 1
62 6.0 2.2 4.0 1.0 1
63 6.1 2.9 4.7 1.4 1
64 5.6 2.9 3.6 1.3 1
65 6.7 3.1 4.4 1.4 1
66 5.6 3.0 4.5 1.5 1
67 5.8 2.7 4.1 1.0 1
68 6.2 2.2 4.5 1.5 1
69 5.6 2.5 3.9 1.1 1
70 5.9 3.2 4.8 1.8 1
71 6.1 2.8 4.0 1.3 1
72 6.3 2.5 4.9 1.5 1
73 6.1 2.8 4.7 1.2 1
74 6.4 2.9 4.3 1.3 1
75 6.6 3.0 4.4 1.4 1
76 6.8 2.8 4.8 1.4 1
77 6.7 3.0 5.0 1.7 1
78 6.0 2.9 4.5 1.5 1
79 5.7 2.6 3.5 1.0 1
80 5.5 2.4 3.8 1.1 1
81 5.5 2.4 3.7 1.0 1
82 5.8 2.7 3.9 1.2 1
83 6.0 2.7 5.1 1.6 1
84 5.4 3.0 4.5 1.5 1
85 6.0 3.4 4.5 1.6 1
86 6.7 3.1 4.7 1.5 1
87 6.3 2.3 4.4 1.3 1
88 5.6 3.0 4.1 1.3 1
89 5.5 2.5 4.0 1.3 1
90 5.5 2.6 4.4 1.2 1
91 6.1 3.0 4.6 1.4 1
92 5.8 2.6 4.0 1.2 1
93 5.0 2.3 3.3 1.0 1
94 5.6 2.7 4.2 1.3 1
95 5.7 3.0 4.2 1.2 1
96 5.7 2.9 4.2 1.3 1
97 6.2 2.9 4.3 1.3 1
98 5.1 2.5 3.0 1.1 1
99 5.7 2.8 4.1 1.3 1
100 6.3 3.3 6.0 2.5 2
101 5.8 2.7 5.1 1.9 2
102 7.1 3.0 5.9 2.1 2
103 6.3 2.9 5.6 1.8 2
104 6.5 3.0 5.8 2.2 2
105 7.6 3.0 6.6 2.1 2
106 4.9 2.5 4.5 1.7 2
107 7.3 2.9 6.3 1.8 2
108 6.7 2.5 5.8 1.8 2
109 7.2 3.6 6.1 2.5 2
110 6.5 3.2 5.1 2.0 2
111 6.4 2.7 5.3 1.9 2
112 6.8 3.0 5.5 2.1 2
113 5.7 2.5 5.0 2.0 2
114 5.8 2.8 5.1 2.4 2
115 6.4 3.2 5.3 2.3 2
116 6.5 3.0 5.5 1.8 2
117 7.7 3.8 6.7 2.2 2
118 7.7 2.6 6.9 2.3 2
119 6.0 2.2 5.0 1.5 2
120 6.9 3.2 5.7 2.3 2
121 5.6 2.8 4.9 2.0 2
122 7.7 2.8 6.7 2.0 2
123 6.3 2.7 4.9 1.8 2
124 6.7 3.3 5.7 2.1 2
125 7.2 3.2 6.0 1.8 2
126 6.2 2.8 4.8 1.8 2
127 6.1 3.0 4.9 1.8 2
128 6.4 2.8 5.6 2.1 2
129 7.2 3.0 5.8 1.6 2
130 7.4 2.8 6.1 1.9 2
131 7.9 3.8 6.4 2.0 2
132 6.4 2.8 5.6 2.2 2
133 6.3 2.8 5.1 1.5 2
134 6.1 2.6 5.6 1.4 2
135 7.7 3.0 6.1 2.3 2
136 6.3 3.4 5.6 2.4 2
137 6.4 3.1 5.5 1.8 2
138 6.0 3.0 4.8 1.8 2
139 6.9 3.1 5.4 2.1 2
140 6.7 3.1 5.6 2.4 2
141 6.9 3.1 5.1 2.3 2
142 5.8 2.7 5.1 1.9 2
143 6.8 3.2 5.9 2.3 2
144 6.7 3.3 5.7 2.5 2
145 6.7 3.0 5.2 2.3 2
146 6.3 2.5 5.0 1.9 2
147 6.5 3.0 5.2 2.0 2
148 6.2 3.4 5.4 2.3 2
149 5.9 3.0 5.1 1.8 2
In [7]:
X = data.frame.drop(columns="target")
y = data.frame.target
X.shape, y.shape
Out[7]:
((150, 4), (150,))
In [8]:
eda.update_param("description", "Species (3 classes) from sepals/petals dimensions")
eda.update_param("n features", X.shape[1])
eda.update_param("n samples", X.shape[0])
eda.update_param("f/n ratio", X.shape[1] / X.shape[0])

Fairly low F/N (≈0.03, i.e. roughly 37 samples per feature), so there should be plenty of data; the sketch after the list below shows how these bands might be applied.

F/N interpretation:

Low F/N (e.g., <0.01):

  • Typically, sufficient data is available for training.
  • Emphasis can be placed on optimizing model complexity or exploring more advanced models.

Moderate F/N (e.g., ~0.1–1):

  • The dataset has a balanced number of features and samples.
  • Careful attention is needed to avoid overfitting with complex models.

High F/N (e.g., >1):

  • There are more features than samples, making the dataset sparse.
  • Dimensionality reduction (e.g., PCA, feature selection) or simpler models may be necessary to avoid overfitting.
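As a hedged illustration of these bands, here is a small helper. It is not part of the EDA class, and the cutoffs simply mirror the heuristic list above.

In [ ]:
# Illustrative helper mirroring the F/N bands above (thresholds are rough guides, not rules).
def fn_ratio_band(n_features, n_samples):
    ratio = n_features / n_samples
    if ratio < 0.01:
        return "low"       # plenty of samples per feature
    elif ratio <= 1:
        return "moderate"  # keep an eye on model complexity / overfitting
    return "high"          # consider dimensionality reduction or simpler models

fn_ratio_band(X.shape[1], X.shape[0])  # iris: 4 / 150 ≈ 0.027, near the low end of "moderate"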
In [9]:
eda.summary_df
Out[9]:
description n features n samples f/n ratio noise stats class balance outliers skewness correlations DR potential dataset.1 dataset.2
dataset
iris Species (3 classes) from sepals/petals dimensions 4 150 0.026667 None, no missing vals petal l/w have highest variation Balanced sepal width has 4 outliers petal l/w slight left skew, sepal l/w slight r... several strong (>.8) correlations, some linear... PCA: 2 components to explain 95% variance, ICA... iris NaN

Noise

In [10]:
eda.update_param("noise", "None, no missing vals")

Stats

In [11]:
skewness = X.skew()
summary_stats = X.describe().T
summary_stats["skewness"] = skewness
summary_stats[["min", "max", "mean", "std", "skewness", "25%", "50%", "75%"]]
Out[11]:
min max mean std skewness 25% 50% 75%
sepal length (cm) 4.3 7.9 5.843333 0.828066 0.314911 5.1 5.80 6.4
sepal width (cm) 2.0 4.4 3.057333 0.435866 0.318966 2.8 3.00 3.3
petal length (cm) 1.0 6.9 3.758000 1.765298 -0.274884 1.6 4.35 5.1
petal width (cm) 0.1 2.5 1.199333 0.762238 -0.102967 0.3 1.30 1.8
In [12]:
fig, ax = plot_feature_statistics(X, X.columns, line=False)
fig.savefig(f"{FIGS_DIR}/{eda.name}_feature-statistics.png")
In [13]:
eda.update_param("skewness", "petal l/w slight left skew, sepal l/w slight right skew")
eda.update_param("stats", "petal l/w have highest variation ")
eda.update_param("outliers", "sepal width has a 4 outliers")
In [14]:
# class distribution of whole dataset
ax = sns.countplot(x=data.target_names[y])
plt.title(f"Target Class Distribution ({eda.name})")
plt.xlabel("Class")
plt.ylabel("Count")

# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )

plt.savefig(f"{FIGS_DIR}/{eda.name}_target-class-distribution.png")
plt.show()
In [15]:
eda.update_param("class balance", "Balanced")

Feature Correlations

In [16]:
df = data.frame.copy()
df["target"] = data.target_names[y]
# df.head(5)
sns.pairplot(data=df, hue="target", palette="bright")
plt.savefig(f"{FIGS_DIR}/{eda.name}_pairplot.png")
In [17]:
# Create a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(X.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix")

plt.savefig(f"{FIGS_DIR}/{eda.name}_correlation-matrix.png")
plt.show()
In [18]:
eda.update_param(
    "correlations", "several strong (>.8) correlations, some linear some poly"
)
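To back up the ">.8" note numerically, a hedged sketch listing the feature pairs whose absolute correlation exceeds 0.8 in the matrix plotted above:

In [ ]:
# Feature pairs with |r| > 0.8 (upper triangle only, to avoid duplicates and the diagonal).
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
upper.stack().loc[lambda s: s > 0.8].sort_values(ascending=False)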

Dimensionality Reduction Potential

In [19]:
# PCA - number of components to explain 95% variance
pca_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA()),
    ]
)
pca_pipe.fit(X)
Out[19]:
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA())])
In [20]:
explained_variance_ratio = pca_pipe.named_steps["pca"].explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

plt.figure(figsize=(8, 6))
plt.plot(cumulative_explained_variance, marker="o", linestyle="--")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA - Cumulative Explained Variance")
plt.axhline(y=0.95, color="r", linestyle="--")  # Threshold for 95% explained variance
plt.show()

# Number of components to explain 95% variance
num_components_95 = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f"Number of components to explain 95% of the variance: {num_components_95}")
Number of components to explain 95% of the variance: 2
In [21]:
# ICA - number of independent components
ica_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("ica", FastICA()),
    ]
)
components = ica_pipe.fit_transform(X)

# Number of independent components
num_independent_components = components.shape[1]
print(f"Number of independent components found: {num_independent_components}")
Number of independent components found: 4
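Note that FastICA defaults to n_components equal to the number of features, so the "4 independent components" above is just the feature count. One hedged way to judge how informative each component is (assuming scipy is available) is to rank them by non-Gaussianity, e.g. absolute excess kurtosis:

In [ ]:
# Rank ICA components by absolute excess kurtosis (more non-Gaussian = more "interesting").
from scipy.stats import kurtosis

pd.Series(kurtosis(components, axis=0)).abs().sort_values(ascending=False)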
In [22]:
eda.update_param(
    "DR potential",
    "PCA: 2 components to explain 95% variance, ICA: 4 independent components",
)

Save EDA results

In [23]:
eda.summary_df
Out[23]:
description n features n samples f/n ratio noise stats class balance outliers skewness correlations DR potential dataset.1 dataset.2
dataset
iris Species (3 classes) from sepals/petals dimensions 4 150 0.026667 None, no missing vals petal l/w have highest variation Balanced sepal width has 4 outliers petal l/w slight left skew, sepal l/w slight r... several strong (>.8) correlations, some linear... PCA: 2 components to explain 95% variance, ICA... iris NaN
In [24]:
eda.save(overwrite_existing=True)
Loading 'edas.csv'
Overwriting existing iris
Saving iris to results/edas.csv

Create and save a shuffled 80/20 train/test split

In [28]:
dataset_name0 = "iris-20test-shuffled-v0"
X_train0, X_test0, y_train0, y_test0 = train_test_split(
    X,
    y,
    test_size=0.20,  # 20% of data for testing
    shuffle=True,  # shuffle data before splitting
    random_state=0,
)
X_train0.shape, X_test0.shape
Out[28]:
((120, 4), (30, 4))

Make sure train and test target distributions roughly match the original dataset.

In [29]:
data.target_names
Out[29]:
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
In [30]:
# class distribution: training target dists still balanced?
ax = sns.countplot(x=data.target_names[y_train0])
plt.title("Target Distribution: y_train")
plt.xlabel("Species")
plt.ylabel("Count")

# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )

plt.savefig(f"{FIGS_DIR}/{dataset_name0}_target-class-distribution-y_train.png")
plt.show()
In [31]:
# class distribution: test target dists still balanced?
ax = sns.countplot(x=data.target_names[y_test0])
plt.title("Target Distribution: y_test")
plt.xlabel("Species")
plt.ylabel("Count")

# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )

plt.savefig(f"{FIGS_DIR}/{dataset_name0}_target-class-distribution-y_test.png")
plt.show()
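A hedged numeric version of the same check: class proportions in the full dataset, the training split, and the test split should be close to one another.

In [ ]:
# Class proportions (by class code): full dataset vs. train/test splits.
pd.DataFrame(
    {
        "full": y.value_counts(normalize=True),
        "train": y_train0.value_counts(normalize=True),
        "test": y_test0.value_counts(normalize=True),
    }
).round(2)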

Make sure the feature statistics are similar to the original dataset.

In [32]:
fig, ax = plot_feature_statistics(
    dataframe=X_train0, feature_names=X_train0.columns, line=False
)
fig.savefig(f"{FIGS_DIR}/{dataset_name0}_feature-statistics-X_train.png")
In [33]:
fig, ax = plot_feature_statistics(
    dataframe=X_test0, feature_names=X_test0.columns, line=False
)
fig.savefig(f"{FIGS_DIR}/{dataset_name0}_feature-statistics-X_test.png")

Okay, looks good enough. Let's save it.

In [ ]:
save_dataset(
    dataset_name=dataset_name0,
    X_train=X_train0,
    X_test=X_test0,
    y_train=y_train0,
    y_test=y_test0,
    target_names=pd.DataFrame(data.target_names, columns=["target_names"]),
)

Create and save another split just like it, but shuffled differently

In [31]:
dataset_name1 = "iris-20test-shuffled-v1"
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X,
    y,
    test_size=0.20,  # 20% of data for testing
    shuffle=True,  # shuffle data before splitting
    random_state=1,
)
X_train1.shape, X_test1.shape
Out[31]:
((120, 4), (30, 4))

Make sure train and test target distributions roughly match the original dataset.

In [32]:
# class distribution: training target dists still balanced?
ax = sns.countplot(x=data.target_names[y_train1])
plt.title("Target Distribution: y_train")
plt.xlabel("Species")
plt.ylabel("Count")

# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )

plt.savefig(f"{FIGS_DIR}/{dataset_name1}_target-class-distribution-y_train.png")
plt.show()
In [33]:
# class distribution: test target dists still balanced?
ax = sns.countplot(x=data.target_names[y_test1])
plt.title("Target Distribution: y_test")
plt.xlabel("Species")
plt.ylabel("Count")

# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )

plt.savefig(f"{FIGS_DIR}/{dataset_name1}_target-class-distribution-y_test.png")
plt.show()

Make sure the feature statistics are similar to the original dataset.

In [34]:
fig, ax = plot_feature_statistics(
    dataframe=X_train1, feature_names=X_train1.columns, line=False
)
fig.savefig(f"{FIGS_DIR}/{dataset_name1}_feature-statistics-X_train.png")
In [35]:
fig, ax = plot_feature_statistics(
    dataframe=X_test1, feature_names=X_test1.columns, line=False
)
fig.savefig(f"{FIGS_DIR}/{dataset_name1}_feature-statistics-X_test.png")

Okay, looks good enough. Let's save it.

In [36]:
save_dataset(
    dataset_name=dataset_name1,
    X_train=X_train1,
    X_test=X_test1,
    y_train=y_train1,
    y_test=y_test1,
    target_names=pd.DataFrame(data.target_names, columns=["target_names"]),
)

