The content for this site available on GitHub. If you want to launch the notebooks interactively click on the binder stamp below. Binder

< EDA Iris | Contents | EDA Wine Quality Prediction binary >

EDA: Wine Cultivars

Wine dataset - chemical analysis to determine the origin of wines.

In [1]:
%reload_ext autoreload
%autoreload 2
In [2]:
from sklearn.datasets import load_wine
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from helpers.base_imports import *

Create new EDA

In [3]:
eda = EDA(name="winec")
eda
Loading 'edas.csv'
Creating experiment: 'winec'
Out[3]:
EDA: winec
Columns: Index(['description', 'n features', 'n samples', 'f/n ratio', 'noise', 'stats',
       'class balance', 'outliers', 'skewness', 'correlations',
       'DR potential'],
      dtype='object')
Datasets: Index([], dtype='object', name='dataset')

Get raw dataset from remote source

In [4]:
# fetch dataset
data = load_wine(as_frame=True)
data.frame.head(5)
Out[4]:
alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline target
0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 0
1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 0
2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 0
3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 0
4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 0
In [5]:
list(data.target_names)
Out[5]:
[np.str_('class_0'), np.str_('class_1'), np.str_('class_2')]
In [6]:
disp_df(data.frame)
alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline target
0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.640000 1.040 3.92 1065.0 0
1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.380000 1.050 3.40 1050.0 0
2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.680000 1.030 3.17 1185.0 0
3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.800000 0.860 3.45 1480.0 0
4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.320000 1.040 2.93 735.0 0
5 14.20 1.76 2.45 15.2 112.0 3.27 3.39 0.34 1.97 6.750000 1.050 2.85 1450.0 0
6 14.39 1.87 2.45 14.6 96.0 2.50 2.52 0.30 1.98 5.250000 1.020 3.58 1290.0 0
7 14.06 2.15 2.61 17.6 121.0 2.60 2.51 0.31 1.25 5.050000 1.060 3.58 1295.0 0
8 14.83 1.64 2.17 14.0 97.0 2.80 2.98 0.29 1.98 5.200000 1.080 2.85 1045.0 0
9 13.86 1.35 2.27 16.0 98.0 2.98 3.15 0.22 1.85 7.220000 1.010 3.55 1045.0 0
10 14.10 2.16 2.30 18.0 105.0 2.95 3.32 0.22 2.38 5.750000 1.250 3.17 1510.0 0
11 14.12 1.48 2.32 16.8 95.0 2.20 2.43 0.26 1.57 5.000000 1.170 2.82 1280.0 0
12 13.75 1.73 2.41 16.0 89.0 2.60 2.76 0.29 1.81 5.600000 1.150 2.90 1320.0 0
13 14.75 1.73 2.39 11.4 91.0 3.10 3.69 0.43 2.81 5.400000 1.250 2.73 1150.0 0
14 14.38 1.87 2.38 12.0 102.0 3.30 3.64 0.29 2.96 7.500000 1.200 3.00 1547.0 0
15 13.63 1.81 2.70 17.2 112.0 2.85 2.91 0.30 1.46 7.300000 1.280 2.88 1310.0 0
16 14.30 1.92 2.72 20.0 120.0 2.80 3.14 0.33 1.97 6.200000 1.070 2.65 1280.0 0
17 13.83 1.57 2.62 20.0 115.0 2.95 3.40 0.40 1.72 6.600000 1.130 2.57 1130.0 0
18 14.19 1.59 2.48 16.5 108.0 3.30 3.93 0.32 1.86 8.700000 1.230 2.82 1680.0 0
19 13.64 3.10 2.56 15.2 116.0 2.70 3.03 0.17 1.66 5.100000 0.960 3.36 845.0 0
20 14.06 1.63 2.28 16.0 126.0 3.00 3.17 0.24 2.10 5.650000 1.090 3.71 780.0 0
21 12.93 3.80 2.65 18.6 102.0 2.41 2.41 0.25 1.98 4.500000 1.030 3.52 770.0 0
22 13.71 1.86 2.36 16.6 101.0 2.61 2.88 0.27 1.69 3.800000 1.110 4.00 1035.0 0
23 12.85 1.60 2.52 17.8 95.0 2.48 2.37 0.26 1.46 3.930000 1.090 3.63 1015.0 0
24 13.50 1.81 2.61 20.0 96.0 2.53 2.61 0.28 1.66 3.520000 1.120 3.82 845.0 0
25 13.05 2.05 3.22 25.0 124.0 2.63 2.68 0.47 1.92 3.580000 1.130 3.20 830.0 0
26 13.39 1.77 2.62 16.1 93.0 2.85 2.94 0.34 1.45 4.800000 0.920 3.22 1195.0 0
27 13.30 1.72 2.14 17.0 94.0 2.40 2.19 0.27 1.35 3.950000 1.020 2.77 1285.0 0
28 13.87 1.90 2.80 19.4 107.0 2.95 2.97 0.37 1.76 4.500000 1.250 3.40 915.0 0
29 14.02 1.68 2.21 16.0 96.0 2.65 2.33 0.26 1.98 4.700000 1.040 3.59 1035.0 0
30 13.73 1.50 2.70 22.5 101.0 3.00 3.25 0.29 2.38 5.700000 1.190 2.71 1285.0 0
31 13.58 1.66 2.36 19.1 106.0 2.86 3.19 0.22 1.95 6.900000 1.090 2.88 1515.0 0
32 13.68 1.83 2.36 17.2 104.0 2.42 2.69 0.42 1.97 3.840000 1.230 2.87 990.0 0
33 13.76 1.53 2.70 19.5 132.0 2.95 2.74 0.50 1.35 5.400000 1.250 3.00 1235.0 0
34 13.51 1.80 2.65 19.0 110.0 2.35 2.53 0.29 1.54 4.200000 1.100 2.87 1095.0 0
35 13.48 1.81 2.41 20.5 100.0 2.70 2.98 0.26 1.86 5.100000 1.040 3.47 920.0 0
36 13.28 1.64 2.84 15.5 110.0 2.60 2.68 0.34 1.36 4.600000 1.090 2.78 880.0 0
37 13.05 1.65 2.55 18.0 98.0 2.45 2.43 0.29 1.44 4.250000 1.120 2.51 1105.0 0
38 13.07 1.50 2.10 15.5 98.0 2.40 2.64 0.28 1.37 3.700000 1.180 2.69 1020.0 0
39 14.22 3.99 2.51 13.2 128.0 3.00 3.04 0.20 2.08 5.100000 0.890 3.53 760.0 0
40 13.56 1.71 2.31 16.2 117.0 3.15 3.29 0.34 2.34 6.130000 0.950 3.38 795.0 0
41 13.41 3.84 2.12 18.8 90.0 2.45 2.68 0.27 1.48 4.280000 0.910 3.00 1035.0 0
42 13.88 1.89 2.59 15.0 101.0 3.25 3.56 0.17 1.70 5.430000 0.880 3.56 1095.0 0
43 13.24 3.98 2.29 17.5 103.0 2.64 2.63 0.32 1.66 4.360000 0.820 3.00 680.0 0
44 13.05 1.77 2.10 17.0 107.0 3.00 3.00 0.28 2.03 5.040000 0.880 3.35 885.0 0
45 14.21 4.04 2.44 18.9 111.0 2.85 2.65 0.30 1.25 5.240000 0.870 3.33 1080.0 0
46 14.38 3.59 2.28 16.0 102.0 3.25 3.17 0.27 2.19 4.900000 1.040 3.44 1065.0 0
47 13.90 1.68 2.12 16.0 101.0 3.10 3.39 0.21 2.14 6.100000 0.910 3.33 985.0 0
48 14.10 2.02 2.40 18.8 103.0 2.75 2.92 0.32 2.38 6.200000 1.070 2.75 1060.0 0
49 13.94 1.73 2.27 17.4 108.0 2.88 3.54 0.32 2.08 8.900000 1.120 3.10 1260.0 0
50 13.05 1.73 2.04 12.4 92.0 2.72 3.27 0.17 2.91 7.200000 1.120 2.91 1150.0 0
51 13.83 1.65 2.60 17.2 94.0 2.45 2.99 0.22 2.29 5.600000 1.240 3.37 1265.0 0
52 13.82 1.75 2.42 14.0 111.0 3.88 3.74 0.32 1.87 7.050000 1.010 3.26 1190.0 0
53 13.77 1.90 2.68 17.1 115.0 3.00 2.79 0.39 1.68 6.300000 1.130 2.93 1375.0 0
54 13.74 1.67 2.25 16.4 118.0 2.60 2.90 0.21 1.62 5.850000 0.920 3.20 1060.0 0
55 13.56 1.73 2.46 20.5 116.0 2.96 2.78 0.20 2.45 6.250000 0.980 3.03 1120.0 0
56 14.22 1.70 2.30 16.3 118.0 3.20 3.00 0.26 2.03 6.380000 0.940 3.31 970.0 0
57 13.29 1.97 2.68 16.8 102.0 3.00 3.23 0.31 1.66 6.000000 1.070 2.84 1270.0 0
58 13.72 1.43 2.50 16.7 108.0 3.40 3.67 0.19 2.04 6.800000 0.890 2.87 1285.0 0
59 12.37 0.94 1.36 10.6 88.0 1.98 0.57 0.28 0.42 1.950000 1.050 1.82 520.0 1
60 12.33 1.10 2.28 16.0 101.0 2.05 1.09 0.63 0.41 3.270000 1.250 1.67 680.0 1
61 12.64 1.36 2.02 16.8 100.0 2.02 1.41 0.53 0.62 5.750000 0.980 1.59 450.0 1
62 13.67 1.25 1.92 18.0 94.0 2.10 1.79 0.32 0.73 3.800000 1.230 2.46 630.0 1
63 12.37 1.13 2.16 19.0 87.0 3.50 3.10 0.19 1.87 4.450000 1.220 2.87 420.0 1
64 12.17 1.45 2.53 19.0 104.0 1.89 1.75 0.45 1.03 2.950000 1.450 2.23 355.0 1
65 12.37 1.21 2.56 18.1 98.0 2.42 2.65 0.37 2.08 4.600000 1.190 2.30 678.0 1
66 13.11 1.01 1.70 15.0 78.0 2.98 3.18 0.26 2.28 5.300000 1.120 3.18 502.0 1
67 12.37 1.17 1.92 19.6 78.0 2.11 2.00 0.27 1.04 4.680000 1.120 3.48 510.0 1
68 13.34 0.94 2.36 17.0 110.0 2.53 1.30 0.55 0.42 3.170000 1.020 1.93 750.0 1
69 12.21 1.19 1.75 16.8 151.0 1.85 1.28 0.14 2.50 2.850000 1.280 3.07 718.0 1
70 12.29 1.61 2.21 20.4 103.0 1.10 1.02 0.37 1.46 3.050000 0.906 1.82 870.0 1
71 13.86 1.51 2.67 25.0 86.0 2.95 2.86 0.21 1.87 3.380000 1.360 3.16 410.0 1
72 13.49 1.66 2.24 24.0 87.0 1.88 1.84 0.27 1.03 3.740000 0.980 2.78 472.0 1
73 12.99 1.67 2.60 30.0 139.0 3.30 2.89 0.21 1.96 3.350000 1.310 3.50 985.0 1
74 11.96 1.09 2.30 21.0 101.0 3.38 2.14 0.13 1.65 3.210000 0.990 3.13 886.0 1
75 11.66 1.88 1.92 16.0 97.0 1.61 1.57 0.34 1.15 3.800000 1.230 2.14 428.0 1
76 13.03 0.90 1.71 16.0 86.0 1.95 2.03 0.24 1.46 4.600000 1.190 2.48 392.0 1
77 11.84 2.89 2.23 18.0 112.0 1.72 1.32 0.43 0.95 2.650000 0.960 2.52 500.0 1
78 12.33 0.99 1.95 14.8 136.0 1.90 1.85 0.35 2.76 3.400000 1.060 2.31 750.0 1
79 12.70 3.87 2.40 23.0 101.0 2.83 2.55 0.43 1.95 2.570000 1.190 3.13 463.0 1
80 12.00 0.92 2.00 19.0 86.0 2.42 2.26 0.30 1.43 2.500000 1.380 3.12 278.0 1
81 12.72 1.81 2.20 18.8 86.0 2.20 2.53 0.26 1.77 3.900000 1.160 3.14 714.0 1
82 12.08 1.13 2.51 24.0 78.0 2.00 1.58 0.40 1.40 2.200000 1.310 2.72 630.0 1
83 13.05 3.86 2.32 22.5 85.0 1.65 1.59 0.61 1.62 4.800000 0.840 2.01 515.0 1
84 11.84 0.89 2.58 18.0 94.0 2.20 2.21 0.22 2.35 3.050000 0.790 3.08 520.0 1
85 12.67 0.98 2.24 18.0 99.0 2.20 1.94 0.30 1.46 2.620000 1.230 3.16 450.0 1
86 12.16 1.61 2.31 22.8 90.0 1.78 1.69 0.43 1.56 2.450000 1.330 2.26 495.0 1
87 11.65 1.67 2.62 26.0 88.0 1.92 1.61 0.40 1.34 2.600000 1.360 3.21 562.0 1
88 11.64 2.06 2.46 21.6 84.0 1.95 1.69 0.48 1.35 2.800000 1.000 2.75 680.0 1
89 12.08 1.33 2.30 23.6 70.0 2.20 1.59 0.42 1.38 1.740000 1.070 3.21 625.0 1
90 12.08 1.83 2.32 18.5 81.0 1.60 1.50 0.52 1.64 2.400000 1.080 2.27 480.0 1
91 12.00 1.51 2.42 22.0 86.0 1.45 1.25 0.50 1.63 3.600000 1.050 2.65 450.0 1
92 12.69 1.53 2.26 20.7 80.0 1.38 1.46 0.58 1.62 3.050000 0.960 2.06 495.0 1
93 12.29 2.83 2.22 18.0 88.0 2.45 2.25 0.25 1.99 2.150000 1.150 3.30 290.0 1
94 11.62 1.99 2.28 18.0 98.0 3.02 2.26 0.17 1.35 3.250000 1.160 2.96 345.0 1
95 12.47 1.52 2.20 19.0 162.0 2.50 2.27 0.32 3.28 2.600000 1.160 2.63 937.0 1
96 11.81 2.12 2.74 21.5 134.0 1.60 0.99 0.14 1.56 2.500000 0.950 2.26 625.0 1
97 12.29 1.41 1.98 16.0 85.0 2.55 2.50 0.29 1.77 2.900000 1.230 2.74 428.0 1
98 12.37 1.07 2.10 18.5 88.0 3.52 3.75 0.24 1.95 4.500000 1.040 2.77 660.0 1
99 12.29 3.17 2.21 18.0 88.0 2.85 2.99 0.45 2.81 2.300000 1.420 2.83 406.0 1
100 12.08 2.08 1.70 17.5 97.0 2.23 2.17 0.26 1.40 3.300000 1.270 2.96 710.0 1
101 12.60 1.34 1.90 18.5 88.0 1.45 1.36 0.29 1.35 2.450000 1.040 2.77 562.0 1
102 12.34 2.45 2.46 21.0 98.0 2.56 2.11 0.34 1.31 2.800000 0.800 3.38 438.0 1
103 11.82 1.72 1.88 19.5 86.0 2.50 1.64 0.37 1.42 2.060000 0.940 2.44 415.0 1
104 12.51 1.73 1.98 20.5 85.0 2.20 1.92 0.32 1.48 2.940000 1.040 3.57 672.0 1
105 12.42 2.55 2.27 22.0 90.0 1.68 1.84 0.66 1.42 2.700000 0.860 3.30 315.0 1
106 12.25 1.73 2.12 19.0 80.0 1.65 2.03 0.37 1.63 3.400000 1.000 3.17 510.0 1
107 12.72 1.75 2.28 22.5 84.0 1.38 1.76 0.48 1.63 3.300000 0.880 2.42 488.0 1
108 12.22 1.29 1.94 19.0 92.0 2.36 2.04 0.39 2.08 2.700000 0.860 3.02 312.0 1
109 11.61 1.35 2.70 20.0 94.0 2.74 2.92 0.29 2.49 2.650000 0.960 3.26 680.0 1
110 11.46 3.74 1.82 19.5 107.0 3.18 2.58 0.24 3.58 2.900000 0.750 2.81 562.0 1
111 12.52 2.43 2.17 21.0 88.0 2.55 2.27 0.26 1.22 2.000000 0.900 2.78 325.0 1
112 11.76 2.68 2.92 20.0 103.0 1.75 2.03 0.60 1.05 3.800000 1.230 2.50 607.0 1
113 11.41 0.74 2.50 21.0 88.0 2.48 2.01 0.42 1.44 3.080000 1.100 2.31 434.0 1
114 12.08 1.39 2.50 22.5 84.0 2.56 2.29 0.43 1.04 2.900000 0.930 3.19 385.0 1
115 11.03 1.51 2.20 21.5 85.0 2.46 2.17 0.52 2.01 1.900000 1.710 2.87 407.0 1
116 11.82 1.47 1.99 20.8 86.0 1.98 1.60 0.30 1.53 1.950000 0.950 3.33 495.0 1
117 12.42 1.61 2.19 22.5 108.0 2.00 2.09 0.34 1.61 2.060000 1.060 2.96 345.0 1
118 12.77 3.43 1.98 16.0 80.0 1.63 1.25 0.43 0.83 3.400000 0.700 2.12 372.0 1
119 12.00 3.43 2.00 19.0 87.0 2.00 1.64 0.37 1.87 1.280000 0.930 3.05 564.0 1
120 11.45 2.40 2.42 20.0 96.0 2.90 2.79 0.32 1.83 3.250000 0.800 3.39 625.0 1
121 11.56 2.05 3.23 28.5 119.0 3.18 5.08 0.47 1.87 6.000000 0.930 3.69 465.0 1
122 12.42 4.43 2.73 26.5 102.0 2.20 2.13 0.43 1.71 2.080000 0.920 3.12 365.0 1
123 13.05 5.80 2.13 21.5 86.0 2.62 2.65 0.30 2.01 2.600000 0.730 3.10 380.0 1
124 11.87 4.31 2.39 21.0 82.0 2.86 3.03 0.21 2.91 2.800000 0.750 3.64 380.0 1
125 12.07 2.16 2.17 21.0 85.0 2.60 2.65 0.37 1.35 2.760000 0.860 3.28 378.0 1
126 12.43 1.53 2.29 21.5 86.0 2.74 3.15 0.39 1.77 3.940000 0.690 2.84 352.0 1
127 11.79 2.13 2.78 28.5 92.0 2.13 2.24 0.58 1.76 3.000000 0.970 2.44 466.0 1
128 12.37 1.63 2.30 24.5 88.0 2.22 2.45 0.40 1.90 2.120000 0.890 2.78 342.0 1
129 12.04 4.30 2.38 22.0 80.0 2.10 1.75 0.42 1.35 2.600000 0.790 2.57 580.0 1
130 12.86 1.35 2.32 18.0 122.0 1.51 1.25 0.21 0.94 4.100000 0.760 1.29 630.0 2
131 12.88 2.99 2.40 20.0 104.0 1.30 1.22 0.24 0.83 5.400000 0.740 1.42 530.0 2
132 12.81 2.31 2.40 24.0 98.0 1.15 1.09 0.27 0.83 5.700000 0.660 1.36 560.0 2
133 12.70 3.55 2.36 21.5 106.0 1.70 1.20 0.17 0.84 5.000000 0.780 1.29 600.0 2
134 12.51 1.24 2.25 17.5 85.0 2.00 0.58 0.60 1.25 5.450000 0.750 1.51 650.0 2
135 12.60 2.46 2.20 18.5 94.0 1.62 0.66 0.63 0.94 7.100000 0.730 1.58 695.0 2
136 12.25 4.72 2.54 21.0 89.0 1.38 0.47 0.53 0.80 3.850000 0.750 1.27 720.0 2
137 12.53 5.51 2.64 25.0 96.0 1.79 0.60 0.63 1.10 5.000000 0.820 1.69 515.0 2
138 13.49 3.59 2.19 19.5 88.0 1.62 0.48 0.58 0.88 5.700000 0.810 1.82 580.0 2
139 12.84 2.96 2.61 24.0 101.0 2.32 0.60 0.53 0.81 4.920000 0.890 2.15 590.0 2
140 12.93 2.81 2.70 21.0 96.0 1.54 0.50 0.53 0.75 4.600000 0.770 2.31 600.0 2
141 13.36 2.56 2.35 20.0 89.0 1.40 0.50 0.37 0.64 5.600000 0.700 2.47 780.0 2
142 13.52 3.17 2.72 23.5 97.0 1.55 0.52 0.50 0.55 4.350000 0.890 2.06 520.0 2
143 13.62 4.95 2.35 20.0 92.0 2.00 0.80 0.47 1.02 4.400000 0.910 2.05 550.0 2
144 12.25 3.88 2.20 18.5 112.0 1.38 0.78 0.29 1.14 8.210000 0.650 2.00 855.0 2
145 13.16 3.57 2.15 21.0 102.0 1.50 0.55 0.43 1.30 4.000000 0.600 1.68 830.0 2
146 13.88 5.04 2.23 20.0 80.0 0.98 0.34 0.40 0.68 4.900000 0.580 1.33 415.0 2
147 12.87 4.61 2.48 21.5 86.0 1.70 0.65 0.47 0.86 7.650000 0.540 1.86 625.0 2
148 13.32 3.24 2.38 21.5 92.0 1.93 0.76 0.45 1.25 8.420000 0.550 1.62 650.0 2
149 13.08 3.90 2.36 21.5 113.0 1.41 1.39 0.34 1.14 9.400000 0.570 1.33 550.0 2
150 13.50 3.12 2.62 24.0 123.0 1.40 1.57 0.22 1.25 8.600000 0.590 1.30 500.0 2
151 12.79 2.67 2.48 22.0 112.0 1.48 1.36 0.24 1.26 10.800000 0.480 1.47 480.0 2
152 13.11 1.90 2.75 25.5 116.0 2.20 1.28 0.26 1.56 7.100000 0.610 1.33 425.0 2
153 13.23 3.30 2.28 18.5 98.0 1.80 0.83 0.61 1.87 10.520000 0.560 1.51 675.0 2
154 12.58 1.29 2.10 20.0 103.0 1.48 0.58 0.53 1.40 7.600000 0.580 1.55 640.0 2
155 13.17 5.19 2.32 22.0 93.0 1.74 0.63 0.61 1.55 7.900000 0.600 1.48 725.0 2
156 13.84 4.12 2.38 19.5 89.0 1.80 0.83 0.48 1.56 9.010000 0.570 1.64 480.0 2
157 12.45 3.03 2.64 27.0 97.0 1.90 0.58 0.63 1.14 7.500000 0.670 1.73 880.0 2
158 14.34 1.68 2.70 25.0 98.0 2.80 1.31 0.53 2.70 13.000000 0.570 1.96 660.0 2
159 13.48 1.67 2.64 22.5 89.0 2.60 1.10 0.52 2.29 11.750000 0.570 1.78 620.0 2
160 12.36 3.83 2.38 21.0 88.0 2.30 0.92 0.50 1.04 7.650000 0.560 1.58 520.0 2
161 13.69 3.26 2.54 20.0 107.0 1.83 0.56 0.50 0.80 5.880000 0.960 1.82 680.0 2
162 12.85 3.27 2.58 22.0 106.0 1.65 0.60 0.60 0.96 5.580000 0.870 2.11 570.0 2
163 12.96 3.45 2.35 18.5 106.0 1.39 0.70 0.40 0.94 5.280000 0.680 1.75 675.0 2
164 13.78 2.76 2.30 22.0 90.0 1.35 0.68 0.41 1.03 9.580000 0.700 1.68 615.0 2
165 13.73 4.36 2.26 22.5 88.0 1.28 0.47 0.52 1.15 6.620000 0.780 1.75 520.0 2
166 13.45 3.70 2.60 23.0 111.0 1.70 0.92 0.43 1.46 10.680000 0.850 1.56 695.0 2
167 12.82 3.37 2.30 19.5 88.0 1.48 0.66 0.40 0.97 10.260000 0.720 1.75 685.0 2
168 13.58 2.58 2.69 24.5 105.0 1.55 0.84 0.39 1.54 8.660000 0.740 1.80 750.0 2
169 13.40 4.60 2.86 25.0 112.0 1.98 0.96 0.27 1.11 8.500000 0.670 1.92 630.0 2
170 12.20 3.03 2.32 19.0 96.0 1.25 0.49 0.40 0.73 5.500000 0.660 1.83 510.0 2
171 12.77 2.39 2.28 19.5 86.0 1.39 0.51 0.48 0.64 9.899999 0.570 1.63 470.0 2
172 14.16 2.51 2.48 20.0 91.0 1.68 0.70 0.44 1.24 9.700000 0.620 1.71 660.0 2
173 13.71 5.65 2.45 20.5 95.0 1.68 0.61 0.52 1.06 7.700000 0.640 1.74 740.0 2
174 13.40 3.91 2.48 23.0 102.0 1.80 0.75 0.43 1.41 7.300000 0.700 1.56 750.0 2
175 13.27 4.28 2.26 20.0 120.0 1.59 0.69 0.43 1.35 10.200000 0.590 1.56 835.0 2
176 13.17 2.59 2.37 20.0 120.0 1.65 0.68 0.53 1.46 9.300000 0.600 1.62 840.0 2
177 14.13 4.10 2.74 24.5 96.0 2.05 0.76 0.56 1.35 9.200000 0.610 1.60 560.0 2
In [7]:
X = data.frame.drop(columns="target")
y = data.frame.target
X.shape, y.shape
Out[7]:
((178, 13), (178,))
In [9]:
eda.update_param(
    "description", "Classify wines into 3 classes based on chemical analysis"
)
eda.update_param("n features", X.shape[1])
eda.update_param("n samples", X.shape[0])
eda.update_param("f/n ratio", len(data.target_names) / X.shape[0])
In [10]:
eda.summary_df
Out[10]:
description n features n samples f/n ratio noise stats class balance outliers skewness correlations DR potential
dataset
winec Classify wines into 3 classes based on chemica... 13 178 0.016854 NaN NaN NaN NaN NaN NaN NaN

Noise

In [11]:
# check for missing values
X.isna().sum().sum()
Out[11]:
np.int64(0)
In [12]:
eda.update_param("noise", "None, no missing vals")

Stats

In [13]:
skewness = X.skew()
summary_stats = X.describe().T
summary_stats["skewness"] = skewness
disp_df(summary_stats[["min", "max", "mean", "std", "skewness", "25%", "50%", "75%"]])
min max mean std skewness 25% 50% 75%
alcohol 11.03 14.83 13.000618 0.811827 -0.051482 12.3625 13.050 13.6775
malic_acid 0.74 5.80 2.336348 1.117146 1.039651 1.6025 1.865 3.0825
ash 1.36 3.23 2.366517 0.274344 -0.176699 2.2100 2.360 2.5575
alcalinity_of_ash 10.60 30.00 19.494944 3.339564 0.213047 17.2000 19.500 21.5000
magnesium 70.00 162.00 99.741573 14.282484 1.098191 88.0000 98.000 107.0000
total_phenols 0.98 3.88 2.295112 0.625851 0.086639 1.7425 2.355 2.8000
flavanoids 0.34 5.08 2.029270 0.998859 0.025344 1.2050 2.135 2.8750
nonflavanoid_phenols 0.13 0.66 0.361854 0.124453 0.450151 0.2700 0.340 0.4375
proanthocyanins 0.41 3.58 1.590899 0.572359 0.517137 1.2500 1.555 1.9500
color_intensity 1.28 13.00 5.058090 2.318286 0.868585 3.2200 4.690 6.2000
hue 0.48 1.71 0.957449 0.228572 0.021091 0.7825 0.965 1.1200
od280/od315_of_diluted_wines 1.27 4.00 2.611685 0.709990 -0.307285 1.9375 2.780 3.1700
proline 278.00 1680.00 746.893258 314.907474 0.767822 500.5000 673.500 985.0000
In [14]:
fig, ax = plot_feature_statistics(X, X.columns, line=False)
fig.savefig(f"{FIGS_DIR}/{eda.name}_feature-statistics.png")
In [15]:
eda.update_param("skewness", "some skewness")
eda.update_param("stats", "fine")
eda.update_param("outliers", "some outliers in ~7 features")
In [16]:
# class distribution of whole dataset
ax = sns.countplot(x=data.target_names[y])
plt.title(f"Target Class Distribution ({eda.name})")
plt.xlabel("Class")
plt.ylabel("Count")

# Annotate each bar with the count
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height}",
        (p.get_x() + p.get_width() / 2.0, height),
        ha="center",
        va="center",
        xytext=(0, 5),
        textcoords="offset points",
    )

plt.savefig(f"{FIGS_DIR}/{eda.name}_target-class-distribution.png")
plt.show()
In [17]:
eda.update_param("class balance", "~Imbalanced (59, 71, 48)")

Feature Correlations

In [18]:
df = data.frame.copy()
df["target"] = data.target_names[y]
# df.head(5)
sns.pairplot(data=df, hue="target", palette="bright")
plt.savefig(f"{FIGS_DIR}/{eda.name}_pairplot.png")
In [19]:
# Create a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(X.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title(f"Correlation Matrix ({eda.name})")

plt.savefig(f"{FIGS_DIR}/{eda.name}_correlation-matrix.png")
plt.show()
In [20]:
eda.update_param("correlations", "a couple medium-strong correlations, but not much")

Dimensionality Reduction Potential

In [21]:
# PCA - number of components to explain 95% variance
pca_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA()),
    ]
)
pca_pipe.fit(X)
Out[21]:
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
In [22]:
explained_variance_ratio = pca_pipe.named_steps["pca"].explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

plt.figure(figsize=(8, 6))
plt.plot(cumulative_explained_variance, marker="o", linestyle="--")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title(f"PCA - Cumulative Explained Variance ({eda.name})")
plt.axhline(y=0.95, color="r", linestyle="--")  # Threshold for 95% explained variance
plt.show()

# Number of components to explain 95% variance
num_components_95 = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f"Number of components to explain 95% of the variance: {num_components_95}")
Number of components to explain 95% of the variance: 10
In [23]:
# ICA - number of independent components
ica_pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("ica", FastICA()),
    ]
)
components = ica_pipe.fit_transform(X)

# Number of independent components
num_independent_components = components.shape[1]
print(f"Number of independent components found: {num_independent_components}")
Number of independent components found: 13
/Users/yarik/vc_projects/ML/ml-prep/.venv/lib/python3.12/site-packages/sklearn/decomposition/_fastica.py:128: ConvergenceWarning: FastICA did not converge. Consider increasing tolerance or the maximum number of iterations.
  warnings.warn(
In [24]:
eda.update_param(
    "DR potential",
    "PCA: 95% variance explained with 10 components\nICA: 13 independent components",
)

Save EDA results

In [25]:
eda.save()
Loading 'edas.csv'
Saving winec to results/edas.csv

Create and save a shuffled 80/20 train/test split



< EDA Iris | Contents | EDA Wine Quality Prediction binary >