Here, I tried playing with a simple dataset about abalone shells. It consists of several physical measurements, and the goal is to predict the age of the shell from them. Age is determined by the number of rings the shell has, but counting the rings is tedious. I will try building a linear model to predict the number of rings.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from scipy.stats import chi2
sns.set()
df = pd.read_csv("./abalone.csv")
Sneak peek into the dataset.
df
|  | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
|---|---|---|---|---|---|---|---|---|---|
0 | M | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.1500 | 15 |
1 | M | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.0700 | 7 |
2 | F | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.2100 | 9 |
3 | M | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.1550 | 10 |
4 | I | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.0550 | 7 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4172 | F | 0.565 | 0.450 | 0.165 | 0.8870 | 0.3700 | 0.2390 | 0.2490 | 11 |
4173 | M | 0.590 | 0.440 | 0.135 | 0.9660 | 0.4390 | 0.2145 | 0.2605 | 10 |
4174 | M | 0.600 | 0.475 | 0.205 | 1.1760 | 0.5255 | 0.2875 | 0.3080 | 9 |
4175 | F | 0.625 | 0.485 | 0.150 | 1.0945 | 0.5310 | 0.2610 | 0.2960 | 10 |
4176 | M | 0.710 | 0.555 | 0.195 | 1.9485 | 0.9455 | 0.3765 | 0.4950 | 12 |
4177 rows × 9 columns
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Sex             4177 non-null   object
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB
None
We have one column with categorical data ($\texttt{Sex}$), seven with continuous data ($\texttt{Length}$, $\texttt{Diameter}$, $\texttt{Height}$, $\texttt{Whole weight}$, $\texttt{Shucked weight}$, $\texttt{Viscera weight}$, $\texttt{Shell weight}$) and one column with discrete data ($\texttt{Rings}$).
A note on the different weight variables: $\texttt{Shucked weight}$ is the weight of the meat, $\texttt{Viscera weight}$ is the gut weight after bleeding, and $\texttt{Shell weight}$ is the weight of the shell after drying.
df.duplicated().sum()
0
We see there are no missing values (from $\texttt{df.info()}$ above) and no duplicate rows in the dataset.
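For an explicit sanity check of the missing values (the $\texttt{df.info()}$ output above already implies there are none), one could run:
# Count missing values per column; every entry is expected to be 0
df.isna().sum()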
Let us now look into summary statistics.
df[['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']].describe()
|  | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
|---|---|---|---|---|---|---|---|---|
count | 4177.000000 | 4177.000000 | 4177.000000 | 4177.000000 | 4177.000000 | 4177.000000 | 4177.000000 | 4177.000000 |
mean | 0.523992 | 0.407881 | 0.139516 | 0.828742 | 0.359367 | 0.180594 | 0.238831 | 9.933684 |
std | 0.120093 | 0.099240 | 0.041827 | 0.490389 | 0.221963 | 0.109614 | 0.139203 | 3.224169 |
min | 0.075000 | 0.055000 | 0.000000 | 0.002000 | 0.001000 | 0.000500 | 0.001500 | 1.000000 |
25% | 0.450000 | 0.350000 | 0.115000 | 0.441500 | 0.186000 | 0.093500 | 0.130000 | 8.000000 |
50% | 0.545000 | 0.425000 | 0.140000 | 0.799500 | 0.336000 | 0.171000 | 0.234000 | 9.000000 |
75% | 0.615000 | 0.480000 | 0.165000 | 1.153000 | 0.502000 | 0.253000 | 0.329000 | 11.000000 |
max | 0.815000 | 0.650000 | 1.130000 | 2.825500 | 1.488000 | 0.760000 | 1.005000 | 29.000000 |
Notice that the minimum value in the $\texttt{Height}$ column is 0. We will remove those data points, since they essentially have a missing value.
print(df[df['Height'] == 0])
df = df[df['Height'] > 0]
     Sex  Length  Diameter  Height  Whole weight  Shucked weight  \
1257   I   0.430      0.34     0.0         0.428          0.2065
3996   I   0.315      0.23     0.0         0.134          0.0575

      Viscera weight  Shell weight  Rings
1257          0.0860        0.1150      8
3996          0.0285        0.3505      6
For more succinct code, we will define the list of continuous variables.
num_vars = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']
sns.countplot(df, x='Sex')
<Axes: xlabel='Sex', ylabel='count'>
Notice we have two types of specimens in the dataset: infants and adults (male and female).
sns.displot(df, x='Rings', discrete=True, kde=True, hue='Sex')
<seaborn.axisgrid.FacetGrid at 0x31e0c1650>
Observe that sexual maturity is reached at different ages for different shells in the dataset. We can see this from the graph of the distribution of the $\texttt{Rings}$ variable stratified by sex ($\texttt{Infant}$, $\texttt{Female}$, $\texttt{Male}$).
Next, let's look at the box plots to familiarize ourselves with the outliers.
sns.set(rc={"figure.figsize":(8, 28)})
fig, axes = plt.subplots(7, 1)
for i, var in enumerate(num_vars):
    sns.boxplot(df, x=var, ax=axes[i])
sns.set(rc={"figure.figsize":(8, 4)})
Notice that quite a few data points fall into the outlier category.
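To put a rough number on that, here is a minimal sketch using the 1.5 × IQR whisker rule that the box plots above are based on:
# Fraction of points outside the 1.5*IQR whiskers, per continuous feature
q1 = df[num_vars].quantile(0.25)
q3 = df[num_vars].quantile(0.75)
iqr = q3 - q1
((df[num_vars] < q1 - 1.5 * iqr) | (df[num_vars] > q3 + 1.5 * iqr)).mean()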
Visualizing the relationship with the response variable - number of rings
sns.set(rc={"figure.figsize":(8, 28)})
fig, axes = plt.subplots(7, 1)
for i, var in enumerate(num_vars):
    sns.scatterplot(df, x=var, y='Rings', ax=axes[i])
sns.set(rc={"figure.figsize":(8, 4)})
sns.heatmap(df[num_vars].corr())
<Axes: >
Several features are highly correlated, for example $\texttt{Length}$ and $\texttt{Diameter}$ (as expected).
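To list the strongest pairwise correlations explicitly, here is a small sketch (the 0.9 threshold is an arbitrary choice):
# Pairs of continuous features whose absolute correlation exceeds 0.9
corr = df[num_vars].corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1)).stack()
upper[upper.abs() > 0.9].sort_values(ascending=False)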
Let us also plot the pairwise relationships between the explanatory variables.
sns.pairplot(data=df[num_vars + ['Sex']], hue='Sex', corner=True)
<seaborn.axisgrid.PairGrid at 0x323f57050>
There are several observations we can make from these plots. In particular, there are some outliers, especially for the $\texttt{Height}$ variable. There are several ways to handle outliers; here we will identify them using the Mahalanobis distance.
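As a reminder (standard definitions; $\mu$ is the mean vector and $\Sigma$ the covariance matrix of the continuous features), the squared Mahalanobis distance of an observation $x$ is

$$D_M^2(x) = (x - \mu)^T \Sigma^{-1} (x - \mu),$$

and for approximately multivariate normal data it roughly follows a $\chi^2$ distribution with as many degrees of freedom as there are features, which motivates the $\chi^2$-based cutoff used below.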
cov = np.cov(df[num_vars], rowvar=False)
cov_inv = np.linalg.inv(cov)
means = np.mean(df[num_vars], axis=0)
distances = []
for i, example in df[num_vars].iterrows():
    distances.append((example - means).T @ cov_inv @ (example - means))
distances = np.array(distances)
df['M-Distance'] = distances
cutoff = chi2.ppf(0.95, len(num_vars))
np.sum(distances > cutoff) / df.shape[0]
0.08694610778443086
Based on Mahalanobis distance, we would label around 8.7% of datapoints as outliers. Let us see the feature distributions and bivariate plots again.
df_in = df[distances <= cutoff]
sns.pairplot(data=df_in[num_vars+ ['Sex']], hue='Sex', corner=True)
<seaborn.axisgrid.PairGrid at 0x30e54f910>
Considering we have three categories for the $\texttt{Sex}$ attribute, we will one-hot encode it.
df_in = pd.get_dummies(df_in)
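A small caveat: the three dummy columns always sum to one, which is perfectly collinear with the intercept term that LinearRegression adds by default. The model still fits such a design matrix, but the individual coefficients become harder to interpret. A possible alternative, shown here only for illustration and not used below, would be to drop one level:
# Illustration only: drop one dummy level to avoid the redundant indicator column
pd.get_dummies(df[['Sex']], drop_first=True).head()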
Since we have decided to use linear regression, we will apply z-score normalization. Recall that the mean and variance have to be estimated on the training data only, not on the whole dataset; otherwise, we would have data leakage.
y = df_in['Rings']
X = df_in
X = X.drop(['Rings', 'M-Distance'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)
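For reference, the same "fit on the training split only, then transform both splits" pattern can be written with scikit-learn's StandardScaler; this is just a sketch and is not used in what follows (the manual version below is equivalent):
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only to avoid leakage,
# then apply the fitted transformation to both splits.
scaler = StandardScaler()
X_train_sc = X_train.copy()
X_test_sc = X_test.copy()
X_train_sc[num_vars] = scaler.fit_transform(X_train[num_vars])
X_test_sc[num_vars] = scaler.transform(X_test[num_vars])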
Now, we will estimate the mean and standard deviation on the training set and use them to transform both the training and test data.
mean = np.mean(X_train[num_vars], axis=0)
std = np.std(X_train[num_vars], axis=0)
X_train[num_vars] = (X_train[num_vars] - mean) / std
X_test[num_vars] = (X_test[num_vars] - mean) / std
X_train
|  | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Sex_F | Sex_I | Sex_M |
|---|---|---|---|---|---|---|---|---|---|---|
3620 | 1.108662 | 1.069035 | 0.962771 | 1.148178 | 1.176251 | 1.720532 | 0.765672 | True | False | False |
2244 | -1.454991 | -1.323513 | -1.038572 | -1.268759 | -1.240889 | -1.264593 | -1.143890 | False | True | False |
139 | -1.275085 | -1.269136 | -1.181525 | -1.209326 | -1.228073 | -1.141917 | -1.081622 | False | False | True |
3214 | 0.478993 | 1.014659 | 0.676865 | 0.744966 | 0.596957 | 1.081592 | 0.765672 | True | False | False |
3757 | 0.029229 | 0.090265 | 0.105052 | -0.169833 | 0.020227 | -0.410970 | -0.234773 | False | True | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2969 | 1.513449 | 1.123411 | 1.534583 | 1.773971 | 1.699152 | 2.415698 | 1.554403 | False | False | True |
639 | -0.060724 | -0.018487 | -0.180854 | -0.462336 | -0.597515 | -0.334297 | -0.230621 | True | False | False |
1922 | 0.838804 | 0.851530 | 0.105052 | 0.603959 | 0.522623 | 0.943581 | 0.392062 | False | False | True |
2682 | 0.928756 | 0.851530 | 0.390958 | 0.768273 | 0.825086 | 0.754455 | 0.682647 | False | False | True |
3405 | -1.589920 | -1.649769 | -1.753338 | -1.387624 | -1.317787 | -1.315708 | -1.434475 | False | True | False |
2859 rows × 10 columns
model = LinearRegression()
model.fit(X_train, y_train)
model.coef_
array([-0.05974412, 0.91302825, 0.74466083, 7.13957947, -5.63901424, -2.0749007 , 0.22610161, 0.24733179, -0.55284362, 0.30551183])
y_pred = model.predict(X_test)
sns.histplot(y_pred, discrete=True)
<Axes: ylabel='Count'>
Let us check the regression score:
print("R2 score: ", 1 - sum((y_test-y_pred) ** 2) / sum((y_test - np.mean(y_test)) ** 2))
R2 score: 0.5222381351571055
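As a sanity check, the same value can be computed with the built-in metric from $\texttt{sklearn}$:
from sklearn.metrics import r2_score
# Should agree with the manually computed R2 above
r2_score(y_test, y_pred)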
We see that the prediction model has rather poor performance ($R^2 \approx 0.52$).
Noticing that the "size" features and the "weight" features are correlated among themselves, let us see if the data can be captured "nicely" by just two features. The data is already standardized, so we can start by computing the covariance matrix.
m = X_train.shape[0]
Sigma = (1 / m) * np.dot( X_train[num_vars].T, X_train[num_vars])
Lambda, U = np.linalg.eig(Sigma)
np.round(U, 4)
np.round(Lambda, 3)
array([6.549e+00, 1.810e-01, 1.230e-01, 1.100e-02, 3.000e-03, 5.600e-02, 7.700e-02])
Notice that the first two eigenvalues are the biggest ones ($\texttt{np.linalg.eig}$ does not sort them, but here the first two listed happen to be the largest). Looking at the values, even the first component alone explains most of the variance compared to the others, but we will stick with two. Now that we have the eigenvectors, let's check whether the first two suffice for the decomposition.
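Before doing that, we can quantify how much variance each component accounts for (a quick sketch):
# Explained-variance ratio per component (sorted largest first) and its cumulative sum
ratios = np.sort(Lambda)[::-1] / np.sum(Lambda)
print(np.round(ratios, 4))
print(np.round(np.cumsum(ratios), 4))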
U_r = U[:, :2]
print(U_r)
[[-0.38042627 -0.17376591]
 [-0.3808758  -0.21901104]
 [-0.36482612 -0.69848051]
 [-0.38668157  0.28072917]
 [-0.3755808   0.49316573]
 [-0.37785864  0.33224088]
 [-0.37914464 -0.03948732]]
Z_train = np.matmul(U_r.T, X_train[num_vars].T.to_numpy())
plt.scatter(Z_train[0, :], Z_train[1, :])
<matplotlib.collections.PathCollection at 0x30eab6e10>
Let us check if the reconstruction error in the data is small enough.
X_train_approx = (U_r @ Z_train).T
numerator = (X_train_approx - X_train[num_vars]).to_numpy()
numerator = np.square(numerator)
numerator = np.sum(numerator)
numerator
denominator = X_train[num_vars].to_numpy()
denominator = np.square(denominator)
denominator = np.sum(denominator)
denominator
1 - numerator / denominator
In the cell above, we have first calculated the matrix $X_{\text{train,approx}}$, which is the reconstruction of the $x^{(i)}$'s from the $z^{(i)}$'s and the reduced eigenvector matrix $U_r$. Recall that $x^{(i)}$ is the $i$th training example and corresponds to the $i$th row of the $X_{\text{train}}$ matrix, while $z^{(i)}$ is the projection of that example onto the subspace spanned by the columns of $U_r$.
Next, we calculate $\|x^{(i)} - x^{(i)}_{\text{approx}}\|^2$ and $\|x^{(i)}\|^2$, sum each over $i$, and take their ratio to obtain the fraction of retained variance, defined as $1-\frac{\sum_i \|x^{(i)} - x^{(i)}_{\text{approx}}\|^2}{\sum_i \|x^{(i)}\|^2}$. From the eigenvalues above, this ratio equals $(\lambda_1 + \lambda_2)/\sum_j \lambda_j \approx 0.96$, so the two components retain roughly 96% of the variance. That falls a bit short of the 99% threshold often used in practice, but it still captures the vast majority of the variance, so we will keep two components.
To validate the calculations, we will check the results of the PCA done by the $\texttt{sklearn}$ library.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X_train[num_vars])
pca.components_.T
array([[ 0.38042627,  0.17376591],
       [ 0.3808758 ,  0.21901104],
       [ 0.36482612,  0.69848051],
       [ 0.38668157, -0.28072917],
       [ 0.3755808 , -0.49316573],
       [ 0.37785864, -0.33224088],
       [ 0.37914464,  0.03948732]])
It gives the same vectors up to sign (eigenvectors are only determined up to a sign flip), so our calculation was correct. We can conclude that it suffices to use only the two extracted features, reducing the computational requirements while retaining most of the information.
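A quick programmatic check of that statement (a small sketch; here both sklearn components happen to be the exact negations of our eigenvectors):
# The two bases agree element-wise up to a sign flip per component
print(np.allclose(pca.components_.T, -U_r))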
One peculiarity of the dataset that could be exploited is that the sex of the shell appears largely unrelated to its age. Stratifying the data by maturity would then give us data points that follow a Gaussian distribution more closely.
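A minimal sketch of what that stratification could look like (purely illustrative and not evaluated here):
# Hypothetical split by maturity: infants vs. adults (males and females pooled)
infants = df_in[df_in['Sex_I']]
adults = df_in[~df_in['Sex_I']]
print(infants['Rings'].describe())
print(adults['Rings'].describe())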