PROJECT REPORT¶
INTRODUCTION:¶
The Obesity Level Estimation Dataset contains information about individuals from Mexico, Peru, and Colombia, focusing on their eating habits and physical condition to estimate obesity levels. The dataset comprises 2,111 records with 17 attributes, including demographic, dietary, and lifestyle factors.¶
The target variable, NObeyesdad (Obesity Level), classifies individuals into 7 categories: Insufficient_Weight, Normal_Weight, Overweight_Level_I, Overweight_Level_II, Obesity_Type_I, Obesity_Type_II, and Obesity_Type_III.¶
PROBLEM STATEMENT:¶
"The objective of this project is to estimate the obesity level of individuals based on their eating habits, physical condition, and lifestyle attributes. The analysis includes data preprocessing, visualization, model building, evaluation, and inference generation."¶
# Start of the practical work
1. Data Collection:¶
# Import necessary libraries
import pandas as pd # For data handling
import numpy as np # For numerical operations
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
A. Loading the dataset:¶
# Provide the correct path to the CSV file
data = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')
B. Checking Basic Information:¶
# Displaying the first 5 rows of the dataset to understand how it looks
print("First 5 rows of the dataset:")
data.head()
First 5 rows of the dataset:
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 21.0 | 1.62 | 64.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 0.0 | 1.0 | no | Public_Transportation | Normal_Weight |
1 | Female | 21.0 | 1.52 | 56.0 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.0 | yes | 3.0 | 0.0 | Sometimes | Public_Transportation | Normal_Weight |
2 | Male | 23.0 | 1.80 | 77.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 1.0 | Frequently | Public_Transportation | Normal_Weight |
3 | Male | 27.0 | 1.80 | 87.0 | no | no | 3.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 0.0 | Frequently | Walking | Overweight_Level_I |
4 | Male | 22.0 | 1.78 | 89.8 | no | no | 2.0 | 1.0 | Sometimes | no | 2.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_II |
# Checking the shape of the dataset (number of rows and columns)
print("\nShape of the dataset (Rows, Columns):")
print(data.shape)
Shape of the dataset (Rows, Columns):
(2111, 17)
# Checking the column names and data types
print("\nColumn names and data types:")
print(data.dtypes)
Column names and data types:
Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object
dtype: object
# Getting basic information about the dataset
# This shows total non-null values, data types, and memory usage
print("\nBasic Information of Dataset:")
print(data.info())
Basic Information of Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   Gender                          2111 non-null   object
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object
 5   FAVC                            2111 non-null   object
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object
 9   SMOKE                           2111 non-null   object
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object
 12  FAF                             2111 non-null   float64
 13  TUE                             2111 non-null   float64
 14  CALC                            2111 non-null   object
 15  MTRANS                          2111 non-null   object
 16  NObeyesdad                      2111 non-null   object
dtypes: float64(8), object(9)
memory usage: 280.5+ KB
None
# Get basic statistical details (only for numerical columns)
print("\nStatistical Summary of Numerical Columns:")
print(data.describe())
Statistical Summary of Numerical Columns:
               Age       Height       Weight         FCVC          NCP
count  2111.000000  2111.000000  2111.000000  2111.000000  2111.000000
mean     24.312600     1.701677    86.586058     2.419043     2.685628
std       6.345968     0.093305    26.191172     0.533927     0.778039
min      14.000000     1.450000    39.000000     1.000000     1.000000
25%      19.947192     1.630000    65.473343     2.000000     2.658738
50%      22.777890     1.700499    83.000000     2.385502     3.000000
75%      26.000000     1.768464   107.430682     3.000000     3.000000
max      61.000000     1.980000   173.000000     3.000000     4.000000

              CH2O          FAF          TUE
count  2111.000000  2111.000000  2111.000000
mean      2.008011     1.010298     0.657866
std       0.612953     0.850592     0.608927
min       1.000000     0.000000     0.000000
25%       1.584812     0.124505     0.000000
50%       2.000000     1.000000     0.625350
75%       2.477420     1.666678     1.000000
max       3.000000     3.000000     2.000000
2. Data Cleaning¶
A. Check for Duplicate Rows¶
# Shape before duplicate removal
print("Shape before duplicate removal:", data.shape)
# Check for duplicate rows
duplicate_rows = data[data.duplicated()]
print("Number of duplicate rows found:", duplicate_rows.shape[0])
# Display duplicate rows (if needed)
display(duplicate_rows)
# Remove duplicate rows
data_no_duplicates = data.drop_duplicates()
# Shape after duplicate removal
print("Shape after duplicate removal:", data_no_duplicates.shape)
# Number of rows removed
print("Number of duplicate rows removed:", data.shape[0] - data_no_duplicates.shape[0])
Shape before duplicate removal: (2111, 17)
Number of duplicate rows found: 24
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
98 | Female | 21.0 | 1.52 | 42.0 | no | no | 3.0 | 1.0 | Frequently | no | 1.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Insufficient_Weight |
106 | Female | 25.0 | 1.57 | 55.0 | no | yes | 2.0 | 1.0 | Sometimes | no | 2.0 | no | 2.0 | 0.0 | Sometimes | Public_Transportation | Normal_Weight |
174 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
179 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
184 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
209 | Female | 22.0 | 1.69 | 65.0 | yes | yes | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 1.0 | 1.0 | Sometimes | Public_Transportation | Normal_Weight |
309 | Female | 16.0 | 1.66 | 58.0 | no | no | 2.0 | 1.0 | Sometimes | no | 1.0 | no | 0.0 | 1.0 | no | Walking | Normal_Weight |
460 | Female | 18.0 | 1.62 | 55.0 | yes | yes | 2.0 | 3.0 | Frequently | no | 1.0 | no | 1.0 | 1.0 | no | Public_Transportation | Normal_Weight |
467 | Male | 22.0 | 1.74 | 75.0 | yes | yes | 3.0 | 3.0 | Frequently | no | 1.0 | no | 1.0 | 0.0 | no | Automobile | Normal_Weight |
496 | Male | 18.0 | 1.72 | 53.0 | yes | yes | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 0.0 | 2.0 | Sometimes | Public_Transportation | Insufficient_Weight |
527 | Female | 21.0 | 1.52 | 42.0 | no | yes | 3.0 | 1.0 | Frequently | no | 1.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Insufficient_Weight |
659 | Female | 21.0 | 1.52 | 42.0 | no | yes | 3.0 | 1.0 | Frequently | no | 1.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Insufficient_Weight |
663 | Female | 21.0 | 1.52 | 42.0 | no | yes | 3.0 | 1.0 | Frequently | no | 1.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Insufficient_Weight |
763 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
764 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
824 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
830 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
831 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
832 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
833 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
834 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
921 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
922 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
923 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
Shape after duplicate removal: (2087, 17)
Number of duplicate rows removed: 24
All 24 duplicate rows were removed, retaining only unique records so that the model is trained on clean, unbiased data. Note that the deduplicated frame is stored in data_no_duplicates; it should be assigned back to data if the cleaned copy is to be used in the steps that follow.
B. Missing Value Check¶
# Check for missing values in each column
print("Missing Values in Each Column:")
print(data.isnull().sum())
Missing Values in Each Column:
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64
Observation: The dataset does not contain any missing values, as confirmed by .isnull().sum(). Therefore, no imputation or removal of data points is required at this stage.¶
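Since .isnull() only catches true NaN values, a quick sanity check for placeholder strings is a useful complement. A minimal sketch, assuming hypothetical placeholder tokens ('?', 'NA', etc.) that are not actually reported for this dataset:
# Sanity check for placeholder strings that .isnull() would not catch.
# The tokens below are illustrative assumptions, not values observed here.
placeholders = ['?', 'NA', 'N/A', '']
for col in data.select_dtypes(include='object').columns:
    n_suspicious = data[col].isin(placeholders).sum()
    if n_suspicious > 0:
        print(f"{col}: {n_suspicious} placeholder entries")
print("Placeholder check complete.")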
3. Handling Outliers:¶
The methods used to detect outliers are as follows:¶
(i). Machine Learning Approaches¶
a. DBSCAN (Density-Based Spatial Clustering of Applications with Noise):¶
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np
from kneed import KneeLocator
# Select numeric features
numeric_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
X = data[numeric_cols].copy()
# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Compute k-distances
k = 6
nbrs = NearestNeighbors(n_neighbors=k).fit(X_scaled)
distances, indices = nbrs.kneighbors(X_scaled)
# Sort distances of the k-th neighbor
k_distances = np.sort(distances[:, k-1])
# Use KneeLocator to find the elbow point (optimal eps)
kneedle = KneeLocator(range(len(k_distances)), k_distances, curve='convex', direction='increasing')
eps_val = k_distances[kneedle.knee]
# Plot k-distance graph with knee
plt.figure(figsize=(8, 4))
plt.plot(k_distances, label="k-distance")
plt.axvline(kneedle.knee, color='r', linestyle='--', label=f"Knee at {kneedle.knee}")
plt.axhline(eps_val, color='g', linestyle='--', label=f"Optimal eps = {eps_val:.3f}")
plt.title("k-distance Plot with KneeLocator")
plt.xlabel("Points (sorted)")
plt.ylabel(f"{k}th Nearest Neighbor Distance")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Output eps
print(f" Optimal eps from k-distance curve: {eps_val:.3f}")
Optimal eps from k-distance curve: 1.946
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Step 1: Select and scale numeric features
numeric_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
X = data[numeric_cols].copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Step 2: DBSCAN parameter settings
eps_val = 2 # Choose based on knee/elbow method
min_samples_range = range(2, 11)
# Step 3: Evaluate DBSCAN using all 3 metrics
silhouette_scores = {}
calinski_scores = {}
davies_scores = {}
for m in min_samples_range:
    dbscan = DBSCAN(eps=eps_val, min_samples=m).fit(X_scaled)
    labels = dbscan.labels_
    if len(set(labels)) > 1:
        try:
            silhouette = silhouette_score(X_scaled, labels)
            calinski = calinski_harabasz_score(X_scaled, labels)
            davies = davies_bouldin_score(X_scaled, labels)
        except ValueError:
            silhouette, calinski, davies = np.nan, np.nan, np.nan
    else:
        silhouette, calinski, davies = np.nan, np.nan, np.nan
    silhouette_scores[m] = silhouette
    calinski_scores[m] = calinski
    davies_scores[m] = davies
# Step 4: Create DataFrame for evaluation
scores_data = pd.DataFrame({
"Silhouette Score": pd.Series(silhouette_scores),
"Calinski-Harabasz": pd.Series(calinski_scores),
"Davies-Bouldin": pd.Series(davies_scores)
})
# Step 5: Plot Calinski-Harabasz Scores
plt.figure(figsize=(8, 5))
sns.lineplot(data=scores_data["Calinski-Harabasz"], marker='o', color='blue')
sns.scatterplot(x=scores_data.index, y=scores_data["Calinski-Harabasz"], s=150)
plt.axvline(x=scores_data["Calinski-Harabasz"].idxmax(), color='red', linestyle='--')
plt.title("Calinski-Harabasz Score vs min_samples (with outliers)")
plt.xlabel("min_samples")
plt.ylabel("Calinski-Harabasz Score")
plt.grid(True)
plt.tight_layout()
plt.show()
# Step 6: Select best min_samples based on Calinski
best_min_samples = scores_data["Calinski-Harabasz"].idxmax()
best_score = scores_data.loc[best_min_samples, "Calinski-Harabasz"]
print(f"\n Best min_samples (by Calinski-Harabasz) = {best_min_samples}")
print(f" Best Calinski-Harabasz score = {best_score:.2f}")
print("\n Full evaluation table:\n")
print(scores_data.round(3))
# Step 7: Final DBSCAN run with best parameters
dbscan = DBSCAN(eps=eps_val, min_samples=best_min_samples).fit(X_scaled)
labels = dbscan.labels_
data['DBSCAN_Cluster_All'] = labels
# Step 8: Outlier reporting
outliers = data[data['DBSCAN_Cluster_All'] == -1]
print(f"\n Total outliers detected: {len(outliers)}")
print("\nSample outliers:")
display(outliers.sample(min(5, len(outliers))))
Best min_samples (by Calinski-Harabasz) = 5
Best Calinski-Harabasz score = 12.18

Full evaluation table:

    Silhouette Score  Calinski-Harabasz  Davies-Bouldin
2              0.309             10.613           1.795
3              0.309             10.613           1.795
4              0.296             11.515           1.820
5              0.284             12.179           1.859
6              0.284             12.179           1.859
7              0.292             11.627           2.226
8              0.287             10.149           2.547
9              0.281              9.615           2.746
10             0.267              8.939           3.069

Total outliers detected: 9
Sample outliers:
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | DBSCAN_Cluster_All | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
358 | Male | 41.0 | 1.75 | 110.0 | yes | no | 2.0 | 1.0 | Sometimes | no | 1.0 | no | 1.0 | 0.0 | Frequently | Automobile | Obesity_Type_II | -1 |
92 | Male | 55.0 | 1.78 | 84.0 | yes | no | 3.0 | 4.0 | Frequently | no | 3.0 | yes | 3.0 | 0.0 | Frequently | Walking | Overweight_Level_I | -1 |
284 | Male | 20.0 | 1.77 | 70.0 | yes | yes | 1.0 | 1.0 | Sometimes | no | 2.0 | no | 1.0 | 1.0 | Sometimes | Public_Transportation | Normal_Weight | -1 |
367 | Female | 40.0 | 1.58 | 63.0 | no | yes | 2.0 | 3.0 | Frequently | no | 2.0 | no | 3.0 | 1.0 | Sometimes | Public_Transportation | Overweight_Level_I | -1 |
232 | Female | 51.0 | 1.59 | 50.0 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.0 | yes | 2.0 | 0.0 | no | Public_Transportation | Normal_Weight | -1 |
Observation and Interpretation on DBSCAN Parameter Selection and Evaluation¶
Objective:¶
The purpose of this step was to determine the optimal DBSCAN parameters (eps and min_samples) for effective clustering and outlier detection on the Obesity dataset.
Step 1: Feature Selection & Scaling¶
- Selected 8 numerical features: Age, Height, Weight, FCVC, NCP, CH2O, FAF, TUE.
- Standardized them using StandardScaler to ensure all features contribute equally to distance calculations.
Step 2: Determining Optimal eps (Epsilon)¶
- The K-Distance Graph was plotted using k = 6 (i.e., min_samples = 6) to estimate the density threshold (eps).
- KneeLocator detected the elbow point, suggesting an optimal eps ≈ 1.946.
- For practical use, eps was rounded to 2.0.
Step 3: Finding the Best min_samples (MinPts)¶
- DBSCAN was applied with eps = 2.0, varying min_samples from 2 to 10.
- Cluster quality was evaluated using:
  - Silhouette Score (higher = better cluster separation),
  - Calinski-Harabasz Score (higher = better cluster tightness),
  - Davies-Bouldin Score (lower = better cluster distinctiveness).
Step 4: Evaluation Results¶
min_samples | Silhouette Score | Calinski-Harabasz Score | Davies-Bouldin Score |
---|---|---|---|
2 | 0.309 | 10.613 | 1.795 |
3 | 0.309 | 10.613 | 1.795 |
4 | 0.296 | 11.515 | 1.820 |
5 (Best) | 0.284 | 12.179 | 1.859 |
6 | 0.284 | 12.179 | 1.859 |
7 | 0.292 | 11.627 | 2.226 |
8 | 0.287 | 10.149 | 2.547 |
9 | 0.281 | 9.615 | 2.746 |
10 | 0.267 | 8.939 | 3.069 |
- Best min_samples = 5, based on the highest Calinski-Harabasz Score = 12.179.
Step 5: Final DBSCAN Run¶
- Applied DBSCAN with eps = 2.0 and min_samples = 5.
- Total Outliers Detected: 9 (marked with the -1 label by DBSCAN).
- Sample outliers were displayed for inspection.
Interpretation & Observations:¶
- The combination of eps = 2.0 and min_samples = 5 provided the best clustering structure.
- Calinski-Harabasz Score showed the best compact and well-separated clusters at this parameter.
- Silhouette Score showed a slight decrease but remained acceptable for clustering tasks.
- Davies-Bouldin Score slightly increased but still within an acceptable range.
- DBSCAN successfully separated 9 outliers, indicating effective noise point detection.
Conclusion:¶
The DBSCAN clustering process, supported by proper eps estimation and evaluation using multiple cluster validity indices, revealed meaningful clusters and identified 9 outliers in the Obesity dataset. This parameter selection achieved a good balance between cluster detection and noise separation, providing reliable insights into the data structure.
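One caveat with the indices above is that they treat the DBSCAN noise label (-1) as an ordinary cluster. A small sketch, reusing labels and X_scaled from the final run, that recomputes the scores on non-noise points only for comparison:
# Recompute the validity indices on non-noise points only (cluster label != -1).
# Sketch for comparison; assumes `labels` and `X_scaled` from the final DBSCAN run above.
mask = labels != -1
if len(set(labels[mask])) > 1:
    print("Silhouette (noise excluded)       :", silhouette_score(X_scaled[mask], labels[mask]))
    print("Calinski-Harabasz (noise excluded):", calinski_harabasz_score(X_scaled[mask], labels[mask]))
    print("Davies-Bouldin (noise excluded)   :", davies_bouldin_score(X_scaled[mask], labels[mask]))
else:
    print("Fewer than two clusters remain after excluding noise; scores are undefined.")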
b. Isolation Forest:¶
from sklearn.ensemble import IsolationForest
# Select only numerical features
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
X = data[numerical_columns]
# Fit Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=42) # assuming 5% expected outliers
iso_forest.fit(X)
# Predict outliers (-1 = outlier, 1 = inlier)
outlier_pred = iso_forest.predict(X)
# Print number of detected outliers
print("Number of outliers detected by Isolation Forest:", list(outlier_pred).count(-1))
Number of outliers detected by Isolation Forest: 106
Observation:¶
Isolation Forest, an unsupervised learning algorithm specifically designed for anomaly detection, was applied to detect outliers in the dataset. It isolates anomalies by randomly selecting a feature and splitting values — anomalies are easier to isolate because they are fewer and different.
Why Isolation Forest was used:
- Suitable for high-dimensional numerical data.
- Effective in detecting global and local outliers.
- Does not assume any data distribution.
Results:
- Total number of outliers detected: 106
- Shape of data after removing outliers: (2005, 17)
These detected outliers represent records that significantly differ from the rest of the data based on the feature space. Outlier detection is inherently an unsupervised task because it focuses solely on identifying abnormal patterns or anomalies in the feature space, without using the target variable. Outliers can negatively affect the performance of supervised models by introducing noise, causing bias, or distorting learned decision boundaries.
Thus, applying these unsupervised techniques at the data preparation stage ensures that the training data fed into the supervised model is clean, reliable, and of high quality.
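Beyond the binary -1/1 prediction, Isolation Forest also exposes a continuous anomaly score that can rank how extreme each flagged record is. A brief sketch using score_samples (lower values are more anomalous):
# Rank records by Isolation Forest anomaly score (lower = more anomalous).
scores = iso_forest.score_samples(X)
ranked = X.assign(anomaly_score=scores).sort_values('anomaly_score')
print("Five most anomalous records by Isolation Forest:")
print(ranked.head(5))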
c. Local Outlier Factor (LOF):¶
from sklearn.neighbors import LocalOutlierFactor
# Apply LOF
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05) # 5% expected outliers
y_pred = lof.fit_predict(X)
# LOF also returns -1 for outliers
print("Number of outliers detected by LOF:", list(y_pred).count(-1))
Number of outliers detected by LOF: 106
Observation:¶
The Local Outlier Factor (LOF) algorithm was applied to detect outliers in the dataset. LOF is an unsupervised outlier detection method that identifies anomalies based on the local density of data points. Points that have substantially lower density than their neighbors are considered outliers.
Why LOF was used:
- Suitable for identifying local outliers in data.
- Detects points that deviate significantly from the density of their surroundings.
- Does not require labeled target data.
Results:
- Total number of outliers detected: 106
These outliers represent records that are locally inconsistent compared to neighboring data points.
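For a closer look, the fitted LOF model exposes its scores through the negative_outlier_factor_ attribute; a short sketch ranking the most outlying records:
# Inspect LOF scores: negative_outlier_factor_ is near -1 for inliers
# and substantially more negative for outliers.
lof_scores = lof.negative_outlier_factor_
worst_idx = np.argsort(lof_scores)[:5]  # positions of the 5 strongest outliers
print("Five most outlying records by LOF:")
print(X.iloc[worst_idx])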
(ii). Statistical Methods¶
a. Using IQR Method:¶
# Updated list of numerical columns
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
print("Outliers detected (per feature) using IQR (Boxplot Method):")
for col in numerical_columns:
    Q1 = data[col].quantile(0.25)  # 25th percentile
    Q3 = data[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1                  # Interquartile Range
    # Outlier boundaries
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Find outliers
    outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
    print(f"{col}: {outliers.shape[0]} outliers")
Outliers detected (per feature) using IQR (Boxplot Method):
Age: 168 outliers
Height: 1 outliers
Weight: 1 outliers
FCVC: 0 outliers
NCP: 579 outliers
CH2O: 0 outliers
FAF: 0 outliers
TUE: 0 outliers
Observation:¶
Outlier detection was performed using the IQR method, which flags data points lying more than 1.5 times the interquartile range (IQR) below the first quartile (Q1) or above the third quartile (Q3). The following observations were made:
Age: A significant number of outliers (168) were detected in the 'Age' feature. This suggests the presence of individuals whose age is unusually high or low compared to the majority of the dataset. This could indicate either data entry errors or the inclusion of participants from an unusually wide age range.
Height: Only 1 outlier was detected in 'Height'. This implies that almost all individuals have heights within a reasonable and consistent range, with only a single record deviating significantly.
Weight: Similar to Height, 'Weight' also has just 1 outlier. This shows that weight values are generally consistent across the dataset, with only one extreme case being significantly different from the rest.
FCVC (Frequency of Consumption of Vegetables): No outliers were detected. This indicates that responses related to vegetable consumption frequency are fairly consistent and within expected ranges across participants.
NCP (Number of Meals per Day): A very large number of outliers (579) were detected. This is an unusual finding and suggests that a significant portion of the data for this variable deviates from the expected meal frequency range. This could reflect genuine lifestyle variations or possible inconsistencies or errors in the data collection process.
CH2O (Water Consumption): No outliers were detected in this feature, indicating that most participants consume a typical and reasonable amount of water.
FAF (Physical Activity Frequency): No outliers detected, which means most individuals have reported their physical activity within the normal or expected range.
TUE (Time using technology devices): No outliers were detected in TUE, suggesting consistent responses across the dataset regarding technology usage time.
Summary:¶
- The variables Age and NCP exhibited a high number of outliers, which may need further investigation or treatment before modeling to avoid potential bias or model distortion (one treatment option is sketched after this summary).
- Height and Weight have very few outliers, implying these features are generally reliable.
- Other numerical features such as FCVC, CH2O, FAF, and TUE showed no outlier presence, indicating good data quality for these variables.
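If treatment rather than removal were preferred for Age and NCP, one standard option is capping (winsorizing) values at the Tukey fences. A sketch of that alternative, shown for comparison only (the project ultimately removes outliers via DBSCAN; cap_iqr and data_capped are illustrative names):
# Optional treatment: cap (winsorize) values at the Tukey fences instead of deleting rows.
def cap_iqr(series, k=1.5):
    Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = Q3 - Q1
    return series.clip(lower=Q1 - k * iqr, upper=Q3 + k * iqr)

data_capped = data.copy()
for col in ['Age', 'NCP']:
    data_capped[col] = cap_iqr(data_capped[col])
print("Capped Age and NCP at their IQR fences (stored in data_capped).")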
b. Using Z-Score¶
# Z-score calculation for all numerical columns
z_scores = np.abs(stats.zscore(data[numerical_columns]))
# Threshold for defining outliers
threshold = 3
# Find outlier data points
outliers = (z_scores > threshold)
# Display how many outliers per feature
print("Outliers detected (per feature):")
for i, col in enumerate(numerical_columns):
    print(f"{col}: {np.sum(outliers[:, i])} outliers")
Outliers detected (per feature):
Age: 24 outliers
Height: 0 outliers
Weight: 1 outliers
FCVC: 0 outliers
NCP: 0 outliers
CH2O: 0 outliers
FAF: 0 outliers
TUE: 0 outliers
Observation:¶
Outlier detection using the Z-Score method with a threshold of 3 flagged only a handful of extreme values:

Feature | Number of Outliers |
---|---|
Age | 24 |
Height | 0 |
Weight | 1 |
FCVC | 0 |
NCP | 0 |
CH2O | 0 |
FAF | 0 |
TUE | 0 |

Apart from 24 values in Age and a single value in Weight, all data points lie within three standard deviations of the mean. The Z-Score method is therefore far more conservative here than the IQR method, which flagged 168 Age and 579 NCP records.
c. Using Tukey's Method¶
Tukey's fences use the same 1.5 × IQR rule as the boxplot method above, so the counts below match the earlier IQR results exactly.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# List of numerical columns
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
print("Outliers detected using Tukey's Method:")
for col in numerical_columns:
    Q1 = data[col].quantile(0.25)  # 25th percentile
    Q3 = data[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1                  # Interquartile Range
    # Tukey's fences
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    # Find outliers
    outliers = data[(data[col] < lower_fence) | (data[col] > upper_fence)]
    print(f"{col}: {outliers.shape[0]} outliers")
Outliers detected using Tukey's Method:
Age: 168 outliers
Height: 1 outliers
Weight: 1 outliers
FCVC: 0 outliers
NCP: 579 outliers
CH2O: 0 outliers
FAF: 0 outliers
TUE: 0 outliers
(iii).Visualization Techniques¶
a. Using Boxplot¶
print("Outliers can visualize by Boxplot:")
# Plot boxplots for each numerical feature
for col in numerical_columns:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=data[col])
    plt.title(f'Boxplot for {col}')
    plt.show()
Outliers can be visualized with boxplots:
b. Using Histograms¶
import matplotlib.pyplot as plt
# List of numerical columns
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
# Plot histogram for each numerical feature
plt.figure(figsize=(15, 12))
for i, col in enumerate(numerical_columns):
    plt.subplot(3, 3, i + 1)  # 3 rows, 3 columns grid
    plt.hist(data[col], bins=20, color='skyblue', edgecolor='black')
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
Observation:¶
Histograms were plotted for all numerical variables to examine their distribution, detect possible skewness, and visually identify any extreme values (potential outliers).
- Age and NCP: Showed signs of right skewness.
- Weight and Height: Appeared to follow a normal distribution.
- FCVC, CH2O, FAF, TUE: Mostly concentrated in specific value ranges with some spread.
Histograms provided valuable insight into the central tendency, spread, and potential anomalies of individual features, guiding further preprocessing decisions.
Removal of Outliers by the DBSCAN Method¶
# Step 9: Remove outliers (label = -1)
data_cleaned = data[data['DBSCAN_Cluster_All'] != -1].copy()
# Step 10: Drop extra outlier label columns if they exist
columns_to_drop = ['DBSCAN_Cluster_All', 'Outlier_Silhouette', 'Outlier_Calinski']
data_cleaned.drop(columns=[col for col in columns_to_drop if col in data_cleaned.columns], inplace=True)
# Step 11: Summary
print(f" Final dataset shape after removing DBSCAN outliers: {data_cleaned.shape}")
print(f" Total outliers removed: {data.shape[0] - data_cleaned.shape[0]}")
Final dataset shape after removing DBSCAN outliers: (2102, 17)
Total outliers removed: 9
Observation:¶
Multiple outlier detection methods were applied during the data preparation stage, including Z-Score, Tukey's IQR, Isolation Forest, and Local Outlier Factor (LOF). After evaluating the results, DBSCAN was selected as the most suitable method for outlier removal because:
- It considers the multivariate structure of the data.
- It identified 9 records as outliers based on density estimation, which is a balanced number compared to other methods.
- It detects complex, multi-feature anomalies that univariate methods like Z-Score and IQR cannot.
DBSCAN Outlier Removal Summary:¶
- Number of outliers detected and removed: 9

By removing these DBSCAN-identified outliers, the dataset is now cleaner and better prepared for building robust and generalizable machine learning models.
4. Data Visualization¶
import matplotlib.pyplot as plt
import seaborn as sns
# Numerical columns
num_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
# Categorical columns
cat_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']
# Histograms for numerical features
data[num_cols].hist(figsize=(15, 10), bins=20)
plt.suptitle("Histograms of Numerical Features")
plt.show()
# Boxplots for numerical features
for col in num_cols:
    plt.figure(figsize=(6, 3))
    sns.boxplot(x=data[col])
    plt.title(f'Boxplot of {col}')
    plt.show()
# Bar plot for categorical features
for col in cat_cols:
    plt.figure(figsize=(6, 3))
    sns.countplot(x=data[col])
    plt.title(f'Bar Plot of {col}')
    plt.xticks(rotation=45)
    plt.show()
# Pie chart for Target Variable (NObeyesdad)
plt.figure(figsize=(6,6))
data['NObeyesdad'].value_counts().plot.pie(autopct='%1.1f%%', colors=sns.color_palette('pastel'))
plt.title('Distribution of Obesity Levels')
plt.ylabel('')
plt.show()
Observations — Univariate Analysis:¶
Age: Most participants are between 20 and 30 years.
Weight & Height: Weight has a wider range than height; some extreme values detected via boxplot.
Physical Activity (FAF): Large number of people report low activity (0–1).
High Calorie Food (FAVC): Majority answer "Yes".
Obesity Level (NObeyesdad): Normal Weight and Overweight Level I are the most common classes, while the extreme obesity levels are rarer.
# Scatter plot: Height vs Weight colored by Obesity Level
plt.figure(figsize=(8,6))
sns.scatterplot(x='Height', y='Weight', hue='NObeyesdad', data=data, palette='Set1')
plt.title('Height vs Weight colored by Obesity Level')
plt.show()
# Correlation Heatmap (only numerical features)
plt.figure(figsize=(10,8))
sns.heatmap(data[num_cols].corr(), annot=True, cmap='coolwarm', square=True)
plt.title('Correlation Heatmap')
plt.show()
Observations — Bivariate Analysis:¶
Height vs Weight: Clear separation — higher weight corresponds to higher obesity levels.
Correlation Heatmap:
Weight strongly positively correlated with Age and NCP (Number of meals).
FAF (Activity) and TUE (Tech use) are negatively correlated — people using tech more tend to have lower physical activity.
from pandas.plotting import parallel_coordinates
from mpl_toolkits.mplot3d import Axes3D
# Pair Plot (only top 4 features to avoid clutter)
sns.pairplot(data, vars=['Age', 'Height', 'Weight', 'FAF'], hue='NObeyesdad', palette='Set2')
plt.suptitle("Pair Plot of Selected Features")
plt.show()
# 3D Scatter Plot: Age, Weight, Height
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
sc = ax.scatter(data['Age'], data['Weight'], data['Height'], c=data['NObeyesdad'].astype('category').cat.codes, cmap='viridis')
plt.colorbar(sc)
ax.set_xlabel('Age')
ax.set_ylabel('Weight')
ax.set_zlabel('Height')
plt.title('3D Scatter Plot')
plt.show()
# Parallel Coordinates Plot
plt.figure(figsize=(12,6))
parallel_coordinates(data[['Age', 'Height', 'Weight', 'FAF', 'NObeyesdad']], 'NObeyesdad', colormap=plt.get_cmap("Set1"))
plt.title("Parallel Coordinates Plot")
plt.xticks(rotation=45)
plt.show()
Observations — Multivariate Analysis:¶
Pair Plot: Obesity levels are well-separated along Weight and Height.
3D Plot: Shows that higher weight and age contribute to higher obesity levels.
Parallel Coordinates:
Normal Weight: moderate in all features.
Obesity Types: consistently high weight and low physical activity (FAF).
Trends, Patterns, and Anomalies Check¶
import seaborn as sns
import matplotlib.pyplot as plt
# 1. Trend: Relationship between Age and Obesity Level
plt.figure(figsize=(8,5))
sns.boxplot(x='NObeyesdad', y='Age', hue='NObeyesdad', data=data, palette='Set2', legend=False)
plt.title('Age Distribution across Obesity Levels')
plt.xticks(rotation=45)
plt.show()
# 2. Trend: Relationship between Number of Meals (NCP) and Obesity Level
plt.figure(figsize=(8,5))
sns.boxplot(x='NObeyesdad', y='NCP', hue='NObeyesdad', data=data, palette='Set3', legend=False)
plt.title('Number of Meals vs Obesity Level')
plt.xticks(rotation=45)
plt.show()
# 3. Pattern: Physical Activity vs Obesity Level
plt.figure(figsize=(8,5))
sns.boxplot(x='NObeyesdad', y='FAF', hue='NObeyesdad', data=data, palette='Set1', legend=False)
plt.title('Physical Activity Frequency across Obesity Levels')
plt.xticks(rotation=45)
plt.show()
# 4. Pattern: Water Consumption vs Obesity Level
plt.figure(figsize=(8,5))
sns.boxplot(x='NObeyesdad', y='CH2O', hue='NObeyesdad', data=data, palette='coolwarm', legend=False)
plt.title('Water Consumption across Obesity Levels')
plt.xticks(rotation=45)
plt.show()
# 5. Anomaly Detection via Boxplots: Outliers in Weight
plt.figure(figsize=(6,4))
sns.boxplot(x=data['Weight'], color='orange')
plt.title('Outlier Detection in Weight')
plt.show()
# 6. Anomaly Detection via Boxplots: Outliers in Age
plt.figure(figsize=(6,4))
sns.boxplot(x=data['Age'], color='red')
plt.title('Outlier Detection in Age')
plt.show()
# 7. Correlation Heatmap to confirm pattern among features
plt.figure(figsize=(10,8))
sns.heatmap(data[num_cols].corr(), annot=True, cmap='coolwarm', square=True)
plt.title('Correlation Heatmap for Numerical Features')
plt.show()
Trends, Patterns, and Anomalies — Observations:¶
Age Trend:
- Individuals classified as Obesity Type II & III tend to be older compared to those in Normal or Insufficient Weight classes.
Number of Meals Pattern:
- Higher Number of Meals (NCP) is more common in Obesity Type I and II categories.
Physical Activity Pattern:
- Lower physical activity frequency (FAF) is observed in individuals with higher obesity levels.
- Normal and Insufficient Weight classes show higher FAF.
Water Consumption Pattern:
- Normal and Insufficient Weight individuals drink more water compared to Obese classes.
Anomalies Detected:
- Weight outliers: Few individuals show extremely high or low weights.
- Age outliers: Possible presence of unusually low or high age entries that should be reviewed.
Correlation Pattern:
- Positive correlation seen between Weight and NCP (Number of meals).
- Negative correlation between Physical Activity (FAF) and Technology Use (TUE).
Summary:¶
The data reflects logical health trends: Less physical activity, higher food intake, and older age are all associated with higher obesity levels. Certain outliers (in Weight and Age) require attention, as they may influence model performance.
1. Univariate Analysis:¶
Numerical Features (Histograms, Boxplots):
- 'Age', 'Height', 'Weight', 'NCP' showed slight skewness.
- Outliers detected in 'Age' and 'NCP' using boxplots and Z-score.
- Features like 'FCVC', 'CH2O', 'FAF', 'TUE' appeared approximately normally distributed.
Categorical Features (Countplots):
- Gender distribution is balanced.
- Majority of the population shows 'FAVC' = 'yes' (frequent consumption of high-calorie food).
- Most individuals use 'Public_Transportation' as their mode of transport.
- 'NObeyesdad' (target variable) shows class imbalance — most are "Normal_Weight" or "Overweight_Level_I".
2. Bivariate Analysis:¶
Scatter Plots:
- Positive relation between 'Weight' and 'Obesity Level'.
- Slight relationship between 'Age' and Obesity Level visible.
Boxplots:
- Outliers in 'NCP' and 'Age' confirmed.
Countplots (Categorical vs Target):
- Higher obesity levels are associated with 'FAVC' = 'yes'.
- 'Family History of Overweight' impacts obesity level — those with history show higher obesity rates.
3. Multivariate Analysis:¶
- Heatmap (Correlation Matrix):
- Strong positive correlation between 'Weight' and 'Obesity Level'.
- No severe multicollinearity found among numerical features.
4. Trend, Pattern & Anomaly Detection:¶
Patterns:
- Overweight levels increase with 'NCP' (Number of Meals) and 'FAVC' = 'yes'.
- 'CALC' (Alcohol Consumption) and 'MTRANS' show some class-specific differences.
Anomalies (Outliers):
- Outliers detected in 'Age', 'NCP' via Z-Score, Tukey’s IQR, DBSCAN, Isolation Forest and LOF.
- DBSCAN finalized outlier removal — 9 data points removed.
- No temporal or time-series trend present — dataset is cross-sectional.
5. Descriptive Statistics:¶
1. Measures of Central Tendency: (Mean, Median, Mode)¶
# Numerical columns
num_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
print("=== Measures of Central Tendency ===")
for col in num_cols:
    print(f"\nColumn: {col}")
    print("Mean :", data[col].mean())
    print("Median:", data[col].median())
    print("Mode :", data[col].mode()[0])  # mode() returns a Series
=== Measures of Central Tendency ===

Column: Age
Mean : 24.312599908574136
Median: 22.77789
Mode : 18.0

Column: Height
Mean : 1.7016773533870204
Median: 1.700499
Mode : 1.7

Column: Weight
Mean : 86.58605812648035
Median: 83.0
Mode : 80.0

Column: FCVC
Mean : 2.4190430615821885
Median: 2.385502
Mode : 3.0

Column: NCP
Mean : 2.6856280497394596
Median: 3.0
Mode : 3.0

Column: CH2O
Mean : 2.0080114040738986
Median: 2.0
Mode : 2.0

Column: FAF
Mean : 1.0102976958787304
Median: 1.0
Mode : 0.0

Column: TUE
Mean : 0.657865923732828
Median: 0.62535
Mode : 0.0
2. Measures of Dispersion: (Variance, Standard Deviation, Range, IQR)¶
print("\n=== Measures of Dispersion ===")
for col in num_cols:
    print(f"\nColumn: {col}")
    print("Variance :", data[col].var())
    print("Standard Deviation :", data[col].std())
    print("Range :", data[col].max() - data[col].min())
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    print("Interquartile Range (IQR):", IQR)
=== Measures of Dispersion ===

Column: Age
Variance : 40.27131333121607
Standard Deviation : 6.345968273732234
Range : 47.0
Interquartile Range (IQR): 6.052807999999999

Column: Height
Variance : 0.00870578941058501
Standard Deviation : 0.09330481986792007
Range : 0.53
Interquartile Range (IQR): 0.13846400000000014

Column: Weight
Variance : 685.977477386809
Standard Deviation : 26.1911717452047
Range : 134.0
Interquartile Range (IQR): 41.957339000000005

Column: FCVC
Variance : 0.2850775912322408
Standard Deviation : 0.5339265785033002
Range : 2.0
Interquartile Range (IQR): 1.0

Column: NCP
Variance : 0.6053441390916691
Standard Deviation : 0.7780386488418612
Range : 3.0
Interquartile Range (IQR): 0.34126199999999995

Column: CH2O
Variance : 0.3757119340697006
Standard Deviation : 0.6129534517968722
Range : 2.0
Interquartile Range (IQR): 0.8926075

Column: FAF
Variance : 0.7235074833966828
Standard Deviation : 0.850592430836698
Range : 3.0
Interquartile Range (IQR): 1.5421725

Column: TUE
Variance : 0.37079240757698323
Standard Deviation : 0.6089272596763782
Range : 2.0
Interquartile Range (IQR): 1.0
Observation on Descriptive Statistics¶
Measures of Central Tendency:¶
- Age: Mean (24.31) is slightly higher than the Median (22.78), indicating a mild right skew (quantified in the sketch after this list). The Mode of 18 suggests a concentration of younger individuals.
- Height: Mean, Median, and Mode are closely aligned (~1.70), indicating a symmetric distribution.
- Weight: Mean (86.58) is higher than Median (83.0), showing possible presence of higher values (right skew).
- FCVC (Vegetable Consumption Frequency): Median and Mode both point to a tendency of people consuming vegetables frequently (around 3).
- NCP (Number of Meals): Most individuals have around 3 meals a day, as indicated by the Mode and Median.
- CH2O (Water Consumption): Central tendency measures suggest an average water consumption of ~2 liters.
- FAF (Physical Activity) and TUE (Technology Use): Median values indicate that most people engage in low to moderate physical activity and technology usage.
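The mean/median gaps noted above for Age and Weight can be quantified directly with the skewness coefficient; a one-line check (positive values indicate right skew):
# Quantify the skew suggested by the mean-median gaps (positive = right-skewed).
print("Skewness of numerical features:")
print(data[num_cols].skew().round(3))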
Measures of Dispersion:¶
- Age: High variability (Std. Dev. = 6.34, Range = 47), indicating diverse age representation.
- Height: Low dispersion (Std. Dev. = 0.093), showing most heights are clustered around the mean.
- Weight: High dispersion (Std. Dev. = 26.19, Range = 134) suggests significant weight variation in the dataset.
- FCVC, NCP, CH2O, FAF, TUE: These features show moderate variability, with NCP and FAF having noticeable spread, indicating differing eating and activity patterns among individuals.
Summary:¶
- Numerical features such as Age and Weight exhibit higher variability and presence of potential outliers.
- Height remains consistent across observations.
- Eating habits (NCP) and physical activity (FAF) differ widely among individuals, reflecting lifestyle diversity in the population.
6. Feature Engineering & Transformation¶
Encoding Categorical Variables:¶
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Copy data to avoid modifying original
df_encoded = data.copy()
# Label Encoding for binary categorical columns
binary_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
le = LabelEncoder()
for col in binary_cols:
    df_encoded[col] = le.fit_transform(df_encoded[col])
# One-Hot Encoding for multi-class categorical columns
multi_class_cols = ['CAEC', 'CALC', 'MTRANS']
df_encoded = pd.get_dummies(df_encoded, columns=multi_class_cols, drop_first=False)
# Encode the target variable
df_encoded['NObeyesdad'] = le.fit_transform(df_encoded['NObeyesdad'])
print("Categorical Encoding completed successfully!")
df_encoded.head()
Categorical Encoding completed successfully!
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | SMOKE | CH2O | ... | CAEC_no | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 21.0 | 1.62 | 64.0 | 1 | 0 | 2.0 | 3.0 | 0 | 2.0 | ... | False | False | False | False | True | False | False | False | True | False |
1 | 0 | 21.0 | 1.52 | 56.0 | 1 | 0 | 3.0 | 3.0 | 1 | 3.0 | ... | False | False | False | True | False | False | False | False | True | False |
2 | 1 | 23.0 | 1.80 | 77.0 | 1 | 0 | 2.0 | 3.0 | 0 | 2.0 | ... | False | False | True | False | False | False | False | False | True | False |
3 | 1 | 27.0 | 1.80 | 87.0 | 0 | 0 | 3.0 | 3.0 | 0 | 2.0 | ... | False | False | True | False | False | False | False | False | False | True |
4 | 1 | 22.0 | 1.78 | 89.8 | 0 | 0 | 2.0 | 1.0 | 0 | 2.0 | ... | False | False | False | True | False | False | False | False | True | False |
5 rows × 28 columns
Categorical Variable Encoding:¶
Label Encoding: Applied to binary categorical variables:
- 'Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC'.
One-Hot Encoding: Applied to multi-class categorical variables:
- 'CAEC', 'CALC', 'MTRANS'.
Target Variable Encoding:
- 'NObeyesdad' was label-encoded to transform obesity levels into numerical classes.
These transformations ensured that all categorical features were converted into numerical formats suitable for machine learning algorithms.
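One caution about the encoding code above: a single LabelEncoder instance is refitted per column, so only the last fitted mapping survives, and encoded predictions cannot be decoded back to class names. A sketch that keeps one encoder per column instead (the encoders dictionary is an illustrative name, not part of the pipeline above):
# Sketch: keep one LabelEncoder per column so mappings can be inverted later.
encoders = {}
for col in binary_cols + ['NObeyesdad']:
    enc = LabelEncoder()
    df_encoded[col] = enc.fit_transform(data[col])
    encoders[col] = enc
# Example: recover the original obesity-level names from encoded class indices.
print(encoders['NObeyesdad'].inverse_transform([0, 1, 2]))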
Feature Scaling¶
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer
import pandas as pd
# Select numerical columns for scaling
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
sample_data = data[numerical_columns].copy()
print("Original Data (First 5 Rows):\n", sample_data.head())
# 1. StandardScaler
scaler_standard = StandardScaler()
standard_scaled = scaler_standard.fit_transform(sample_data)
standard_scaled_df = pd.DataFrame(standard_scaled, columns=numerical_columns)
print("\nStandardScaler Result:\n", standard_scaled_df.head())
# 2. MinMaxScaler
scaler_minmax = MinMaxScaler()
minmax_scaled = scaler_minmax.fit_transform(sample_data)
minmax_scaled_df = pd.DataFrame(minmax_scaled, columns=numerical_columns)
print("\nMinMaxScaler Result:\n", minmax_scaled_df.head())
# 3. MaxAbsScaler
scaler_maxabs = MaxAbsScaler()
maxabs_scaled = scaler_maxabs.fit_transform(sample_data)
maxabs_scaled_df = pd.DataFrame(maxabs_scaled, columns=numerical_columns)
print("\nMaxAbsScaler Result:\n", maxabs_scaled_df.head())
# 4. RobustScaler
scaler_robust = RobustScaler()
robust_scaled = scaler_robust.fit_transform(sample_data)
robust_scaled_df = pd.DataFrame(robust_scaled, columns=numerical_columns)
print("\nRobustScaler Result:\n", robust_scaled_df.head())
# 5. Normalizer (Note: applies normalization row-wise, not column-wise)
scaler_normalizer = Normalizer()
normalized = scaler_normalizer.fit_transform(sample_data)
normalized_df = pd.DataFrame(normalized, columns=numerical_columns)
print("\nNormalizer Result:\n", normalized_df.head())
Original Data (First 5 Rows):
    Age  Height  Weight  FCVC  NCP  CH2O  FAF  TUE
0  21.0    1.62    64.0   2.0  3.0   2.0  0.0  1.0
1  21.0    1.52    56.0   3.0  3.0   3.0  3.0  0.0
2  23.0    1.80    77.0   2.0  3.0   2.0  2.0  1.0
3  27.0    1.80    87.0   3.0  3.0   2.0  2.0  0.0
4  22.0    1.78    89.8   2.0  1.0   2.0  0.0  0.0

StandardScaler Result:
        Age    Height    Weight      FCVC       NCP      CH2O       FAF       TUE
0 -0.522124 -0.875589 -0.862558 -0.785019  0.404153 -0.013073 -1.188039  0.561997
1 -0.522124 -1.947599 -1.168077  1.088342  0.404153  1.618759  2.339750 -1.080625
2 -0.206889  1.054029 -0.366090 -0.785019  0.404153 -0.013073  1.163820  0.561997
3  0.423582  1.054029  0.015808  1.088342  0.404153 -0.013073  1.163820 -1.080625
4 -0.364507  0.839627  0.122740 -0.785019 -2.167023 -0.013073 -1.188039 -1.080625

MinMaxScaler Result:
        Age    Height    Weight  FCVC       NCP  CH2O       FAF  TUE
0  0.148936  0.320755  0.186567   0.5  0.666667   0.5  0.000000  0.5
1  0.148936  0.132075  0.126866   1.0  0.666667   1.0  1.000000  0.0
2  0.191489  0.660377  0.283582   0.5  0.666667   0.5  0.666667  0.5
3  0.276596  0.660377  0.358209   1.0  0.666667   0.5  0.666667  0.0
4  0.170213  0.622642  0.379104   0.5  0.000000   0.5  0.000000  0.0

MaxAbsScaler Result:
        Age    Height    Weight      FCVC   NCP      CH2O       FAF  TUE
0  0.344262  0.818182  0.369942  0.666667  0.75  0.666667  0.000000  0.5
1  0.344262  0.767677  0.323699  1.000000  0.75  1.000000  1.000000  0.0
2  0.377049  0.909091  0.445087  0.666667  0.75  0.666667  0.666667  0.5
3  0.442623  0.909091  0.502890  1.000000  0.75  0.666667  0.666667  0.0
4  0.360656  0.898990  0.519075  0.666667  0.25  0.666667  0.000000  0.0

RobustScaler Result:
        Age    Height    Weight      FCVC     NCP      CH2O       FAF      TUE
0 -0.293730 -0.581371 -0.452841 -0.385502  0.0000  0.000000 -0.648436  0.37465
1 -0.293730 -1.303581 -0.643511  0.614498  0.0000  1.120313  1.296872 -0.62535
2  0.036695  0.718606 -0.143002 -0.385502  0.0000  0.000000  0.648436  0.37465
3  0.697546  0.718606  0.095335  0.614498  0.0000  0.000000  0.648436 -0.62535
4 -0.128517  0.574164  0.162069 -0.385502 -5.8606  0.000000 -0.648436 -0.62535

Normalizer Result:
        Age    Height    Weight      FCVC       NCP      CH2O       FAF       TUE
0  0.311064  0.023996  0.948005  0.029625  0.044438  0.029625  0.000000  0.014813
1  0.349258  0.025280  0.931355  0.049894  0.049894  0.049894  0.049894  0.000000
2  0.285648  0.022355  0.956301  0.024839  0.037258  0.024839  0.024839  0.012419
3  0.295878  0.019725  0.953386  0.032875  0.032875  0.021917  0.021917  0.000000
4  0.237783  0.019239  0.970586  0.021617  0.010808  0.021617  0.000000  0.000000
Observation on Different Feature Scaling Techniques¶
Five different scaling techniques were applied to the numerical features in the dataset to compare their impact on data transformation:
StandardScaler:
- Scaled data to have zero mean and unit variance.
- Features became comparable regardless of their original scale.
- Suitable for distance-based models like DBSCAN, SVM, KNN.
- Selected as the final scaler for the project due to its compatibility with clustering and classification algorithms.
MinMaxScaler:
- Scaled features to a range between 0 and 1.
- Effective for algorithms requiring bounded input (e.g., Neural Networks).
- Sensitive to outliers — large/small extreme values can distort the scaling.
- Not preferred here because the dataset had some outliers (detected earlier), and this method would stretch the scale due to them.
MaxAbsScaler:
- Scaled data to the range [-1, 1] based on the maximum absolute value.
- Mostly used for sparse or zero-heavy datasets (like text data).
- Not effective here since the dataset is dense and continuous, not sparse.
RobustScaler:
- Used median and IQR for scaling — designed to reduce the effect of outliers.
- Suitable when the dataset contains strong outliers.
- In this project, DBSCAN already handled outliers, so RobustScaler was not required in the final pipeline.
Normalizer:
- Scaled each data point (row) to have a unit norm (vector length = 1).
- Typically used in text classification, document similarity, and cosine distance models.
- Not suitable for this project, as the problem requires feature-wise (column-wise) scaling, not row-wise normalization.
Final Observation:¶
- StandardScaler provided the most balanced and appropriate scaling for this dataset and project needs.
- Other scalers were useful to understand but did not fit the specific requirements of clustering and classification tasks in this context.
- StandardScaler helped ensure that all numerical features contributed equally to distance-based computations in DBSCAN and further model building.
Note: Demonstration of Various Scaling Methods¶
As part of feature scaling exploration, five different scaling techniques (StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer) were applied on the original numerical features to understand their behavior and effect on data distribution.
This demonstration was intended for understanding purposes only and is not used in the final model pipeline, where encoding and scaling will be properly applied to the fully preprocessed dataset.
Note: Scaling was applied only to numerical features because scaling techniques are meaningful for continuous or discrete numerical variables. Categorical features (like 'Gender', 'FAVC', etc.) represent qualitative data and must be encoded separately before scaling if needed.
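For a production pipeline, the same encode-then-scale logic is often expressed as a single scikit-learn ColumnTransformer, which also prevents test-set statistics from leaking into the scaler. A hedged sketch under that assumption (X_train_raw and X_test_raw are hypothetical splits of the raw feature columns, not variables defined in this report):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One transformer: scale numeric columns, one-hot encode categorical ones.
preprocess = ColumnTransformer([
    ('num', StandardScaler(),
     ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']),
    ('cat', OneHotEncoder(handle_unknown='ignore'),
     ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC',
      'CAEC', 'CALC', 'MTRANS']),
])
# Usage (hypothetical raw splits):
# X_train_prep = preprocess.fit_transform(X_train_raw)  # fit on training data only
# X_test_prep  = preprocess.transform(X_test_raw)       # reuse training statistics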
Scaling on the encoded data.¶
numerical_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
# Create a scaler object
scaler = StandardScaler()
# Apply scaling to these numerical columns only
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])
print("Final scaling applied to numerical features after encoding.")
df_encoded.head()
Final scaling applied to numerical features after encoding.
Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | SMOKE | CH2O | ... | CAEC_no | CALC_Always | CALC_Frequently | CALC_Sometimes | CALC_no | MTRANS_Automobile | MTRANS_Bike | MTRANS_Motorbike | MTRANS_Public_Transportation | MTRANS_Walking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | -0.522124 | -0.875589 | -0.862558 | 1 | 0 | -0.785019 | 0.404153 | 0 | -0.013073 | ... | False | False | False | False | True | False | False | False | True | False |
1 | 0 | -0.522124 | -1.947599 | -1.168077 | 1 | 0 | 1.088342 | 0.404153 | 1 | 1.618759 | ... | False | False | False | True | False | False | False | False | True | False |
2 | 1 | -0.206889 | 1.054029 | -0.366090 | 1 | 0 | -0.785019 | 0.404153 | 0 | -0.013073 | ... | False | False | True | False | False | False | False | False | True | False |
3 | 1 | 0.423582 | 1.054029 | 0.015808 | 0 | 0 | 1.088342 | 0.404153 | 0 | -0.013073 | ... | False | False | True | False | False | False | False | False | False | True |
4 | 1 | -0.364507 | 0.839627 | 0.122740 | 0 | 0 | -0.785019 | -2.167023 | 0 | -0.013073 | ... | False | False | False | True | False | False | False | False | True | False |
5 rows × 28 columns
Observation:¶
Final scaling was applied to numerical features after encoding to ensure all features are on a comparable scale. One-Hot encoded categorical variables were excluded from scaling as they are already in binary form (0 or 1).
7. Dimensionality Reduction¶
# ---------------------------
# Dimensionality Reduction using PCA (Optional Visualization)
# ---------------------------
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
# Apply PCA - Reduce dimensions to 2 for visualization
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)
# Add PCA results to the dataframe
data['PCA1'] = pca_result[:, 0]
data['PCA2'] = pca_result[:, 1]
# Scatter plot of PCA components
plt.figure(figsize=(8,6))
sns.scatterplot(x='PCA1', y='PCA2', hue='NObeyesdad', data=data, palette='Set1')
plt.title('PCA: 2D Projection of Obesity Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Obesity Level', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()
# Explained Variance Ratio
print("Explained Variance by PCA Components:")
print(pca.explained_variance_ratio_)
Explained Variance by PCA Components: [0.22648604 0.18657893]
PCA Projection Observation:
- General appearance: The clusters are highly overlapped, especially for Normal, Overweight, and Obesity Type I. Obesity Type II and III show some mild separation, mostly toward the right side of the plot.
- Interpretation difficulty: Since PCA is linear, it struggles with the complex, non-linear relationships in high-dimensional data (like human lifestyle and obesity).
PCA (Principal Component Analysis) — Observations:¶
- PCA reduced the 8-dimensional feature space to 2 dimensions for visualization.
- The scatter plot shows that some obesity levels such as Insufficient Weight, Normal Weight, and Obesity Type I are reasonably separated, but overlap exists in middle classes like Overweight Level I & II.
- The explained variance ratio indicates how much of the total information (variance) is captured by the first two components.
- Although PCA helped visualize data structure, dimensionality reduction was not necessary for model building as the original features were manageable (only 8 numerical features).
Conclusion:¶
PCA was applied solely for visualization purposes to understand class separability in lower dimensions. It was not used for model training as feature dimensionality was already low.
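To gauge how much information the 2D projection discards, the explained-variance curve can be extended over all eight components; a short sketch reusing numeric_cols and X_scaled from the earlier cells:
# Cumulative explained variance over all 8 principal components.
pca_full = PCA(n_components=len(numeric_cols)).fit(X_scaled)
cumvar = np.cumsum(pca_full.explained_variance_ratio_)
for i, v in enumerate(cumvar, start=1):
    print(f"{i} components: {v:.1%} of total variance")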
Data Preprocessing Summary:¶
- Missing Values: None found.
- Duplicate Records: Removed.
- Outliers: Detected and removed via DBSCAN (9 records).
- Categorical Encoding: Label Encoding (binary) and One-Hot Encoding (multi-class) applied.
- Feature Scaling: StandardScaler applied to numerical features.
- Dimensionality Reduction: PCA applied for DBSCAN visualization (optional).
Data is now cleaned, encoded, scaled, and outlier-free — ready for machine learning model building.
final_data = df_encoded  # After encoding + scaling (DBSCAN outlier removal was applied to the separate copy data_cleaned)
# Final data confirmation before modeling
8. Model Preparation¶
# Select Features and Target Variable:
X = df_encoded.drop('NObeyesdad', axis=1) # All columns except target
y = df_encoded['NObeyesdad'] # Target variable (encoded)
# Train-Test Split:
from sklearn.model_selection import train_test_split
# Split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)
print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)
Train shape: (1688, 27)
Test shape : (423, 27)
# Verify Data Shape and Balance:
print("Training target class distribution:\n", y_train.value_counts())
print("\nTesting target class distribution:\n", y_test.value_counts())
Training target class distribution:
NObeyesdad
2    281
4    259
3    237
5    232
6    232
1    229
0    218
Name: count, dtype: int64

Testing target class distribution:
NObeyesdad
2    70
4    65
3    60
1    58
6    58
5    58
0    54
Name: count, dtype: int64
Observation :¶
Features and the target variable were successfully separated, with 'NObeyesdad' chosen as the target. The dataset was split into training (80%) and testing (20%) sets using stratified sampling to maintain the class distribution.
The class distribution of the target variable 'NObeyesdad' in both training and testing sets is consistent across all 7 classes. This confirms that stratified sampling was successful, ensuring that no class is missing or severely underrepresented. The dataset is well-balanced and suitable for building classification models.
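As an optional sanity check (a small sketch, not in the original notebook), the class proportions of the two splits can be compared side by side; with stratify=y the two columns should be nearly identical:
# Proportions instead of raw counts make the comparison split-size independent
import pandas as pd
proportions = pd.DataFrame({
    'train': y_train.value_counts(normalize=True).round(3),
    'test': y_test.value_counts(normalize=True).round(3),
}).sort_index()
print(proportions)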
9. Model Building¶
1. Logistic Regression¶
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Initialize Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42)
# Train the model
lr_model.fit(X_train, y_train)
# Predict on test set
y_pred_lr = lr_model.predict(X_test)
# Evaluate the model
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(" Logistic Regression Accuracy:", lr_accuracy)
precision = precision_score(y_test, y_pred_lr, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred_lr, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_lr, average='macro', zero_division=0)
print("Precision (Macro):", precision)
print("Recall (Macro):", recall)
print("F1 Score (Macro):", f1)
# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_lr))
# Confusion Matrix
cm_lr = confusion_matrix(y_test, y_pred_lr)
plt.figure(figsize=(8,6))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.tight_layout()
plt.show()
Logistic Regression Accuracy: 0.8794326241134752
Precision (Macro): 0.8758734880731105
Recall (Macro): 0.8769248813583789
F1 Score (Macro): 0.8758005003057373

Classification Report:

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        54
           1       0.81      0.74      0.77        58
           2       0.88      0.93      0.90        70
           3       0.95      0.97      0.96        60
           4       1.00      0.98      0.99        65
           5       0.74      0.74      0.74        58
           6       0.82      0.78      0.80        58

    accuracy                           0.88       423
   macro avg       0.88      0.88      0.88       423
weighted avg       0.88      0.88      0.88       423
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc
# Binarize the output for multi-class ROC AUC
y_test_binarized = label_binarize(y_test, classes=np.unique(y))
n_classes = y_test_binarized.shape[1]
# OneVsRest Classifier with Logistic Regression
classifier = OneVsRestClassifier(LogisticRegression(max_iter=1000))
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
# Compute ROC AUC Score
roc_auc = roc_auc_score(y_test_binarized, y_score, average='macro', multi_class='ovr')
print("ROC AUC Score (Macro):", roc_auc)
# Plot ROC Curve for each class
fpr = dict()
tpr = dict()
roc_auc_dict = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score[:, i])
roc_auc_dict[i] = auc(fpr[i], tpr[i])
# Plot all ROC curves
plt.figure(figsize=(8,6))
for i in range(n_classes):
plt.plot(fpr[i], tpr[i], label=f'Class {i} (AUC = {roc_auc_dict[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-Class ROC Curves - Logistic Regression')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
ROC AUC Score (Macro): 0.9349332959360489
Cross-Validation (CV Score):¶
Perform 5-fold cross-validation for robustness checking.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(lr_model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
Cross-Validation Scores: [0.70921986 0.92417062 0.9028436  0.92890995 0.91232227]
Mean CV Accuracy: 0.8754932607307317
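The first fold scores well below the rest (0.709 vs. roughly 0.90+). For classifiers, cross_val_score defaults to an unshuffled StratifiedKFold, so this likely reflects the row ordering of the CSV rather than the model itself. A sketch of the same check with shuffled folds:
# Shuffle before splitting so each fold sees a random mix of rows
from sklearn.model_selection import StratifiedKFold, cross_val_score
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_shuffled = cross_val_score(lr_model, X, y, cv=skf, scoring='accuracy')
print("Shuffled CV Scores:", cv_scores_shuffled)
print("Mean Shuffled CV Accuracy:", cv_scores_shuffled.mean())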
Model Interpretability (Coefficients):¶
We can also examine the feature coefficients to understand which features most strongly affect predictions:
# Get feature names
feature_names = X.columns
# Coefficients for each class
coefficients = lr_model.coef_
# Display as DataFrame
coef_df = pd.DataFrame(coefficients.T, index=feature_names, columns=[f'Class {i}' for i in range(coefficients.shape[0])])
print("Logistic Regression Coefficients:\n")
print(coef_df)
Logistic Regression Coefficients:

                                 Class 0    Class 1    Class 2    Class 3    Class 4    Class 5    Class 6
Gender                          -0.429949   0.316381  -0.335083   3.398226  -3.503705  -0.386375   0.940505
Age                             -0.941897  -0.413739  -0.080669   1.244417  -0.090567  -0.199143   0.481599
Height                           3.239327   1.653316  -1.578265  -2.465515  -2.010264   1.007957   0.153444
Weight                         -11.976597  -6.470442   4.611162   9.406685   7.376912  -2.790594  -0.157126
family_history_with_overweight  -0.879616  -0.801501   0.735398   0.091287   0.326158  -0.387511   0.915785
FAVC                            -0.682585  -0.364902   1.214313  -0.342510   0.716697   0.646274  -1.187286
FCVC                             0.019887  -0.300228  -0.619714  -0.403955   2.051586  -0.432425  -0.315151
NCP                              0.062357  -0.048067  -0.246730  -0.290571   0.956527  -0.112152  -0.321364
SMOKE                           -0.648291   0.527317   0.784747   0.119411   0.038176  -0.405847  -0.415512
CH2O                             0.036217  -0.225322   0.212868  -0.491938   0.327038   0.121629   0.019509
SCC                             -0.278995  -0.031567  -0.567217   0.006174   0.008546   1.065430  -0.202370
FAF                              0.235083   0.356597   0.232735  -0.189270  -0.691405   0.130293  -0.074032
TUE                              0.113262  -0.057673   0.047880  -0.140576  -0.142357  -0.040980   0.220444
DBSCAN_Cluster_All               0.031188  -0.842781  -0.250839   0.061780  -0.000688  -0.129104   1.130443
CAEC_Always                     -0.579802   0.994178   0.329750  -0.076151  -0.385212   0.014129  -0.296892
CAEC_Frequently                  1.187350   0.433318  -0.498479  -0.500669  -0.001960  -0.759419   0.139858
CAEC_Sometimes                  -0.503878  -1.007857   0.405290   0.528388   0.353009  -0.142453   0.367501
CAEC_no                         -0.027344  -0.447386  -0.190840  -0.041420   0.011071   0.893001  -0.197081
CALC_Always                     -0.028674   0.061468   0.001541   0.000293   0.000120  -0.020166  -0.014582
CALC_Frequently                 -0.317710   0.208513   0.365622  -0.694914  -0.007233   0.131717   0.314005
CALC_Sometimes                   0.080027  -0.348666  -0.507607   0.103877   0.882147   0.255111  -0.464888
CALC_no                          0.342682   0.050938   0.186164   0.500892  -0.898125  -0.361403   0.178851
MTRANS_Automobile                0.542749  -0.411899   0.213361  -0.373350  -0.599103   0.427433   0.200810
MTRANS_Bike                     -0.491028   0.419794  -0.013020   0.002942   0.001102   0.448196  -0.367986
MTRANS_Motorbike                -0.136606   0.356163   0.678742  -0.173364   0.002147  -0.730185   0.003103
MTRANS_Public_Transportation     0.049877  -0.921110  -0.395199   0.533385   0.563824  -0.188684   0.357908
MTRANS_Walking                   0.111333   0.529306  -0.438163  -0.079464   0.008939   0.048499  -0.180450
Observation :¶
Why was Logistic Regression applied to a categorical target?
The target variable 'NObeyesdad' is a categorical variable with seven distinct classes representing different obesity levels. Logistic Regression is not limited to binary classification; it can also handle multi-class classification tasks.
Scikit-learn's LogisticRegression supports multi-class classification using either the One-vs-Rest (OvR) approach or the Multinomial (Softmax) approach, depending on the solver used. With multi_class='auto' (the default), the model automatically selects the appropriate strategy for multi-class problems.
In this project, Logistic Regression is used to predict the probability of each class, and the final output corresponds to the class with the highest probability. Therefore, the application of Logistic Regression is appropriate and valid for this multi-class classification problem.
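To make that concrete, here is a small sketch (not in the original notebook) showing that the fitted model emits one probability per class, the rows sum to 1, and predict() simply returns the most probable class:
import numpy as np
probs = lr_model.predict_proba(X_test[:3])  # probabilities for the first 3 test rows
print("Class probabilities:\n", probs.round(3))
print("Row sums:", probs.sum(axis=1).round(3))  # each row sums to ~1.0
print("Most probable class:", lr_model.classes_[np.argmax(probs, axis=1)])
print("predict() output   :", lr_model.predict(X_test[:3]))  # identical result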
Logistic Regression was applied as a baseline classification model to predict the 'NObeyesdad' (obesity level) target variable. The model was trained using the training set and tested on unseen data to evaluate its generalization capability.
The evaluation metrics — Accuracy, Precision (Macro), Recall (Macro), and F1 Score (Macro) — provided a detailed understanding of the model’s performance across all seven classes. The use of macro averaging ensures that each class contributes equally to the final score, regardless of class imbalance.
A detailed classification report was generated, highlighting the per-class performance in terms of precision, recall, and F1-score, helping to identify classes that were predicted well and those that may require further attention.
The confusion matrix heatmap visualized the model’s prediction distribution, showing how often samples from each actual class were predicted correctly or misclassified into other classes.
Overall, Logistic Regression offered an interpretable and efficient model for establishing a baseline performance in this multi-class classification problem.
Observation on ROC-AUC, One-vs-Rest, Cross-Validation, and Model Interpretability:¶
ROC-AUC Score (One-vs-Rest): Since the target variable 'NObeyesdad' is multi-class, the ROC-AUC was computed using the One-vs-Rest (OvR) strategy. OvR breaks the multi-class problem into several binary classification tasks, enabling the calculation of ROC curves and AUC scores for each class separately. This provides a deeper insight into the model’s ability to distinguish between each individual class. The macro-averaged AUC score reflects the overall class separability of the model.
Cross-Validation (CV): To ensure the model's performance is consistent and not dependent on a specific random train-test split, 5-fold Cross-Validation was applied. Cross-validation divides the data into multiple folds and tests the model on unseen folds iteratively, providing a more reliable estimate of its generalization performance and reducing the risk of overfitting.
Model Interpretability (Coefficients): Logistic Regression allows interpretation through its learned coefficients. The coefficients indicate the strength and direction (positive or negative) of the relationship between each feature and the target classes. By examining these values, we can understand which features most strongly influence the prediction of obesity levels, making the model explainable and transparent — an important aspect for health-related datasets like obesity prediction.
These additional evaluations make the Logistic Regression model not only predictive but also explainable, stable, and thoroughly validated.
2. KNN¶
# Import required libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# Initialize KNN Classifier (you can choose k=5 or tune further)
knn_model = KNeighborsClassifier(n_neighbors=5)
# Train the model
knn_model.fit(X_train, y_train)
# Predict on test set
y_pred_knn = knn_model.predict(X_test)
# Evaluate the model
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print(" KNN Accuracy:", knn_accuracy)
# Precision, Recall, F1 (Macro Average)
knn_precision = precision_score(y_test, y_pred_knn, average='macro', zero_division=0)
knn_recall = recall_score(y_test, y_pred_knn, average='macro', zero_division=0)
knn_f1 = f1_score(y_test, y_pred_knn, average='macro', zero_division=0)
print("Precision (Macro):", knn_precision)
print("Recall (Macro):", knn_recall)
print("F1 Score (Macro):", knn_f1)
# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_knn))
# Confusion Matrix
cm_knn = confusion_matrix(y_test, y_pred_knn)
print("\nConfusion Matrix:\n", cm_knn)
# Plot Confusion Matrix Heatmap
plt.figure(figsize=(8,6))
sns.heatmap(cm_knn, annot=True, fmt='d', cmap='Purples')
plt.title("Confusion Matrix - KNN")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.tight_layout()
plt.show()
# Cross-Validation
cv_scores_knn = cross_val_score(knn_model, X, y, cv=5, scoring='accuracy')
print("KNN Cross-Validation Scores:", cv_scores_knn)
print("Mean CV Accuracy:", cv_scores_knn.mean())
# Hyperparameter Tuning (k-value tuning)
param_grid = {'n_neighbors': range(1, 15)}
grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_knn.fit(X_train, y_train)
print("Best K value:", grid_knn.best_params_)
print("Best Accuracy Score:", grid_knn.best_score_)
KNN Accuracy: 0.8368794326241135
Precision (Macro): 0.8351248897301529
Recall (Macro): 0.8307751453071649
F1 Score (Macro): 0.8250029858921464

Classification Report:

              precision    recall  f1-score   support

           0       0.81      0.96      0.88        54
           1       0.81      0.50      0.62        58
           2       0.77      0.97      0.86        70
           3       0.95      0.95      0.95        60
           4       0.98      1.00      0.99        65
           5       0.73      0.66      0.69        58
           6       0.79      0.78      0.78        58

    accuracy                           0.84       423
   macro avg       0.84      0.83      0.83       423
weighted avg       0.84      0.84      0.83       423

Confusion Matrix:
[[52  2  0  0  0  0  0]
 [ 8 29  5  0  0 11  5]
 [ 0  0 68  1  0  0  1]
 [ 0  0  3 57  0  0  0]
 [ 0  0  0  0 65  0  0]
 [ 3  5  6  0  0 38  6]
 [ 1  0  6  2  1  3 45]]
KNN Cross-Validation Scores: [0.74468085 0.84834123 0.85545024 0.84123223 0.86729858]
Mean CV Accuracy: 0.8314006251890692
Best K value: {'n_neighbors': 1}
Best Accuracy Score: 0.850129053781188
Observation for K-Nearest Neighbors (KNN) Model Evaluation:¶
The K-Nearest Neighbors (KNN) model was applied to predict the 'NObeyesdad' target variable. To thoroughly evaluate the model's performance and ensure its reliability, various evaluation methods and metrics were used, each serving a distinct and important purpose:
Accuracy Score: The accuracy metric measures the overall correctness of the model by computing the ratio of correctly predicted observations to the total number of observations. It provides a general idea of how well the KNN model performs on the test data.
Precision, Recall, and F1-Score (Macro Average):
- Precision (Macro Average) indicates the ability of the classifier not to label a negative sample as positive. It is particularly important when the cost of false positives is high.
- Recall (Macro Average) evaluates the model's ability to correctly identify all relevant instances (minimizing false negatives). This is crucial for understanding if the model misses any particular class frequently.
- F1-Score (Macro Average) is the harmonic mean of precision and recall, offering a balanced metric especially useful when dealing with class imbalance. Macro averaging treats all classes equally, regardless of their frequency in the dataset.
Confusion Matrix: The confusion matrix provides a detailed breakdown of the number of correct and incorrect predictions made by the classifier for each class. It highlights which specific classes the model is struggling with and gives insights into class-level performance beyond aggregate metrics like accuracy.
Cross-Validation (CV Scores): 5-Fold Cross-Validation was used to assess the stability and robustness of the KNN model. By dividing the data into five folds and validating the model on each, this method provides a more reliable estimate of model performance on unseen data and reduces the risk of overfitting or underfitting due to a particular train-test split. The mean cross-validation accuracy indicates consistent performance across various subsets of the data.
Hyperparameter Tuning (Best k-value using GridSearchCV): The choice of the 'k' parameter in KNN (number of neighbors) greatly influences the model's performance: large k over-smooths the decision boundary (high bias), while small k makes predictions noisy (high variance). GridSearchCV systematically explored k from 1 to 14 and found that k = 1 yielded the highest cross-validation accuracy. Since k = 1 is the most variance-prone setting, this result suggests the classes form tight local neighborhoods in the scaled feature space, though it should be interpreted with some caution; the curve of CV accuracy against k is sketched below.
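To see how decisively k = 1 wins, the scores already stored by GridSearchCV can be plotted against k. A minimal sketch, assuming grid_knn from the tuning cell above:
# GridSearchCV keeps the mean CV score for every candidate k
import matplotlib.pyplot as plt
k_values = [p['n_neighbors'] for p in grid_knn.cv_results_['params']]
mean_scores = grid_knn.cv_results_['mean_test_score']
plt.plot(k_values, mean_scores, marker='o')
plt.xlabel('k (n_neighbors)')
plt.ylabel('Mean CV Accuracy')
plt.title('KNN: CV Accuracy vs. k')
plt.grid(True)
plt.show()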
Conclusion:¶
Using this combination of evaluation techniques provides a comprehensive understanding of the KNN model’s strengths and limitations. It ensures that the model performs reliably across various metrics and data splits and is tuned to the best hyperparameter (k) for optimal predictive power. This thorough evaluation process is critical in ensuring the robustness and generalizability of the model in practical applications like obesity level prediction.
3. Random Forest¶
# Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
# Initialize Random Forest model
rf_model = RandomForestClassifier(random_state=42)
# Train the model
rf_model.fit(X_train, y_train)
# Predict on test set
y_pred_rf = rf_model.predict(X_test)
# Evaluate the model
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(" Random Forest Accuracy:", rf_accuracy)
rf_precision = precision_score(y_test, y_pred_rf, average='macro', zero_division=0)
rf_recall = recall_score(y_test, y_pred_rf, average='macro', zero_division=0)
rf_f1 = f1_score(y_test, y_pred_rf, average='macro', zero_division=0)
print("Precision (Macro):", rf_precision)
print("Recall (Macro):", rf_recall)
print("F1 Score (Macro):", rf_f1)
# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_rf))
# Confusion Matrix
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix:\n", cm_rf)
# Plot Confusion Matrix
plt.figure(figsize=(8,6))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens')
plt.title("Confusion Matrix - Random Forest")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.tight_layout()
plt.show()
# Cross-validation
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')
print("Random Forest Cross-Validation Scores:", cv_scores_rf)
print("Mean CV Accuracy:", cv_scores_rf.mean())
# Hyperparameter Tuning (Optional but Recommended)
param_grid_rf = {
'n_estimators': [50, 100, 200],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5]
}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf,
                       cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_rf.fit(X_train, y_train)
print("Best Parameters for Random Forest:", grid_rf.best_params_)
print("Best F1 score after Tuning:", grid_rf.best_score_)
Random Forest Accuracy: 0.9527186761229315
Precision (Macro): 0.9588082601461636
Recall (Macro): 0.9516231898005297
F1 Score (Macro): 0.9529704760591676

Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        54
           1       0.79      0.98      0.88        58
           2       0.96      0.96      0.96        70
           3       1.00      0.98      0.99        60
           4       1.00      0.98      0.99        65
           5       0.98      0.86      0.92        58
           6       0.98      0.97      0.97        58

    accuracy                           0.95       423
   macro avg       0.96      0.95      0.95       423
weighted avg       0.96      0.95      0.95       423

Confusion Matrix:
[[50  4  0  0  0  0  0]
 [ 0 57  0  0  0  1  0]
 [ 0  2 67  0  0  0  1]
 [ 0  0  1 59  0  0  0]
 [ 0  1  0  0 64  0  0]
 [ 0  7  1  0  0 50  0]
 [ 0  1  1  0  0  0 56]]
Random Forest Cross-Validation Scores: [0.71158392 0.98341232 0.98341232 0.98104265 0.98578199]
Mean CV Accuracy: 0.9290466426898817
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Best F1 score after Tuning: 0.946788987407697
Detailed Observation for Random Forest Classifier:¶
The Random Forest Classifier was applied to predict the 'NObeyesdad' target variable. To assess the model thoroughly, various evaluation techniques and parameters were used:
Accuracy Score: The overall correctness of predictions was measured using the accuracy metric, providing a general measure of how well the Random Forest model performs on the test data.
Precision, Recall, and F1-Score (Macro Average):
- Precision (Macro Average) represents how many of the predicted positive observations were actually correct across all classes.
- Recall (Macro Average) shows the model's ability to identify all relevant instances for each class.
- F1-Score (Macro Average) balances precision and recall, and is useful in the presence of class imbalance as it treats each class equally.
Confusion Matrix: A confusion matrix was plotted to visualize correct versus incorrect predictions for each class, helping to identify which classes the model confuses most often.
Cross-Validation (CV Scores): 5-Fold Cross-Validation was performed to assess the model’s stability across different subsets of the data. The mean CV score provides an estimate of the model’s generalization performance, reducing dependence on the train-test split.
Hyperparameter Tuning (GridSearchCV):
Random Forest has several important hyperparameters, such as n_estimators, max_depth, and min_samples_split. GridSearchCV was used to search for the best combination of these hyperparameters, optimizing the macro F1 score. The optimal parameters identified improved the cross-validation performance further, making the model more reliable.
Conclusion:¶
By using these evaluation methods, the Random Forest model’s performance, stability, and robustness were comprehensively assessed. Hyperparameter tuning ensured the model was configured optimally, improving its ability to generalize to unseen data. The combination of these techniques makes Random Forest a powerful and interpretable model for this multi-class classification problem.
# Feature Importance
import pandas as pd
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False).plot(kind='bar', figsize=(10,4), color='orange')
plt.title("Feature Importance from Random Forest")
plt.ylabel("Importance Score")
plt.tight_layout()
plt.show()
# Print top features
print("Top 5 important features:\n", feature_importances.sort_values(ascending=False).head())
Top 5 important features:
Weight    0.290785
FCVC      0.097970
Age       0.092497
Height    0.089875
Gender    0.052231
dtype: float64
# Learning Curve
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(
rf_model, X, y, cv=5, scoring='accuracy', n_jobs=-1,
train_sizes=np.linspace(0.1, 1.0, 5))
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)
plt.plot(train_sizes, train_mean, label='Training Score')
plt.plot(train_sizes, test_mean, label='Cross-validation Score')
plt.title("Learning Curve - Random Forest")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy")
plt.legend()
plt.grid(True)
plt.show()
Observation on Feature Importance and Learning Curve for Random Forest:¶
Feature Importance:
Random Forest Classifier provides a measure of feature importance which indicates the relative contribution of each feature in making predictions. The importance score is derived from how much each feature decreases impurity (like Gini Index) across all decision trees in the forest.
For this obesity level prediction model, the top 5 important features identified are:
- Weight (Importance: 0.2908): the highest score by a wide margin, indicating that an individual's weight plays the most critical role in determining their obesity level. This is expected, as body weight is directly correlated with obesity classification.
- FCVC (Frequency of Consumption of Vegetables) (Importance: 0.0980): the frequency of vegetable consumption reflects dietary quality, making it an important lifestyle-related factor in obesity prediction.
- Age (Importance: 0.0925): age contributes significantly, since eating habits, physical activity, and metabolic rate change with age and affect obesity risk.
- Height (Importance: 0.0899): height helps determine BMI (Body Mass Index), a fundamental factor in categorizing obesity levels.
- Gender (Importance: 0.0522): gender captures physiological and metabolic differences that influence body fat distribution and obesity risk.
The dominance of Weight, together with Height and Age (the inputs to BMI plus a key demographic factor), aligns with medical knowledge, validating the Random Forest model's interpretability and relevance to real-world obesity assessment.
Learning Curve Analysis:
The learning curve was plotted to examine the model’s training and cross-validation accuracy as the size of the training set increased.
- If the training and validation scores converge to a high value, this suggests good model generalization with low bias and low variance.
- A large gap between training and validation scores would indicate overfitting (high variance), while both being low would suggest underfitting (high bias).
In this project, the learning curve showed that as the training size increased, the cross-validation accuracy approached the training accuracy, indicating that the Random Forest model generalized well to unseen data without severe overfitting or underfitting.
Conclusion:¶
By analyzing feature importance, it was confirmed that Weight is by far the most influential factor in predicting obesity levels, followed by FCVC, Age, and Height, a result that aligns well with domain knowledge. The learning curve further validated that the Random Forest model is well-trained and stable, making it suitable for practical deployment in obesity prediction tasks.
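As a cross-check on the impurity-based ranking (which can favor features with many distinct values), permutation importance on the held-out test set gives a model-agnostic view. A hedged sketch, assuming rf_model, X_test, and y_test from above:
# Shuffle each feature 10 times and record the resulting drop in test accuracy
from sklearn.inspection import permutation_importance
import pandas as pd
perm = permutation_importance(rf_model, X_test, y_test,
                              n_repeats=10, random_state=42, n_jobs=-1)
perm_series = pd.Series(perm.importances_mean, index=X_test.columns)
print("Top 5 features by permutation importance:\n",
      perm_series.sort_values(ascending=False).head())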
4. SVM¶
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
# Initialize SVM model
svm_model = SVC(probability=True, random_state=42)
# Train the model
svm_model.fit(X_train, y_train)
# Predict on test data
y_pred_svm = svm_model.predict(X_test)
# Evaluation metrics
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(" SVM Accuracy:", svm_accuracy)
svm_precision = precision_score(y_test, y_pred_svm, average='macro', zero_division=0)
svm_recall = recall_score(y_test, y_pred_svm, average='macro', zero_division=0)
svm_f1 = f1_score(y_test, y_pred_svm, average='macro', zero_division=0)
print("Precision (Macro):", svm_precision)
print("Recall (Macro):", svm_recall)
print("F1 Score (Macro):", svm_f1)
# Classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_svm))
# Confusion Matrix
cm_svm = confusion_matrix(y_test, y_pred_svm)
print("\nConfusion Matrix:\n", cm_svm)
# Plot Confusion Matrix
plt.figure(figsize=(8,6))
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='coolwarm')
plt.title("Confusion Matrix - SVM")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.tight_layout()
plt.show()
# Cross-validation score for SVM
cv_scores_svm = cross_val_score(svm_model, X, y, cv=5, scoring='accuracy')
print("SVM Cross-Validation Scores:", cv_scores_svm)
print("Mean CV Accuracy:", cv_scores_svm.mean())
# Hyperparameter tuning using GridSearchCV
param_grid_svm = {
'C': [0.1, 1, 10],
'kernel': ['linear', 'rbf'],
'gamma': ['scale', 'auto']
}
grid_svm = GridSearchCV(SVC(probability=True), param_grid_svm, cv=5, scoring='accuracy')
grid_svm.fit(X_train, y_train)
print("Best Parameters for SVM:", grid_svm.best_params_)
print("Best CV Accuracy after Tuning:", grid_svm.best_score_)
SVM Accuracy: 0.9267139479905437
Precision (Macro): 0.9279454672311817
Recall (Macro): 0.9240245523496755
F1 Score (Macro): 0.9253223217722298

Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        54
           1       0.78      0.88      0.83        58
           2       0.94      0.97      0.96        70
           3       1.00      0.98      0.99        60
           4       1.00      1.00      1.00        65
           5       0.84      0.81      0.82        58
           6       0.93      0.88      0.90        58

    accuracy                           0.93       423
   macro avg       0.93      0.92      0.93       423
weighted avg       0.93      0.93      0.93       423

Confusion Matrix:
[[51  3  0  0  0  0  0]
 [ 0 51  0  0  0  7  0]
 [ 0  0 68  0  0  1  1]
 [ 0  0  1 59  0  0  0]
 [ 0  0  0  0 65  0  0]
 [ 0  8  0  0  0 47  3]
 [ 0  3  3  0  0  1 51]]
SVM Cross-Validation Scores: [0.78959811 0.96208531 0.93601896 0.95023697 0.98104265]
Mean CV Accuracy: 0.923796399000594
Best Parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best CV Accuracy after Tuning: 0.957939002335259
# Hyperparameter tuning using GridSearchCV
param_grid_svm = {
'C': [0.1, 1, 10],
'kernel': ['linear', 'rbf'],
'gamma': ['scale', 'auto']
}
grid_svm = GridSearchCV(
SVC(probability=True, random_state=42),
param_grid_svm,
cv=5,
scoring='f1_macro', # optimize for Macro F1
n_jobs=-1,
verbose=1
)
grid_svm.fit(X_train, y_train)
print("Best Parameters for SVM:", grid_svm.best_params_)
print("Best CV Macro-F1 after Tuning:", grid_svm.best_score_)
# Evaluate tuned SVM on the test set
best_svm = grid_svm.best_estimator_
y_pred_svm_tuned = best_svm.predict(X_test)
# Test-set metrics
svm_accuracy_tuned = accuracy_score(y_test, y_pred_svm_tuned)
svm_f1_tuned = f1_score(y_test, y_pred_svm_tuned, average='macro', zero_division=0)
print("\nTuned SVM Test Accuracy :", svm_accuracy_tuned)
print("Tuned SVM Test Macro-F1 :", svm_f1_tuned)
# (Optional) Updated classification report and confusion matrix for tuned SVM
print("\nClassification Report (Tuned SVM):\n")
print(classification_report(y_test, y_pred_svm_tuned, zero_division=0))
cm_svm_tuned = confusion_matrix(y_test, y_pred_svm_tuned)
plt.figure(figsize=(8,6))
sns.heatmap(cm_svm_tuned, annot=True, fmt='d', cmap='coolwarm')
plt.title("Confusion Matrix - Tuned SVM")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.tight_layout()
plt.show()
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best CV Macro-F1 after Tuning: 0.9565826722040933

Tuned SVM Test Accuracy : 0.9621749408983451
Tuned SVM Test Macro-F1 : 0.9615480692028925

Classification Report (Tuned SVM):

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        54
           1       0.95      0.91      0.93        58
           2       0.97      0.96      0.96        70
           3       0.97      1.00      0.98        60
           4       1.00      0.98      0.99        65
           5       0.93      0.93      0.93        58
           6       0.95      0.95      0.95        58

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.96      0.96      0.96       423
Observation for Support Vector Machine (SVM) Classifier:¶
Accuracy, Precision, Recall, and F1-Score (Macro Average): The SVM classifier was applied to predict 'NObeyesdad'. The model’s performance was evaluated using accuracy and macro-averaged precision, recall, and F1-score. These metrics ensured the model was assessed equally across all classes, even in the presence of class imbalance. SVM demonstrated its ability to classify data by maximizing the decision margin between obesity classes.
Confusion Matrix: The confusion matrix provided detailed insights into the model’s class-wise prediction performance. It showed which obesity categories were predicted correctly and where misclassifications occurred, helping to identify specific class-level weaknesses.
Cross-Validation (CV Scores): To ensure the SVM model’s stability and to reduce the dependency on a single train-test split, 5-fold cross-validation was performed. The mean cross-validation accuracy confirmed that the model generalizes well to unseen data and does not overfit the training set.
Hyperparameter Tuning (GridSearchCV): SVM hyperparameters, such as the regularization parameter C, kernel type, and gamma, were optimized using GridSearchCV. This process identified the best hyperparameter combination to maximize prediction performance. Hyperparameter tuning significantly impacts SVM because its performance is sensitive to these parameters, which control margin width and decision boundary flexibility.
Conclusion:¶
The SVM model successfully classified the multi-class obesity level data by creating optimal separating hyperplanes between classes. Cross-validation results showed model robustness, while hyperparameter tuning improved performance. SVM proved to be an effective classifier, especially in high-dimensional feature space, providing competitive predictive power alongside other models like Random Forest and KNN.
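Since the tuned kernel turned out to be linear, the fitted SVM also exposes weight vectors (one per one-vs-one class pair) that can be inspected much like the Logistic Regression coefficients. A minimal sketch, assuming best_svm from the tuning step above:
import pandas as pd
if best_svm.kernel == 'linear':  # coef_ only exists for the linear kernel
    svm_coef = pd.DataFrame(best_svm.coef_, columns=X.columns)
    # Average absolute weight across all class pairs as a rough importance proxy
    print(svm_coef.abs().mean().sort_values(ascending=False).head())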
5. Decision Tree Classifier¶
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
# Initialize Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
# Train the model
dt_model.fit(X_train, y_train)
# Predict on test set
y_pred_dt = dt_model.predict(X_test)
# Evaluation
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_precision = precision_score(y_test, y_pred_dt, average='macro', zero_division=0)
dt_recall = recall_score(y_test, y_pred_dt, average='macro', zero_division=0)
dt_f1 = f1_score(y_test, y_pred_dt, average='macro', zero_division=0)
print("Decision Tree Accuracy:", dt_accuracy)
print("Precision (Macro):", dt_precision)
print("Recall (Macro):", dt_recall)
print("F1 Score (Macro):", dt_f1)
# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_dt))
# Confusion Matrix
cm_dt = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(8,6))
sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Greens')
plt.title("Confusion Matrix - Decision Tree")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.tight_layout()
plt.show()
# Cross-validation
cv_scores_dt = cross_val_score(dt_model, X, y, cv=5, scoring='accuracy')
print("Decision Tree Cross-Validation Scores:", cv_scores_dt)
print("Mean CV Accuracy:", cv_scores_dt.mean())
Decision Tree Accuracy: 0.9054373522458629
Precision (Macro): 0.907484975642342
Recall (Macro): 0.9026409915572478
F1 Score (Macro): 0.9042209303348893

Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.85      0.90        54
           1       0.75      0.84      0.80        58
           2       0.92      0.93      0.92        70
           3       0.97      0.95      0.96        60
           4       1.00      0.98      0.99        65
           5       0.84      0.84      0.84        58
           6       0.91      0.91      0.91        58

    accuracy                           0.91       423
   macro avg       0.91      0.90      0.90       423
weighted avg       0.91      0.91      0.91       423
Decision Tree Cross-Validation Scores: [0.8463357  0.95023697 0.94075829 0.92890995 0.95260664]
Mean CV Accuracy: 0.9237695091481519
Insights from Decision Tree Classifier¶
The Decision Tree Classifier was used to predict the obesity level of individuals based on attributes like eating habits, physical condition, and demographic factors.
Why Use Decision Tree in This Project?¶
- Interpretability: Decision Trees provide a visual and logical explanation of decisions made at each split. This is valuable for health-related applications, where transparency in prediction logic is important.
- Handles Mixed Data: It can handle both numerical and categorical features (after encoding), which fits our dataset well.
- No Need for Feature Scaling: Although we applied scaling, Decision Trees do not rely on feature magnitude, making them less sensitive to preprocessing choices.
- Captures Non-linear Relationships: It captures complex patterns and feature interactions that simpler models (like logistic regression) might miss.
- Performs Well on Multiclass Problems: As our target (NObeyesdad) is multiclass (7 categories), the Decision Tree handles this kind of classification directly.
Performance Summary¶
- Test Accuracy: 90.54%
- Macro Precision: 90.75%
- Macro Recall: 90.26%
- Macro F1 Score: 90.42%
This strong and balanced performance across all metrics suggests that the model generalizes well and performs reliably for each class.
Cross-Validation (5-Fold)¶
- CV Scores: [0.8463, 0.9502, 0.9408, 0.9289, 0.9526]
- Mean CV Accuracy: 92.38%
Apart from the lower first fold (a pattern seen across all models here, likely caused by the unshuffled fold order), the scores are stable with low variance, supporting the model's use in practice and deployment scenarios.
Final Observation¶
The Decision Tree classifier demonstrates excellent predictive power, high interpretability, and consistent results across different subsets of the data. It serves as both an explainable and practical model for obesity level classification.
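To back up the interpretability claim, the top of the learned tree can be printed as plain-text rules (depth capped for readability). A small sketch, assuming dt_model and X from above:
# Show only the first two levels of splits to keep the rules readable
from sklearn.tree import export_text
rules = export_text(dt_model, feature_names=list(X.columns), max_depth=2)
print(rules)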
6. Neural Network (MLPClassifier)¶
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
# Initialize Neural Network model
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam',
max_iter=500, random_state=42)
# Train the model
mlp.fit(X_train, y_train)
# Predict
y_pred_nn = mlp.predict(X_test)
# Evaluate
nn_accuracy = accuracy_score(y_test, y_pred_nn)
nn_precision = precision_score(y_test, y_pred_nn, average='macro', zero_division=0)
nn_recall = recall_score(y_test, y_pred_nn, average='macro', zero_division=0)
nn_f1 = f1_score(y_test, y_pred_nn, average='macro', zero_division=0)
print("Neural Network Accuracy:", nn_accuracy)
print("Precision (Macro):", nn_precision)
print("Recall (Macro):", nn_recall)
print("F1 Score (Macro):", nn_f1)
# Classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_nn))
# Confusion matrix
cm_nn = confusion_matrix(y_test, y_pred_nn)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_nn, annot=True, fmt='d', cmap='Oranges')
plt.title("Confusion Matrix - Neural Network (MLPClassifier)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()
# Cross-validation
cv_scores_nn = cross_val_score(mlp, X, y, cv=5, scoring='accuracy')
print("\nNeural Network Cross-Validation Scores:", cv_scores_nn)
print("Mean CV Accuracy:", cv_scores_nn.mean())
Neural Network Accuracy: 0.9574468085106383
Precision (Macro): 0.9561515606341656
Recall (Macro): 0.956169364297443
F1 Score (Macro): 0.9558386462504265

Classification Report:

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        54
           1       0.93      0.86      0.89        58
           2       0.96      0.99      0.97        70
           3       1.00      0.98      0.99        60
           4       1.00      1.00      1.00        65
           5       0.91      0.90      0.90        58
           6       0.95      0.97      0.96        58

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.96      0.96      0.96       423
Neural Network Cross-Validation Scores: [0.83451537 0.97630332 0.96682464 0.97630332 0.98815166]
Mean CV Accuracy: 0.9484196609637771
# ROC Curve & AUC Score (One-vs-Rest)
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
import numpy as np
# Binarize the output
y_test_bin = label_binarize(y_test, classes=np.unique(y))
y_score = mlp.predict_proba(X_test)
# Compute AUC (macro average)
auc_score = roc_auc_score(y_test_bin, y_score, average='macro', multi_class='ovr')
print("One-vs-Rest AUC Score:", auc_score)
One-vs-Rest AUC Score: 0.9981292174940134
# Log Loss (a.k.a. Cross-Entropy Loss)
from sklearn.metrics import log_loss
logloss = log_loss(y_test, y_score)
print("Log Loss:", logloss)
Log Loss: 0.1353732285282633
from sklearn.metrics import cohen_kappa_score
kappa = cohen_kappa_score(y_test, y_pred_nn) # Use your NN prediction variable
print("Cohen’s Kappa Score:", kappa)
Cohen’s Kappa Score: 0.9502957189299284
# Hamming Loss
from sklearn.metrics import hamming_loss
hl = hamming_loss(y_test, y_pred_nn)
print("Hamming Loss:", hl)
# Matthews Correlation Coefficient (MCC)
from sklearn.metrics import matthews_corrcoef
mcc = matthews_corrcoef(y_test, y_pred_nn)
print("Matthews Correlation Coefficient:", mcc)
Hamming Loss: 0.0425531914893617
Matthews Correlation Coefficient: 0.9503949996555148
Neural Network (MLPClassifier) – Detailed Evaluation & Insights¶
MLPClassifier stands for Multi-Layer Perceptron Classifier and is part of the sklearn.neural_network module in scikit-learn.
It is a feedforward artificial neural network (ANN) that uses backpropagation to learn from data. It is commonly used for classification tasks, especially when the relationships between features and labels are non-linear or complex.
Structure of MLPClassifier:¶
The MLPClassifier consists of:
- Input Layer: One node per input feature.
- Hidden Layers: One or more layers with customizable number of neurons. These learn complex patterns.
- Output Layer: One neuron per class (for multi-class problems).
- Activation Function: Typically uses ReLU (Rectified Linear Unit) for hidden layers and softmax for output in multi-class classification.
- Optimizer: Uses Adam or SGD for weight updates during training.
Why Neural Network?¶
A Neural Network (specifically a Multi-Layer Perceptron or MLP) was used to classify obesity levels based on physical, lifestyle, and dietary features. Unlike traditional models like Logistic Regression or Decision Trees, Neural Networks are powerful function approximators that can learn complex, non-linear relationships between features.
Since our dataset consists of multiple numerical and encoded categorical features — and our target is a multi-class variable (7 obesity levels) — the Neural Network is well-suited for capturing subtle interactions that simpler models might miss.
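As a quick convergence check before looking at the metrics, the training loss recorded by the adam solver can be plotted per iteration. A minimal sketch, assuming the fitted mlp from the code above:
# loss_curve_ stores the training loss after each iteration (adam/sgd solvers)
import matplotlib.pyplot as plt
plt.plot(mlp.loss_curve_)
plt.xlabel('Iteration')
plt.ylabel('Training Loss')
plt.title('MLP Training Loss Curve')
plt.grid(True)
plt.show()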
Model Performance Summary¶
Metric | Value |
---|---|
Accuracy | 95.74% |
Precision (Macro) | 95.62% |
Recall (Macro) | 95.62% |
F1 Score (Macro) | 95.58% |
These results reflect strong predictive performance. The Neural Network maintains high accuracy and balanced precision/recall across all classes — essential in a multi-class classification task like this. The macro averaging ensures fairness across all obesity categories.
Cross-Validation Results¶
- Cross-Validation Scores: [0.8345, 0.9763, 0.9668, 0.9763, 0.9882]
- Mean CV Accuracy: 94.84%
The mean CV accuracy sits close to the test accuracy, and only the first fold is noticeably lower (a pattern shared by the other models, likely caused by the unshuffled fold order rather than the network itself). Overall, the model generalizes well.
Additional Evaluation Metrics (Advanced)¶
Metric | Value | Interpretation |
---|---|---|
One-vs-Rest AUC Score | 0.998 | Measures the model’s ability to distinguish each class from the rest. A value this close to 1 indicates near-perfect class separation. |
Log Loss | 0.135 | Captures the confidence of predictions; lower is better. This low value shows the model assigns high probability to the true classes. |
Cohen’s Kappa | 0.950 | Measures agreement between predicted and true labels adjusted for chance. Values above 0.81 indicate almost perfect agreement. |
Hamming Loss | 0.0426 | Measures the fraction of incorrect class assignments; only about 4% of test samples are mislabeled. |
Matthews Correlation Coefficient (MCC) | 0.950 | Balanced metric accounting for all confusion matrix outcomes. A value this high implies very strong correlation between predicted and actual classes. |
Final Observation¶
The Neural Network model delivered excellent classification performance with very high F1 and accuracy on the test set, outperforming many traditional models. Evaluation with the additional metrics reinforces this picture:
- The low Log Loss (0.135) shows the predicted probabilities are confident and well calibrated, while the high MCC and Kappa (both ~0.95) confirm agreement with the true labels far beyond chance.
- The remaining errors concentrate in the middle weight categories (classes 1 and 5), so there is still room for improvement in robustness (e.g., with stronger L2 regularization via alpha, more data, or early stopping, sketched at the end of this section).
That said, the model's ability to capture complex feature interactions makes it a valuable component of the modeling pipeline.
In conclusion, Neural Network (MLP) serves as a powerful, nonlinear learner in this project. Despite being a black-box model, it shows strong predictive ability and complements other interpretable models like Decision Tree and Random Forest in overall model evaluation and comparison.
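For completeness, a hedged sketch of the early-stopping variant suggested above: 10% of the training data is held out internally, and training stops once the validation score stops improving.
from sklearn.neural_network import MLPClassifier
mlp_es = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam',
                       max_iter=500, random_state=42,
                       early_stopping=True,      # stop on validation plateau
                       validation_fraction=0.1,  # 10% internal validation split
                       n_iter_no_change=10)
mlp_es.fit(X_train, y_train)
print("Best validation score during training:", mlp_es.best_validation_score_)
print("Test accuracy with early stopping    :", mlp_es.score(X_test, y_test))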
7. AdaBoost (Adaptive Boosting)¶
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
roc_auc_score, log_loss, cohen_kappa_score,
matthews_corrcoef, hamming_loss, jaccard_score
)
# 1. Initialize and train AdaBoost model
ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_model.fit(X_train, y_train)
# 2. Predict on test set
y_pred_ada = ada_model.predict(X_test)
# 3. Evaluation metrics
ada_accuracy = accuracy_score(y_test, y_pred_ada)
ada_precision = precision_score(y_test, y_pred_ada, average='macro', zero_division=0)
ada_recall = recall_score(y_test, y_pred_ada, average='macro', zero_division=0)
ada_f1 = f1_score(y_test, y_pred_ada, average='macro', zero_division=0)
# 4. Print results
print(" AdaBoost Accuracy :", ada_accuracy)
print(" AdaBoost Precision (Macro):", ada_precision)
print(" AdaBoost Recall (Macro) :", ada_recall)
print(" AdaBoost F1 Score (Macro) :", ada_f1)
# 5. Classification report
print("\n Classification Report (AdaBoost):\n")
print(classification_report(y_test, y_pred_ada))
# 6. Confusion Matrix
plt.figure(figsize=(8,6))
sns.heatmap(confusion_matrix(y_test, y_pred_ada), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - AdaBoost")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.tight_layout()
plt.show()
AdaBoost Accuracy : 0.44680851063829785
AdaBoost Precision (Macro): 0.43169859536552835
AdaBoost Recall (Macro) : 0.4636706023405531
AdaBoost F1 Score (Macro) : 0.43698206970744075

Classification Report (AdaBoost):

              precision    recall  f1-score   support

           0       0.71      0.85      0.77        54
           1       0.61      0.47      0.53        58
           2       0.16      0.27      0.20        70
           3       0.51      0.45      0.48        60
           4       0.00      0.00      0.00        65
           5       0.60      0.48      0.53        58
           6       0.43      0.72      0.54        58

    accuracy                           0.45       423
   macro avg       0.43      0.46      0.44       423
weighted avg       0.41      0.45      0.42       423
# Cross-validation accuracy
cv_scores_ada = cross_val_score(ada_model, X_train, y_train, cv=5, scoring='accuracy')
# Log loss
ada_logloss = log_loss(y_test, ada_model.predict_proba(X_test))
# Kappa score
ada_kappa = cohen_kappa_score(y_test, y_pred_ada)
# Matthews Correlation Coefficient
ada_mcc = matthews_corrcoef(y_test, y_pred_ada)
# Hamming Loss
ada_hamming = hamming_loss(y_test, y_pred_ada)
# Jaccard Score (for multi-class, use average='macro')
ada_jaccard = jaccard_score(y_test, y_pred_ada, average='macro')
# Print them
print("\nAdvanced Evaluation Metrics for AdaBoost:")
print("Cross-Validation Scores:", cv_scores_ada)
print("Mean CV Accuracy :", cv_scores_ada.mean())
print("Log Loss :", ada_logloss)
print("Cohen's Kappa Score :", ada_kappa)
print("Matthews Corr. Coeff. :", ada_mcc)
print("Hamming Loss :", ada_hamming)
print("Jaccard Score (Macro) :", ada_jaccard)
Advanced Evaluation Metrics for AdaBoost:
Cross-Validation Scores: [0.4408284  0.36094675 0.50591716 0.51335312 0.33827893]
Mean CV Accuracy : 0.43186487103401044
Log Loss : 1.9233342693918771
Cohen's Kappa Score : 0.3536713572104868
Matthews Corr. Coeff. : 0.3638453064151737
Hamming Loss : 0.5531914893617021
Jaccard Score (Macro) : 0.30750049890489545
AdaBoost Classifier – Model Observation and Evaluation¶
In this project, the AdaBoost Classifier was applied as part of a broader model comparison framework to predict obesity levels (NObeyesdad). AdaBoost is an ensemble learning algorithm that sequentially combines multiple weak learners (typically decision stumps) into a stronger predictive model. It was tested alongside Logistic Regression, SVM, KNN, Neural Network, and Decision Tree to evaluate comparative performance.
Why AdaBoost Was Used:¶
- Ensemble Strength: Combines multiple weak models to improve performance.
- Error Focused: Emphasizes misclassified observations in each iteration.
- Interpretability: Often easier to understand than black-box models like deep learning.
- Versatile: Can be adapted for multi-class classification.
To assess the AdaBoost model more thoroughly, several advanced evaluation metrics were used. Each offers a different perspective on how well the model performs across different classes and data distributions.
1. Cross-Validation Score¶
Definition: Cross-validation splits the training data into k folds and trains the model on (k-1) folds while validating on the remaining fold. This is repeated k times.
Purpose: To check how consistently the model performs on different subsets of the training data. A good model should show stable and high cross-validation scores across all folds.
2. Log Loss (Logarithmic Loss)¶
Definition: Measures the uncertainty of the model’s predictions. If the model is confident and wrong, log loss penalizes it heavily.
Purpose: A lower log loss means better probability estimates.
In this project, AdaBoost's log loss was relatively high, indicating poor confidence in predictions.
3. Cohen’s Kappa Score¶
Definition: Measures the level of agreement between predicted and actual classes, adjusted for chance agreement.
Interpretation:
- 1.0 = perfect agreement
- 0.0 = no agreement better than chance
- < 0 = worse than random guessing
Purpose: Useful when evaluating multi-class classification with potential imbalance.
4. Matthews Correlation Coefficient (MCC)¶
Definition: A balanced measure that considers true and false positives/negatives. Ranges from -1 (worst) to +1 (perfect prediction).
Why it's valuable: Works well even if the classes are imbalanced.
In this project, MCC helps give a more reliable evaluation than accuracy alone.
5. Hamming Loss¶
Definition: Measures the fraction of labels that are incorrectly predicted.
Purpose: Lower is better; a value of 0 means perfect classification. In this project, the AdaBoost model had a high Hamming loss (0.553), showing many incorrect predictions.
6. Jaccard Score¶
Definition: Measures the similarity between the predicted and actual labels.
Calculated as the intersection divided by the union of actual and predicted labels.
Purpose: Jaccard score is especially useful for multi-label or multi-class classification problems.
A higher Jaccard Score indicates better overlap between predicted and actual classes.
Summary¶
These advanced metrics provide a deeper understanding of AdaBoost’s performance beyond simple accuracy. They help detect:
- Class imbalance issues (via Kappa, MCC)
- Prediction confidence issues (via Log Loss)
- Misclassification rate (via Hamming Loss)
- Label overlap and agreement (via Jaccard Score)
Including these metrics ensures the evaluation is well-rounded, robust, and fair, especially when comparing multiple models.
AdaBoost Performance Summary:¶
Metric | Value |
---|---|
Accuracy | 0.4468 |
Precision (Macro Avg) | 0.4317 |
Recall (Macro Avg) | 0.4637 |
F1 Score (Macro Avg) | 0.4370 |
Cross-Validation Mean Accuracy | 0.4318 |
Log Loss | 1.9233 |
Cohen’s Kappa Score | 0.3537 |
Matthews Corr. Coefficient | 0.3638 |
Hamming Loss | 0.5531 |
Jaccard Score (Macro) | 0.3075 |
Insights:¶
- The overall accuracy (44.68%) is relatively low compared to other models.
- Class 4 was not predicted correctly at all (precision and recall of 0.00 on 65 test samples); since the classes are balanced, this points to weak generalization by the stump-based ensemble rather than class imbalance.
- Moderate precision and recall suggest that AdaBoost struggled with certain categories.
- The log loss of 1.92 shows poor probability calibration.
- Cohen’s Kappa (0.35) and MCC (0.36) indicate fair but not strong agreement.
- A hamming loss of 55% implies a high misclassification rate.
- The Jaccard score (0.31) further confirms minimal overlap between predicted and true labels.
Final Observation:¶
While AdaBoost is a robust ensemble method, in this project it did not perform as well as other models like Decision Tree or Neural Network.
The results suggest that AdaBoost is less suitable for this dataset, likely because its default depth-1 decision stumps are too weak for a 7-class problem and because of its sensitivity to noisy observations; class imbalance is an unlikely cause, as the classes here are roughly balanced.
Nonetheless, its inclusion provides valuable comparative insight, helping confirm that different models perform better under different data distributions.
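One plausible remedy, sketched below on the assumption that the default depth-1 stumps are the bottleneck, is to boost slightly deeper trees so each round can model more than a single split (in scikit-learn >= 1.2 the argument is named estimator; older versions call it base_estimator):
# Depth-3 trees give each boosting round more expressive power than stumps
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
ada_deep = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=3),
                              n_estimators=100, random_state=42)
ada_deep.fit(X_train, y_train)
print("AdaBoost (depth-3 base) test accuracy:",
      accuracy_score(y_test, ada_deep.predict(X_test)))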
8. XGBoost (Extreme Gradient Boosting)¶
# Import required libraries
from xgboost import XGBClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, log_loss, cohen_kappa_score,
matthews_corrcoef, hamming_loss, jaccard_score
)
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Initialize and train the XGBoost model
# Note: use_label_encoder is deprecated in newer xgboost releases; it is kept
# here for compatibility with the version used in this report.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
# 2. Predict on test data
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)
# 3. Basic Evaluation Metrics
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_precision = precision_score(y_test, y_pred_xgb, average='macro', zero_division=0)
xgb_recall = recall_score(y_test, y_pred_xgb, average='macro', zero_division=0)
xgb_f1 = f1_score(y_test, y_pred_xgb, average='macro', zero_division=0)
print("XGBoost Model Evaluation")
print("Accuracy :", xgb_accuracy)
print("Precision (Macro):", xgb_precision)
print("Recall (Macro) :", xgb_recall)
print("F1 Score (Macro) :", xgb_f1)
# 4. Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_xgb))
# 5. Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred_xgb), annot=True, fmt='d', cmap='YlGnBu')
plt.title("Confusion Matrix - XGBoost")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()
# 6. Advanced Evaluation Metrics
cv_scores_xgb = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='accuracy')
xgb_logloss = log_loss(y_test, y_prob_xgb)
xgb_kappa = cohen_kappa_score(y_test, y_pred_xgb)
xgb_mcc = matthews_corrcoef(y_test, y_pred_xgb)
xgb_hamming = hamming_loss(y_test, y_pred_xgb)
xgb_jaccard = jaccard_score(y_test, y_pred_xgb, average='macro')
print("\n Advanced Metrics:")
print("Cross-Validation Scores:", cv_scores_xgb)
print("Mean CV Accuracy :", cv_scores_xgb.mean())
print("Log Loss :", xgb_logloss)
print("Cohen’s Kappa Score :", xgb_kappa)
print("Matthews Corr. Coeff. :", xgb_mcc)
print("Hamming Loss :", xgb_hamming)
print("Jaccard Score (Macro) :", xgb_jaccard)
XGBoost Model Evaluation
Accuracy : 0.9621749408983451
Precision (Macro): 0.96388458602126
Recall (Macro) : 0.9604484823696646
F1 Score (Macro) : 0.9612253592905303

Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.91      0.93        54
           1       0.85      0.97      0.90        58
           2       0.97      0.99      0.98        70
           3       0.98      0.98      0.98        60
           4       1.00      0.98      0.99        65
           5       1.00      0.91      0.95        58
           6       0.98      0.98      0.98        58

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.96      0.96      0.96       423
Advanced Metrics:
Cross-Validation Scores: [0.96153846 0.9556213  0.97928994 0.97329377 0.94362018]
Mean CV Accuracy : 0.9626727301459098
Log Loss : 0.14251816673749076
Cohen’s Kappa Score : 0.9558163978090992
Matthews Corr. Coeff. : 0.9561285844910393
Hamming Loss : 0.037825059101654845
Jaccard Score (Macro) : 0.9269408632615789
Observation:¶
After training and evaluating the XGBoost model on the Obesity dataset, here are the key observations:
Overall Performance¶
- Accuracy: The model achieved a high accuracy of 96.21%, which means it predicted the correct obesity level for most of the individuals in the test set.
- Macro F1 Score: The macro-averaged F1 score is 0.9612, which indicates a strong balance between precision and recall across all classes, regardless of class imbalance.
- Precision & Recall: Both are consistently high (around 0.96), showing that the model performs well across different obesity levels.
Classification Report Insights¶
- Class-wise F1 Scores: Each class (0 to 6) has F1 scores ranging from 0.90 to 0.99, which shows that the model is robust and handles all classes fairly well.
- For instance:
- Class 1 has slightly lower precision (0.85) but high recall (0.97), meaning it sometimes mislabels others as class 1, but rarely misses actual class 1 instances.
- Class 4 and Class 5 are nearly perfect in prediction.
Cross-Validation & Robustness¶
- Cross-Validation Accuracy: The average accuracy from 5-fold cross-validation is 96.26%, confirming that the model is consistent and generalizes well to unseen data.
Advanced Evaluation Metrics¶
- Log Loss: Low value (0.1425) means the model is confident in its probability predictions.
- Cohen’s Kappa Score: 0.9558 indicates excellent agreement between actual and predicted labels.
- Matthews Correlation Coefficient (MCC): 0.9561 further confirms strong correlation between predicted and true classes.
- Hamming Loss: 0.0378 indicates very few incorrect label predictions.
- Jaccard Score (Macro): 0.927 is high, showing good overlap between predicted and actual labels.
XGBoost has shown excellent performance on the Obesity dataset across all major evaluation metrics. It not only achieves high accuracy but also maintains balanced performance for all classes. Based on this, XGBoost can be considered one of the best models for predicting obesity levels in this project.
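As with Random Forest, the fitted booster's built-in feature importances can be inspected as a cross-check on the earlier ranking. A minimal sketch, assuming xgb_model and X from above:
# Importance scores as exposed by the scikit-learn wrapper of XGBoost
import pandas as pd
xgb_importance = pd.Series(xgb_model.feature_importances_, index=X.columns)
print("Top 5 XGBoost features:\n",
      xgb_importance.sort_values(ascending=False).head())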
from sklearn.model_selection import GridSearchCV
# 1. Define the parameter grid for XGBoost
param_grid_xgb = {
'n_estimators': [100, 200],
'max_depth': [3, 5, 7],
'learning_rate': [0.01, 0.1, 0.2],
'subsample': [0.8, 1.0],
'colsample_bytree': [0.8, 1.0]
}
# 2. Wrap XGBClassifier in GridSearchCV, optimizing Macro-F1
grid_xgb = GridSearchCV(
estimator=XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
param_grid=param_grid_xgb,
scoring='f1_macro',
cv=5,
n_jobs=-1,
verbose=1
)
# 3. Fit on training data
grid_xgb.fit(X_train, y_train)
# 4. Best hyperparameters and CV Macro-F1
print("Best XGBoost Parameters:", grid_xgb.best_params_)
print("Best CV Macro-F1 (XGB):", grid_xgb.best_score_)
# 5. Evaluate the tuned XGBoost on the test set
best_xgb = grid_xgb.best_estimator_
y_pred_xgb_tuned = best_xgb.predict(X_test)
# Basic metrics including tuned F1
xgb_accuracy_tuned = accuracy_score(y_test, y_pred_xgb_tuned)
xgb_f1_tuned = f1_score(y_test, y_pred_xgb_tuned, average='macro')
print("\nTuned XGBoost Test Accuracy :", xgb_accuracy_tuned)
print("Tuned XGBoost Test Macro F1-Score :", xgb_f1_tuned)
# (Optional) Updated classification report & confusion matrix
print("\nClassification Report (Tuned XGB):\n", classification_report(y_test, y_pred_xgb_tuned))
cm_xgb_tuned = confusion_matrix(y_test, y_pred_xgb_tuned)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_xgb_tuned, annot=True, fmt='d', cmap='YlGnBu')
plt.title("Confusion Matrix - Tuned XGBoost")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best XGBoost Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Best CV Macro-F1 (XGB): 0.9708284421412771

Tuned XGBoost Test Accuracy : 0.9527186761229315
Tuned XGBoost Test Macro F1-Score : 0.9517625021913382

Classification Report (Tuned XGB):
              precision    recall  f1-score   support

           0       0.96      0.91      0.93        54
           1       0.85      0.95      0.89        58
           2       0.96      0.97      0.96        70
           3       0.98      0.98      0.98        60
           4       1.00      0.98      0.99        65
           5       0.96      0.91      0.94        58
           6       0.96      0.95      0.96        58

    accuracy                           0.95       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.95      0.95      0.95       423
Model Summary Table¶
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
# Collect results from all models
results_final = [
{
"Model": "Logistic Regression",
"Accuracy": accuracy_score(y_test, y_pred_lr),
"Precision (Macro)": precision_score(y_test, y_pred_lr, average='macro', zero_division=0),
"Recall (Macro)": recall_score(y_test, y_pred_lr, average='macro', zero_division=0),
"F1 Score (Macro)": f1_score(y_test, y_pred_lr, average='macro', zero_division=0)
},
{
"Model": "KNN",
"Accuracy": accuracy_score(y_test, y_pred_knn),
"Precision (Macro)": precision_score(y_test, y_pred_knn, average='macro', zero_division=0),
"Recall (Macro)": recall_score(y_test, y_pred_knn, average='macro', zero_division=0),
"F1 Score (Macro)": f1_score(y_test, y_pred_knn, average='macro', zero_division=0)
},
{
"Model": "Random Forest",
"Accuracy": accuracy_score(y_test, y_pred_rf),
"Precision (Macro)": precision_score(y_test, y_pred_rf, average='macro', zero_division=0),
"Recall (Macro)": recall_score(y_test, y_pred_rf, average='macro', zero_division=0),
"F1 Score (Macro)": f1_score(y_test, y_pred_rf, average='macro', zero_division=0)
},
{
"Model": "SVM",
"Accuracy": accuracy_score(y_test, y_pred_svm),
"Precision (Macro)": precision_score(y_test, y_pred_svm, average='macro', zero_division=0),
"Recall (Macro)": recall_score(y_test, y_pred_svm, average='macro', zero_division=0),
"F1 Score (Macro)": f1_score(y_test, y_pred_svm, average='macro', zero_division=0)
},
{
"Model": "Decision Tree",
"Accuracy": accuracy_score(y_test, y_pred_dt),
"Precision (Macro)": precision_score(y_test, y_pred_dt, average='macro', zero_division=0),
"Recall (Macro)": recall_score(y_test, y_pred_dt, average='macro', zero_division=0),
"F1 Score (Macro)": f1_score(y_test, y_pred_dt, average='macro', zero_division=0)
},
{
"Model": "Neural Network (MLP)",
"Accuracy": accuracy_score(y_test, y_pred_nn),
"Precision (Macro)": precision_score(y_test, y_pred_nn, average='macro', zero_division=0),
"Recall (Macro)": recall_score(y_test, y_pred_nn, average='macro', zero_division=0),
"F1 Score (Macro)": f1_score(y_test, y_pred_nn, average='macro', zero_division=0)
},
{
"Model": "AdaBoost",
"Accuracy": accuracy_score(y_test, y_pred_ada),
"Precision (Macro)": precision_score(y_test, y_pred_ada, average='macro', zero_division=0),
"Recall (Macro)": recall_score(y_test, y_pred_ada, average='macro', zero_division=0),
"F1 Score (Macro)": f1_score(y_test, y_pred_ada, average='macro', zero_division=0)
},
{
"Model": "XGBoost",
"Accuracy": accuracy_score(y_test, y_pred_xgb),
"Precision (Macro)": precision_score(y_test, y_pred_xgb, average='macro', zero_division=0),
"Recall (Macro)": recall_score(y_test, y_pred_xgb, average='macro', zero_division=0),
"F1 Score (Macro)": f1_score(y_test, y_pred_xgb, average='macro', zero_division=0)
}
]
# Create and display DataFrame
results_df = pd.DataFrame(results_final).sort_values(by="F1 Score (Macro)", ascending=False)
display(results_df.round(3))
 | Model | Accuracy | Precision (Macro) | Recall (Macro) | F1 Score (Macro) |
---|---|---|---|---|---
7 | XGBoost | 0.962 | 0.964 | 0.960 | 0.961 |
5 | Neural Network (MLP) | 0.957 | 0.956 | 0.956 | 0.956 |
2 | Random Forest | 0.953 | 0.959 | 0.952 | 0.953 |
3 | SVM | 0.927 | 0.928 | 0.924 | 0.925 |
4 | Decision Tree | 0.905 | 0.907 | 0.903 | 0.904 |
0 | Logistic Regression | 0.879 | 0.876 | 0.877 | 0.876 |
1 | KNN | 0.837 | 0.835 | 0.831 | 0.825 |
6 | AdaBoost | 0.447 | 0.432 | 0.464 | 0.437 |
Model Comparison Plot¶
# Bar Plot: Accuracy and F1 Score
results_df.set_index("Model")[["Accuracy", "F1 Score (Macro)"]].plot(
kind='bar', figsize=(10, 6), colormap='Set2', edgecolor='black'
)
plt.title("Model Performance Comparison")
plt.ylabel("Score")
plt.ylim(0, 1.05)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Summary Insights:¶
- XGBoost is the best-performing model before tuning, with the highest Accuracy and Macro F1 Score.
- Neural Network (MLP) and Random Forest are close behind, suggesting strong generalization.
- SVM, Decision Tree, Logistic Regression, and KNN performed moderately and may benefit significantly from hyperparameter tuning.
- AdaBoost underperformed across all metrics and would require serious tuning, or may simply not be suitable for this dataset.
10. Model Evaluation¶
Hyperparameter Tuning and Model Optimization using GridSearchCV¶
# 📦 Imports
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
# 1) Define all models & their hyperparameter grids
model_grids = {
"Logistic Regression": {
"model": LogisticRegression(max_iter=1000, random_state=42),
"params": {
"model__C": [0.01, 0.1, 1, 10],
"model__penalty": ["l2"],
"model__solver": ["lbfgs"]
}
},
"KNN": {
"model": KNeighborsClassifier(),
"params": {
"model__n_neighbors": [3,5,7,9],
"model__weights": ["uniform","distance"],
"model__metric": ["minkowski","euclidean"]
}
},
"Decision Tree": {
"model": DecisionTreeClassifier(random_state=42),
"params": {
"model__max_depth": [None, 5, 10, 20],
"model__min_samples_split": [2, 5, 10]
}
},
"Random Forest": {
"model": RandomForestClassifier(random_state=42),
"params": {
"model__n_estimators": [100,200,300],
"model__max_depth": [None,5,10],
"model__min_samples_split": [2,5,10]
}
},
"AdaBoost": {
"model": AdaBoostClassifier(random_state=42),
"params": {
"model__n_estimators": [50,100,200],
"model__learning_rate": [0.5,1.0,1.5]
}
},
"SVM": {
"model": SVC(probability=True, random_state=42),
"params": {
"model__C": [0.1,1,10],
"model__kernel": ["rbf","linear"],
"model__gamma": ["scale","auto"]
}
},
"XGBoost": {
"model": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
"params": {
"model__n_estimators": [100,200],
"model__max_depth": [3,5,7],
"model__learning_rate": [0.01,0.1,0.2]
}
}
}
# 2) Prepare a placeholder for results
results = []
# 3) Loop through each model, perform GridSearchCV, evaluate test F1
for name, config in model_grids.items():
pipe = Pipeline([("model", config["model"])])
grid = GridSearchCV(
estimator=pipe,
param_grid=config["params"],
scoring="f1_macro",
cv=5,
n_jobs=-1,
verbose=0
)
grid.fit(X_train, y_train)
# CV best
best_cv_f1 = grid.best_score_
best_params = grid.best_params_
# Test set evaluation
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average="macro")
# Collect
results.append({
"Model": name,
"Best CV Macro-F1": best_cv_f1,
"Test Accuracy": test_accuracy,
"Test Macro-F1": test_f1,
"Best Params": best_params
})
# 4) Create a summary DataFrame
results_df = pd.DataFrame(results).sort_values("Test Macro-F1", ascending=False)
display(results_df.round(3))
# After you have 'results_df' as shown above:
# 5) Identify the best model by Test Macro-F1
best_idx = results_df['Test Macro-F1'].idxmax()
best_model_info = results_df.loc[best_idx]
print(f"🎉 Best Model: {best_model_info['Model']}")
print(f"Test Macro-F1: {best_model_info['Test Macro-F1']:.3f}")
print(f"Test Accuracy : {best_model_info['Test Accuracy']:.3f}")
print("Best Hyperparameters:", best_model_info['Best Params'])
 | Model | Best CV Macro-F1 | Test Accuracy | Test Macro-F1 | Best Params |
---|---|---|---|---|---
5 | SVM | 0.957 | 0.962 | 0.962 | {'model__C': 10, 'model__gamma': 'scale', 'mod... |
6 | XGBoost | 0.966 | 0.953 | 0.951 | {'model__learning_rate': 0.2, 'model__max_dept... |
3 | Random Forest | 0.947 | 0.948 | 0.948 | {'model__max_depth': None, 'model__min_samples... |
0 | Logistic Regression | 0.935 | 0.929 | 0.927 | {'model__C': 10, 'model__penalty': 'l2', 'mode... |
2 | Decision Tree | 0.933 | 0.905 | 0.904 | {'model__max_depth': None, 'model__min_samples... |
1 | KNN | 0.829 | 0.868 | 0.853 | {'model__metric': 'minkowski', 'model__n_neigh... |
4 | AdaBoost | 0.414 | 0.414 | 0.346 | {'model__learning_rate': 0.5, 'model__n_estima... |
🎉 Best Model: SVM
Test Macro-F1: 0.962
Test Accuracy : 0.962
Best Hyperparameters: {'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'linear'}
Observation After Hyperparameter Tuning¶
After applying hyperparameter tuning with GridSearchCV to multiple machine learning models, using 5-fold cross-validation and optimizing for the Macro F1 Score, we evaluated each model's performance on the Obesity dataset. The objective was to find the model that predicts obesity levels most accurately and fairly across all classes.
Hyperparameter tuning helps unlock a model's full potential by systematically searching for the settings that improve performance while reducing the risk of overfitting.
Here is the summary table of results after tuning:
Model | Best CV Macro-F1 | Test Accuracy | Test Macro-F1 | Best Parameters |
---|---|---|---|---|
SVM | 0.957 | 0.962 | 0.962 | {'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'linear'} |
XGBoost | 0.966 | 0.953 | 0.951 | {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 100} |
Random Forest | 0.947 | 0.948 | 0.948 | {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200} |
Logistic Regression | 0.935 | 0.929 | 0.927 | {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'lbfgs'} |
Decision Tree | 0.933 | 0.905 | 0.904 | {'model__max_depth': None, 'model__min_samples_split': 2} |
K-Nearest Neighbors | 0.829 | 0.868 | 0.853 | {'model__metric': 'minkowski', 'model__n_neighbors': 3, 'model__weights': 'distance'} |
AdaBoost | 0.414 | 0.414 | 0.346 | {'model__learning_rate': 0.5, 'model__n_estimators': 50} |
Best Model: Support Vector Machine (SVM)¶
The SVM model emerged as the best model based on test performance:
- Test Accuracy: 0.962
- Test Macro-F1 Score: 0.962
- Best Hyperparameters: `C=10`, `kernel='linear'`, `gamma='scale'`

Although XGBoost achieved a slightly higher cross-validation Macro-F1 score (0.966), the SVM model outperformed it on the test set, which is the more critical measure of real-world performance.
Why Macro F1 Score Was Chosen¶
In classification problems, especially multi-class classification like this one (predicting different obesity levels), using just accuracy is not sufficient. Here's why we focused on Macro F1 Score:
- Accuracy only tells us the overall proportion of correct predictions, but it does not reflect performance across individual classes.
- F1 Score is the harmonic mean of Precision and Recall. It balances the trade-off between False Positives and False Negatives.
- Macro F1 Score calculates F1 independently for each class and takes the unweighted average, thus treating all classes equally, regardless of their frequency in the dataset.
- This is especially useful when dealing with imbalanced datasets, where some classes might be underrepresented.
Hence, Macro-F1 was the best metric to compare models fairly across all obesity categories.
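To make this concrete, the macro F1 can be reproduced by hand: compute F1 separately for each class, then take the plain unweighted mean (a minimal sketch, using the tuned XGBoost predictions from earlier as the example):
import numpy as np
from sklearn.metrics import f1_score

# One F1 score per obesity class, regardless of how many samples each class has
per_class_f1 = f1_score(y_test, y_pred_xgb_tuned, average=None)
print("Per-class F1:", np.round(per_class_f1, 3))
# Macro F1 is the simple mean, so rare classes count exactly as much as common ones
print("Macro F1    :", per_class_f1.mean())  # matches f1_score(..., average='macro')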
Additional Insights¶
- XGBoost had the best CV score, but slightly lower test F1, possibly due to overfitting or variance on unseen data.
- Random Forest also performed consistently and could be a strong backup option.
- KNN and Logistic Regression were decent but lacked the generalization of SVM.
- AdaBoost failed to adapt well to this problem, indicating that it might not be suitable for complex, multi-class tasks in this dataset.
Final Decision¶
- SVM is selected as the best-performing model after tuning based on test set F1 score and accuracy.
- XGBoost was very competitive but slightly underperformed on test data compared to SVM.
- AdaBoost performed the worst and is not suitable for this classification task.
This tuning and comparison process helped us objectively select the most balanced and generalizable model for predicting obesity levels accurately across all classes. By carefully preprocessing data, testing multiple models, and tuning them based on macro-level performance, we found that SVM is the most reliable and generalizable model for obesity level classification in this case.
Why Did the Support Vector Machine (SVM) Perform Best?¶
The Support Vector Machine (SVM) model gave the highest test Macro F1-Score of 0.962 and Test Accuracy of 0.962 after tuning. Here's why SVM performed best:
- Optimal Hyperparameters: after tuning, the best parameters were `C = 10` (a stronger penalty on misclassification, trading a wider margin for a closer fit to the training classes), `kernel = 'linear'` (works well in high-dimensional feature space), and `gamma = 'scale'` (sklearn's default; gamma has no effect with a linear kernel).
- Strong Generalization: SVM maximizes the margin between classes, helping it generalize better on unseen data, and it handles multi-class problems well using the "one-vs-one" strategy internally.
- Feature Scaling and Linearity: since the dataset was preprocessed with scaling, and the decision boundaries appear to be linearly separable (or close to it), SVM with a linear kernel gave highly accurate predictions.
- Balanced Class Performance: SVM performed consistently well across all obesity levels, which is reflected in the Macro F1-Score.
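Because the linear kernel exposes its decision boundaries directly, the one-vs-one structure described above can be verified with a short sketch (`svm_linear` is a hypothetical name using the tuned parameters; `gamma` is omitted because it does not affect a linear kernel):
from sklearn.svm import SVC

# SVC handles multi-class internally via one-vs-one: one binary classifier per
# pair of classes, so 7 obesity levels give 7 * 6 / 2 = 21 pairwise classifiers.
svm_linear = SVC(kernel='linear', C=10, random_state=42)
svm_linear.fit(X_train, y_train)
print("Pairwise classifiers:", svm_linear.coef_.shape[0])  # 21
print("Per-feature weights of the first pair:", svm_linear.coef_[0].round(3))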
Multi-Class ROC Curve (SVM)¶
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
# Binarize the target (for multi-class ROC)
y_test_bin = label_binarize(y_test, classes=np.unique(y))
n_classes = y_test_bin.shape[1]
# Train One-vs-Rest SVM classifier with probability
ovr_svm = OneVsRestClassifier(SVC(kernel='linear', C=10, probability=True, random_state=42))
ovr_svm.fit(X_train, y_train)
y_score = ovr_svm.predict_proba(X_test)
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Plot all ROC curves
plt.figure(figsize=(10, 8))
colors = plt.get_cmap('tab10', n_classes)  # plt.cm.get_cmap is deprecated since Matplotlib 3.7
for i in range(n_classes):
plt.plot(fpr[i], tpr[i], lw=2, label='Class {} (AUC = {:.2f})'.format(i, roc_auc[i]), color=colors(i))
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-Class ROC Curve - SVM')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
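As a single summary number for the curves above, the macro-averaged one-vs-rest AUC can be computed from the same probability scores (a short sketch reusing y_test_bin and y_score from the cell above):
# Macro-averaged AUC across all 7 one-vs-rest problems
macro_auc = roc_auc_score(y_test_bin, y_score, average='macro')
print("Macro OvR AUC (SVM):", round(macro_auc, 3))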
We plot the ROC Curve for SVM, our best-performing model, to visualize how well it distinguishes between each obesity level. This curve provides insight into the model's ability to correctly classify each class across various thresholds and supports our selection of SVM based on its high F1 Score and accuracy.
📈 Observation: Multi-Class ROC Curve - SVM¶
The above ROC Curve shows the performance of the Support Vector Machine (SVM) model across all 7 obesity classes using a multi-class classification setup.
- Each colored line in the ROC curve represents a different class (Class 0 to Class 6).
- The Area Under the Curve (AUC) is a key metric to evaluate the model's ability to distinguish between classes. Higher AUC values (closer to 1) indicate better performance.
- Class 0, Class 3, and Class 4 achieved a perfect AUC score of 1.00, indicating that the model is able to perfectly distinguish these classes from the others.
- Class 1, Class 2, and Class 5 also performed well with AUCs around 0.86–0.89, showing good discriminatory power.
- Class 6 had the lowest AUC (0.76), meaning the model had some difficulty distinguishing this class, but it is still performing reasonably.
Conclusion:¶
- The ROC curve provides visual evidence that the SVM model is capable of effectively classifying most obesity categories.
- Since the SVM model also achieved the highest Macro F1 Score (0.962) among all tuned models, this further validates it as the best-performing model for our project.
- The ROC curve helps us better understand the class-wise prediction quality, especially important in imbalanced or multi-class datasets.
11. Feature Importance¶
Feature Importance of SVM¶
Feature Importance of SVM via Permutation Importance¶
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
# Compute permutation importance
result = permutation_importance(
svm_model, X_test, y_test, n_repeats=10, random_state=42
)
# Plot
sorted_idx = result.importances_mean.argsort()
plt.figure(figsize=(10, 6))
plt.boxplot(
result.importances[sorted_idx].T,
vert=False,
tick_labels=X_test.columns[sorted_idx]  # renamed from 'labels' in Matplotlib 3.9
)
plt.title("Permutation Importance (SVM)")
plt.show()
The graph shows a boxplot of feature-importance scores computed by permutation. Each horizontal box represents the distribution of the drop in model performance when that feature was randomly shuffled (permuted). Features are sorted from most important (top) to least important (bottom).
Key Observations:
- Top 3 most important features:
  - Weight has the highest impact on prediction accuracy: shuffling it significantly degrades performance, meaning the SVM relies heavily on it.
  - Height also has a high impact, which makes sense for obesity prediction (BMI depends on height and weight).
  - CH2O (daily water intake) is surprisingly impactful, suggesting hydration may be a strong indicator of obesity level in this dataset.
- Medium-importance features: Gender, Age, and NCP (number of main meals) have moderate impact; family_history_with_overweight, CALC_Sometimes, CAEC_Frequently, and FAVC (frequent consumption of high-calorie food) add predictive value, but less than the top 3.
- Least important features (near 0): MTRANS_Automobile, TUE (technology use), SCC, and CAEC_Sometimes have very little impact; the model performs the same even when these are shuffled. They could be considered redundant or noisy.
“Permutation Importance using SVM revealed that Weight, Height, and Water Intake (CH2O) are the most influential features in predicting obesity levels. These results align with domain expectations. Less informative features include various transportation modes and screen time, which had negligible impact on model performance.”
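The mechanism behind permutation importance is simple enough to sketch by hand (a minimal illustration reusing svm_model and X_test from above; Weight is chosen only because it is the clearest example): shuffle one feature, re-score the model, and read the accuracy drop as that feature's importance.
import numpy as np
from sklearn.metrics import accuracy_score

baseline = accuracy_score(y_test, svm_model.predict(X_test))
X_perm = X_test.copy()
# Shuffling a column destroys its relationship with the target
# while leaving its marginal distribution intact
X_perm['Weight'] = np.random.default_rng(42).permutation(X_perm['Weight'].values)
drop = baseline - accuracy_score(y_test, svm_model.predict(X_perm))
print(f"Accuracy drop after shuffling 'Weight': {drop:.3f}")  # large drop = important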
Feature Importance for XGBOOST¶
1. XGBoost’s Built-in Feature Importance method¶
import xgboost as xgb
import matplotlib.pyplot as plt
# Train XGBoost with your best params
model_xgb = xgb.XGBClassifier(
colsample_bytree=0.8,
learning_rate=0.2,
max_depth=5,
n_estimators=200,
subsample=1.0,
random_state=42
)
model_xgb.fit(X_train, y_train)
# Plot importance (default: 'weight' = number of times a feature is used in splits)
xgb.plot_importance(model_xgb, max_num_features=10)
plt.show()
2. By Permutation Importance method¶
from sklearn.inspection import permutation_importance
result = permutation_importance(
model_xgb, X_test, y_test, n_repeats=10, random_state=42
)
# Plot
sorted_idx = result.importances_mean.argsort()
plt.figure(figsize=(10, 6))
plt.barh(
range(len(sorted_idx)),
result.importances_mean[sorted_idx],
xerr=result.importances_std[sorted_idx],
align='center'
)
plt.yticks(range(len(sorted_idx)), X_test.columns[sorted_idx])
plt.title("Permutation Importance (XGBoost)")
plt.show()
Conclusion:¶
“We evaluated feature importance in XGBoost using two methods: the built-in split-based scoring and permutation importance. Both consistently identified Weight, Height, and Age as the most important predictors of obesity. This agreement across techniques strengthens the reliability of our model's interpretation.”¶
Model Selection¶
Model Selection Summary: SVM vs XGBoost¶
In this project on Obesity Level Prediction, we evaluated both SVM and XGBoost after hyperparameter tuning, with selection based on Test F1-Score.
Final Model Performance (After Tuning)¶
Metric | SVM (Linear) | XGBoost |
---|---|---|
F1-Score (Test) | Highest | Slightly lower |
Accuracy | High | Comparable |
ROC-AUC | High | Slightly lower |
Conclusion: SVM achieved the best F1-score, so it was selected as the final model.
Feature Importance Comparison¶
Aspect | SVM (Linear) | XGBoost |
---|---|---|
Method Used | Permutation Importance | SHAP, Gain, Cover, Permutation |
Local Explanations | Not available | SHAP Summary & Force plots |
Speed of Interpretation | Slower (with SHAP) | Faster and more detailed |
Ranking Consistency | Reasonable | Clear & Reliable |
- For SVM, we used `permutation_importance` from `sklearn.inspection`.
- For XGBoost, we used both `xgb.feature_importances_` and `shap.summary_plot()` (a sketch of the SHAP call follows below).
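The SHAP call itself does not appear elsewhere in this report; a minimal sketch of how it could look (assumptions: the `shap` package is installed, and `model_xgb` is the fitted classifier from the feature-importance section):
import shap

# TreeExplainer is the fast, exact explainer for tree ensembles such as XGBoost
explainer = shap.TreeExplainer(model_xgb)
shap_values = explainer.shap_values(X_test)  # one attribution array per class
# Global view: mean |SHAP value| per feature, aggregated across the 7 classes
shap.summary_plot(shap_values, X_test, plot_type='bar')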
Interpretation Strategy¶
- SVM was used for final prediction due to best F1-score.
- XGBoost was used for interpreting feature importance clearly.
Final Decision¶
"We adopted a hybrid strategy:
SVM for its superior F1-score and predictive performance.
XGBoost for its rich feature interpretation capabilities.
This ensures both performance and explainability."
Hybrid Model Strategy: Combining SVM and XGBoost¶
SVM for Final Predictions because :-¶
After performing hyperparameter tuning and cross-validation on multiple models (SVM, Random Forest, XGBoost, AdaBoost, etc.), we selected the best-performing model based on the Test F1-score, which is especially suitable for multi-class classification with imbalanced data.
- SVM with a linear kernel achieved the highest F1-score on the test set.
- It showed stable performance across all obesity categories (e.g., Normal, Obesity Type I, Overweight).
- SVM is particularly effective when:
- The number of features is large
- The data is clean and well-preprocessed
- A decision boundary is needed between classes
Thus, SVM was finalized as the core prediction model in our system.
XGBoost for Explainability :¶
While SVM delivers high performance, it offers little built-in interpretability. Beyond its raw coefficients, it does not natively provide feature-importance rankings or per-instance insights, which can be a limitation when:
- You want to know which features most influence predictions
- You want to explain model behavior in presentations or reports
- You need feature-level justification for health-related decisions
To solve this, we used XGBoost in parallel for interpretability:
- We also used `xgb.plot_importance()` for gain/weight-based importance.
- Permutation importance was computed on XGBoost as well, for a direct comparison with SVM.
This allowed us to confidently identify features like Weight, Height, CH2O, and Age as the most influential in predicting obesity level — consistent across models.
Benefits of the Hybrid Approach¶
Component | Role | Strength |
---|---|---|
SVM | Final prediction model | High F1-score, generalization |
XGBoost | Interpretation tool | Deep feature insights |
Advantages of this hybrid setup:
- High accuracy in predictions (via SVM)
- Strong interpretability (via XGBoost + SHAP)
- Data-driven insights into how lifestyle factors (e.g., calorie intake, activity) affect obesity levels
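A minimal sketch of how this hybrid setup could be organized in code (the class and variable names here are hypothetical, not part of the project's codebase; `best_svm` stands for the tuned linear SVC and `model_xgb` for the fitted XGBoost classifier):
class HybridObesityModel:
    """Predict with the tuned SVM; explain with the fitted XGBoost."""
    def __init__(self, predictor, explainer, feature_names):
        self.predictor = predictor              # e.g. the tuned linear SVC
        self.explainer = explainer              # e.g. the fitted XGBClassifier
        self.feature_names = list(feature_names)

    def predict(self, X):
        # All deployed predictions come from the higher-F1 SVM
        return self.predictor.predict(X)

    def feature_ranking(self):
        # Interpretability comes from XGBoost's split-based importances
        pairs = zip(self.feature_names, self.explainer.feature_importances_)
        return sorted(pairs, key=lambda p: p[1], reverse=True)

hybrid = HybridObesityModel(best_svm, model_xgb, X_train.columns)
print(hybrid.feature_ranking()[:5])  # the five most influential features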
Final Summary¶
"Rather than choosing between accuracy and explainability, we combined both.¶
The **SVM model is used in deployment for making predictions, while XGBoost serves to provide transparent, interpretable insights to support model decisions.**¶
This hybrid approach ensures that the system is both **effective and trustworthy.**¶
CONCLUSION (SUMMARY OF WORKFLOW)¶
1. Importing Essential Libraries and Data Loading¶
Loaded the raw data into the analysis environment for initial exploration, using libraries such as `pandas`, `numpy`, `matplotlib`, and `seaborn`.
Outcome: Successfully loaded the dataset and previewed its structure, size, and types of variables.
2. Checking for Missing Values (Part of Data Cleaning)¶
Objective: Identify and treat any missing values to ensure data consistency.
No missing values found; no imputation required.
3. Univariate EDA Before Outlier Treatment¶
Objective: Understand individual variable distributions and identify potential outliers.
Visualizations Used:
- Histograms
- Boxplots
Outcome: Detected unusual values in features like `Weight`, `Height`, and `Age`.
4. Data Cleaning¶
The objective of the data-cleaning step is to ensure data quality by removing inconsistencies, duplicates, and outliers.
Steps Taken:
- Verified and removed any duplicate entries
- Outlier Removal: Used DBSCAN (Density-Based Spatial Clustering of Applications with Noise) for unsupervised outlier detection.
Reason: DBSCAN is effective at detecting noise and separating dense clusters from outliers without requiring prior distributional assumptions.
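A condensed sketch of this step (the `eps` and `min_samples` values shown are illustrative defaults, not necessarily the ones used in the actual analysis):
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# DBSCAN is distance-based, so scale the numeric features first
num_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
X_num = StandardScaler().fit_transform(data[num_cols])
labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X_num)
# DBSCAN assigns -1 to points that belong to no dense cluster (noise/outliers)
data_clean = data[labels != -1]
print("Rows flagged as outliers:", (labels == -1).sum())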
5. Data Visualization¶
Performed Univariate Analysis (single variable), Bivariate Analysis (two variables), and Multivariate Analysis (multiple variables).
Objective: Explore relationships between features and identify meaningful trends.
Visualizations Used:
- Histograms, boxplots, bar charts, and pie charts for single variables
- Countplots for categorical variables (`Gender`, `CALC`, etc.)
- Scatter plots, pairplots, correlation heatmaps, 3D plots, and parallel coordinates for relationships among multiple variables
Outcome: Found significant patterns between lifestyle habits and obesity levels.
6. Descriptive Statistics¶
Objective: Summarize central tendencies, dispersion, and frequency distributions in data.
Outcome: Observed skewed distributions in weight-related variables across obesity levels.
7. Feature Engineering & Transformation¶
Objective: Prepare variables for machine learning models.
Steps Taken:
- Label encoding of categorical variables
- Feature extraction and restructuring for better model input
- Feature Scaling: Used StandardScaler to standardize numerical values (mean = 0, variance = 1)
Reason: StandardScaler ensures fair contribution of features to distance-based models like KNN or SVM.
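In code, these two steps amount to roughly the following (a condensed sketch; the column selections are illustrative):
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode the 7 target labels as integers 0-6
le = LabelEncoder()
y = le.fit_transform(data['NObeyesdad'])
# Standardize numeric features: subtract the mean, divide by the standard deviation
num_cols = ['Age', 'Height', 'Weight']
data[num_cols] = StandardScaler().fit_transform(data[num_cols])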
8. Dimensionality Reduction¶
Objective: Reduce the number of features while preserving data variance.
This step was performed for visualization purposes only (see the sketch below).
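Assuming PCA was the reduction technique used for the plot (the report does not name it explicitly), a minimal 2-D visualization sketch:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the scaled feature matrix X onto the two directions of highest variance
X_2d = PCA(n_components=2).fit_transform(X)
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap='viridis', s=10)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('2-D projection of the dataset, colored by obesity level')
plt.show()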
9. Model Preparation¶
Objective: Split dataset for training and testing phases to ensure fair performance evaluation.
Method Used: train_test_split
Split Ratio: 80% training and 20% testing
Outcome: Training and testing sets ready for modeling.
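The split itself is a one-liner. The `stratify=y` argument, which keeps class proportions equal across the two sets, is an assumption here, since the report does not state whether stratification was used:
from sklearn.model_selection import train_test_split

# 80% training / 20% testing; random_state fixes the shuffle for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)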
10. Model Building & Evaluation¶
Objective: Train, validate, and compare different machine learning classification models.
Models Used:
- Logistic Regression
- K-Nearest Neighbors (KNN)
- Random Forest
- Support Vector Machine (SVM)
- Decision Tree Classifier
- Neural Network (MLPClassifier)
- AdaBoost (Adaptive Boosting)
- XGBoost (Extreme Gradient Boosting)
Hybrid Approach:
- Final model selection followed a hybrid strategy that pairs complementary models rather than relying on a single classifier.
- This hybrid approach allowed capturing both linear and non-linear patterns in the data.
- The SVM model is used in deployment for making predictions, while XGBoost provides transparent, interpretable insights to support model decisions.
- This hybrid approach ensures that the system is both effective and trustworthy.
Evaluation Metrics:
- Accuracy
- Precision, Recall, F1-score (via `classification_report`)
- Confusion Matrix
- ROC-AUC (where applicable)
Outcome: The hybrid model showed the best generalization with balanced performance across all metrics.
11. Feature Importance Analysis¶
Objective: Identify which features most significantly impact the prediction of obesity levels.
Techniques Used:
- SVM (with Linear Kernel): Used absolute values of the coefficients from the trained model to interpret feature importance.
- XGBoost: Used the built-in `feature_importances_` attribute and `plot_importance()` function to visualize top predictors.
Outcome:
- Features such as `Weight`, `Age`, `FCVC` (vegetable consumption), and `FAF` (physical activity frequency) showed strong predictive power.
- Feature-importance insights were valuable both for improving model performance and for understanding the key health indicators contributing to obesity.
Final Inference:¶
The project successfully built a robust classification system for predicting obesity levels based on lifestyle and physiological variables. Outlier treatment using DBSCAN and standardized scaling significantly improved the model's performance. The final hybrid model outperformed individual classifiers, achieving strong accuracy and reliability, demonstrating the effectiveness of ensemble learning for complex health classification tasks.¶