Student Score Prediction Machine Learning

EDA | Correlation | PCA | Feature Engineering | XGBoost

Preamble

This project was my first externally assessed piece of data analytics work. I had 4 days to explore a dataset and build an end-to-end machine learning model (one that receives user inputs and makes a prediction). The dataset consists of approximately 15000 rows and 20 parameters (from bag color to travel mode to attendance rate) on Secondary 4 (15-16 year old) students. The feedback from the assessors was that overall, it was very well done and they were particularly impressed by the PCA breakdown (although for this instance it did not generate new features).

I spent 1.5 days on this section of the work and another 2.5 days on creating and deploying the machine learning model (this was my first time working on deployment - bash scripts and YAML files were foreign concepts, so I decided to allocate more time to the deployment of the model). The entire Jupyter Notebook is available below. To check out the deployed machine learning model, look out for Part 2 of this project!

Introduction

The problem given is to create a classification model and regression model based on final test results so that schools can intervene and support students in need before their actual O-levels.

Through this EDA, the following should be achieved:

  1. Successful and complete extraction of data
  2. Removal of erroneous entries
  3. Understanding of every feature and the relationships between features and the target (final test scores)
  4. Encoding of features so that they may be used by the candidate model(s)
  5. Engineering of features based on relationships between features and the target
  6. Filling of empty data as effectively as possible to maximize data utility
  7. Removal of features that do not benefit the model(s)

Data Extraction

sqlite3 and pandas are used to quickly convert the database file into an easily manipulable dataframe in the notebook.
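A minimal sketch of this step, assuming a hypothetical database filename (score.db) and table name (score); the actual names used in the project may differ:

```python
import sqlite3
import pandas as pd

# Connect to the assumed database file and list the tables it contains.
conn = sqlite3.connect("score.db")
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
print(tables)

# Load the assumed 'score' table into a dataframe for exploration.
df = pd.read_sql("SELECT * FROM score;", conn)
conn.close()

print(df.shape)
df.head()
```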

Exploring Feature Profiles and Relationships

Different methods are used to determine the characteristics of each feature and their relationships with one another.

Summary of Dataset and Feature Profiles with pandas_profiling
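The report itself only takes a couple of lines to produce; a sketch assuming the dataframe from the extraction step is named df:

```python
from pandas_profiling import ProfileReport  # newer releases ship as ydata_profiling

# Summarize every feature: types, distributions, missing values, and
# warnings such as high cardinality or high correlation.
profile = ProfileReport(df, title="Student Score Dataset Profile")
profile.to_notebook_iframe()   # render inline in the Jupyter Notebook
# profile.to_file("student_score_profile.html")  # or export to HTML
```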

Analyzing Pandas Profiling Report Warnings

student_id has a high cardinality: 15000 distinct values (high cardinality). High cardinality is expected here, as each student should have a unique ID. However, since the dataset has 15900 rows, some students appear more than once, which suggests that duplicate entries exist.

n_male and n_female are highly correlated with each other (high correlation). This is expected, as the numbers of male and female students in each class should scale with class size in mixed schools. Note: cases where n_male == 0 or n_female == 0 might be an identifier for single-sex schools and a possible additional feature.

hours_per_week is highly correlated with final_test (high correlation). hours_per_week is a numerical representation of study effort, so this correlation is expected and likely to be positive.

wake_time is highly correlated with mode_of_transport and sleep_time (high correlation). The correlation between wake_time and mode_of_transport suggests that some modes of transport are associated with earlier or later waking times; further analysis is needed to determine the actual relationship. The correlation between wake_time and sleep_time is expected, as students who need to wake up earlier are likely to need to sleep earlier. Note: mode_of_transport likely requires one-hot encoding, since its categories are not strictly ordinal (in terms of time taken to get to school, no mode is strictly faster than the others, e.g. a walk to school may take little time even though walking is slower than private transport).

number_of_siblings is highly correlated with final_test (high correlation). This negative correlation is interesting and explainable (family resource distribution, distraction levels), and it indicates that this is an important feature for predicting scores.

n_female is highly correlated with gender (high correlation). This correlation may simply reflect that a student's gender implies the presence of students of that gender in their class.

direct_admission is highly correlated with final_test (high correlation). This correlation is interesting and needs further analysis to determine whether direct admission is positively or negatively related to final test scores. Note: the effect of direct_admission may be highly dependent on the CCA the direct admission is for; it may be useful to classify the CCAs into sports and non-sports categories later on (if that has not been done) and use this feature in combination with direct_admission.

final_test is highly correlated with hours_per_week and 3 other fields (high correlation). The 4 fields are attendance rate, number of siblings, hours of study per week and direct admission state. Direct admission state will require additional analysis to determine its actual relationship with test scores.

final_test has 495 (3.1%) missing values (missing). Since final_test is the target, there is no choice but to let this data go; any form of imputation would corrupt the dataset with a bias towards the imputation method.

attendance_rate has 778 (4.9%) missing values (missing). Imputation may be worthwhile, as 4.9% is a substantial share of the data.

n_male has 360 (2.3%) zeros and n_female has 997 (6.3%) zeros (zeros). This confirms that there are students from single-sex schools. Note 1: In Singapore, there are quite a few single-sex schools that perform relatively well academically, so a positive correlation is expected between school type (reflected by n_male and n_female) and test scores. Interestingly, however, the dataset states that the data is from a single school, which would mean the school has some single-gender classes and some mixed-gender classes. This may not really be the case, but regardless, it may be useful to add a single_sex_class binary feature to capture this possibility. Note 2: Both n_male and n_female are slightly negatively correlated with the final test score, suggesting that class size may be affecting the final test score (larger class, lower score, regardless of gender). Adding a feature that explicitly sums n_male and n_female to give the class size is likely to help the models capture this relationship explicitly.

tuition: a boolean feature that will need analysis to determine its relationship with test scores, although the Phik (φk) correlation heat map already shows a positive correlation between tuition and test scores.

bag_color: almost certainly (99.99%) a feature to be removed.

learning_style: a categorical feature (audio/visual) that will need analysis to determine its relationship with test scores and whether it should be removed. Note: this classification may not be very useful, as studies have shown that no such differentiation exists; however, it may have resulted in different treatment of the students or some behavioral/psychological effect on students who believe they are audio/visual learners: https://journals.sagepub.com/doi/full/10.1111/j.1539-6053.2009.01038.x

Focused Feature Analysis and Feature Engineering

Bag color and Student ID (bag_color,student_id)

Student ID cannot possibly affect the score in a useful way, but can bag color really affect test scores?

Let us deal with the possible duplicate entries in student_id first.
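A sketch of one way to handle them, keeping the first occurrence of each student_id (the exact tie-breaking rule used in the project is an assumption):

```python
# Count the rows involved in duplicated student_ids.
dup_mask = df.duplicated(subset="student_id", keep=False)
print(f"Rows with duplicated student_ids: {dup_mask.sum()}")

# Keep the first occurrence of each student_id and drop the rest.
df = df.drop_duplicates(subset="student_id", keep="first").reset_index(drop=True)
print(f"Remaining rows: {len(df)}, unique student_ids: {df['student_id'].nunique()}")
```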

After going through that exercise, there are no more duplicate student_ids. But there are still entries with missing attendance rate and final test scores. Whether it is better to impute these entries in a certain way, leave them, or remove them entirely is best figured out by validating the model before and after the adjustments.

The swarm plot visually indicates no difference in score distributions between bag colors (note: the swarm colors do not correspond to the actual bag colors).

The statistics agree with logic and confirm negligible differences in test score distributions between bag colors. This feature gives no information on test scores and will very likely be removed before model building.

Enrolled CCA (CCA)

Visually, it is clear that having no CCA results in higher test scores.

Statistically the difference is huge: the mean score of students with no CCA is at least 10 marks higher than that of students with any CCA. Note that there seems to be minimal difference in performance based on which CCA a student belongs to. It looks like it may be possible to convert CCA to a boolean to reduce model complexity.

Direct Admission State and Enrolled CCA (direct_admission, CCA)

This sub-section will focus on Direct Admission State and CCA interactions. Note that these two features were isolated due to the domain knowledge that Direct Admission State is closely linked to CCA as most students undergo direct admission through a specific skill which they will develop in their CCA. Alternatively, direct admission students can be students who are participants in academic competitions unrelated to CCAs (e.g. Math/Science Olympiad winners or Language/Humanities top scorers: https://www.moe.gov.sg/secondary/dsa)

The first observation is that there are clearly many more non-direct admission students than direct admission students.

The second observation is that the distinct swarms at different score levels and the significantly larger variance of the direct admission students indicate that there might actually be two or more score distributions within the direct admission students.

The mean score of the direct admission students is significantly higher (by 7.3 points) than that of the other students. The difference between the medians is even larger, at 12 points, indicating that a disproportionate number of students in the direct admission pool have scores at the lower end of the spectrum. The negative skew of the direct admission score distribution is captured by the fact that the mean is lower than the median.

The difference between the direct admission and non-direct admission score distributions is made clear by the overlain KDE plots. The negative skew and split in distributions are visibly caused by a second peak at around 45-49 points.

Plotting the distributions of the direct admission students from different CCAs shows that the direct admission students with no CCA have a distinct distribution from the direct admission students with CCAs. When the CCA is one-hot encoded, this distinction will be captured. It might also be worthwhile to create an additional label for direct admission students that distinguishes those who are in CCAs from those who are not, since the distributions of the two groups are so different.

To confirm that the direct admission students with no CCA are a different group from the non-direct admission students with no CCA, a KDE plot is made to characterize the two groups. The difference between the two distributions is clear, and the additional feature identifying this type of direct admission student will be added.
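A sketch of how that flag could be built; the column encodings ('Yes' for direct_admission and 'None' for students without a CCA) are assumptions:

```python
# Flag direct admission students who are not enrolled in any CCA.
df["da_no_cca"] = (
    (df["direct_admission"] == "Yes") & (df["CCA"] == "None")
).astype(int)

print(df["da_no_cca"].value_counts())
```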

Waking Time and Sleeping Time (wake_time, sleep_time)

This sub-section will analyze the time-related features. A quick note: due to the cyclical nature of time, it should be converted through a cyclical function before model training if the times are going to be compared to one another (e.g. 2300H can otherwise be seen as distant from 0000H even though they are only 1h apart).

Waking time does not seem to be strongly correlated with test scores. Interestingly, the number of students and the score distribution at each waking time are similar across the different waking times.

Sleeping time distributions show that most students sleep between 21:00 and 0:00. Because fewer students sleep at the later times, it is not visibly apparent how the scores are distributed for each sleeping time, although it is clear that those who sleep after 1:00 fall within the <50 score range.

A parameter that may be a better indicator of test score is the number of hours slept, which is (wake_time - sleep_time) after both columns are converted from object to time/datetime types.
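A minimal sketch of that conversion, assuming both columns are stored as 'HH:MM' strings; the subtraction is wrapped across midnight since students sleep before midnight and wake after it:

```python
# Parse the assumed 'HH:MM' strings into datetimes (the date part is a dummy).
wake = pd.to_datetime(df["wake_time"], format="%H:%M")
sleep = pd.to_datetime(df["sleep_time"], format="%H:%M")

# A negative raw difference means the sleep time is on the previous day,
# so add 24 hours to wrap across midnight.
hours = (wake - sleep).dt.total_seconds() / 3600
df["sleep_hours"] = hours.where(hours > 0, hours + 24)

print(df["sleep_hours"].describe())
```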

Notice that sleep hours shows a much clearer distinction between test score distributions, with the large majority of students who sleep fewer than 7 hours performing strictly within the <55 score range.

The clear difference in score distributions means that sleep hours is a good feature for predicting test scores. Students who sleep 6 hours or less are very likely to score around 43-45 with a standard deviation of around 3.6.

Students who sleep 7 hours or more are likely to score much higher; however, the high variance in these sub-groups indicates that there are other factors affecting their scores aside from sleep hours.

Attendance Rate, Study Hours and Age (attendance_rate,hours_per_week,age)

The first batch of features, consisting of continuous data, will be analyzed in this sub-section.

The regression plot with an order-2 polynomial best-fit line shows that there is an optimum number of hours to study per week (around 10h). It also shows that there are students who supposedly do not study much but perform relatively well compared to students who study the same amount. These students may be anomalies.

Box plots are a good way to identify anomalies visually. By looking for data points that are visibly distant from Q1 and Q3 of the data, anomalies are quickly spotted. As initially suspected, the students who study for less than 3 hours but score above 75 are anomalies. Removing these anomalies may benefit the model training process.

Only 24 entries in the entire dataset fall into this category; removing them from the dataset will likely help the study hours per week feature predict scores more accurately.

Attendance rate shows a clear positive correlation with the test scores with no anomalous activity.

Some entries for age seem to be erroneous, since a negative age is not possible; they must be removed before model training. There also seem to be some ages that were mislabeled. Since the data is based on O-Level students, the ages should be 15/16, so it will be assumed that 5 and 6 correspond to 15 and 16. In reality, it is best to clarify with the data owner whether this is the case.
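A sketch of that clean-up under the stated assumption:

```python
# Drop impossible negative ages, then remap the presumed typos 5 -> 15 and 6 -> 16.
df = df[df["age"] > 0].reset_index(drop=True)
df["age"] = df["age"].replace({5: 15, 6: 16})

print(df["age"].value_counts())
```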

No significant difference in score distributions between students aged 15-16. This is expected because both age groups are in the same education system and any advantage from being born a few months earlier becomes insignificant over 15+ years. Age data might be noise in this context and should be considered for removal (to be confirmed during model evaluations).

Tuition, Gender and Learning Style (tuition,gender,learning_style)

Tuition has a clear positive impact on the score distribution. It may be interesting to look at the relationship between study hours per week and tuition status in case there is a hidden relationship between the two features (e.g. students with tuition may not count tuition hours as study hours, resulting in students with low study hours, 'Yes' for tuition and good scores).

It seems that a substantial number of the <=4h study time students who score well have tuition, indicating that some of the previously identified anomalies could have counted tuition hours outside of their reported study hours, or reported tuition hours as their study hours while excluding any other study time.

The proportion of students with tuition among the anomalies is similar to the proportion in the overall dataset, so tuition does not explain their scores. It seems the students with low study hours and high scores are confirmed to be anomalies and can be removed.
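A sketch of the removal, using the thresholds identified earlier (fewer than 3 study hours per week and a score above 75):

```python
# Anomalous combination identified above: very low study hours but high scores.
anomaly_mask = (df["hours_per_week"] < 3) & (df["final_test"] > 75)
print(f"Anomalous entries: {anomaly_mask.sum()}")

df = df[~anomaly_mask].reset_index(drop=True)
```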

Gender alone does not seem to affect the score distribution significantly, but it is related to the possibility of belonging to an all-boys or all-girls class.

Learning style clearly affects the test scores, with visual learners performing significantly better than auditory learners (a higher mean by 8 points and a higher median by 9 points).

Mode of Transport (mode_of_transport)

It is not immediately apparent how this categorical feature affects test scores.
The categories are ordinal in the sense that they have comparable speeds, but that alone should have no effect on a student's score.
This suggests that it may not be the mode of transport that affects the score, but the implications of using a certain mode of transport.
For example, having private transportation can imply that the student's family can afford a car and hence possibly other resources.
Relating the mode of transport to wake time could also be an indicator of affluence and access to time efficiency (e.g. early wake time and walking implies possible lack of resources, while late wake time and driving could mean an abundance of resources).

Based on the statistics and distributions, there is no significant difference between the performance of students using the different modes of transport.
Let us try to determine if the mode of transport even affects sleep time.

The mode of transport does not seem to affect the number of hours a person sleeps either.

Travel mode seems to have no effect on sleep time except for a slightly higher density at the 8h sleep mark.

Travel mode also does not seem to affect the time spent studying.

There also does not seem to be any particular relationship between sleep hours, test scores and mode of transport (when considered simultaneously).

Mode of transport is strongly correlated with wake time. Clearly, students who walk get to wake up the latest, while those who take public transport need to wake up the earliest.
But as established earlier, neither wake time nor sleep time alone is a good indicator of test performance, which explains the apparent absence of any effect of mode of transport on scores.

Statistically, the difference in wake times is clear, with approximately one-hour increments in wake time between those who walk, those who take private transportation and those who take public transportation. As discussed earlier, there is no expectation for wake time to be strongly correlated with the score; hence mode of transport, which is strongly correlated with wake time, also has no strong correlation with test scores.

Female and Male Classmates (n_female,n_male)

This is a possible indicator of a hidden boolean feature: single-sex class versus non-single-sex class.
It is also an indicator of class size, which is another possible additional feature.
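A sketch of both derived features (the column names mirror those referenced later in the analysis):

```python
# Class size as the explicit sum of male and female classmates.
df["class_size"] = df["n_male"] + df["n_female"]

# Single-sex class indicators: a zero count of one gender implies the class
# consists entirely of the other gender.
df["male_class"] = (df["n_female"] == 0).astype(int)
df["female_class"] = (df["n_male"] == 0).astype(int)

print(df[["class_size", "male_class", "female_class"]].describe())
```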

As anticipated, there seems to be a negative correlation between number of females in the class and the score distribution.

Similarly a negative correlation between number of males in the class and the score distribution can be seen.

Using the joint KDE plots, it is clear that the classes with fewer students have a much more favorable score distribution. When splitting the n_female feature into different class sizes, the categories with fewer students (n_female_cat 1) show a more dominant positive skew towards the higher scores as compared to the categories with more students (n_female_cat 2 and 3)

A similar observation can be made for the male students and their different class sizes. However there are two key differences:
(i) While classes with few females make up the majority in the n_female feature, for the n_male feature it is the mid-sized classes that make up the bulk.
(ii) The n_male_cat 2 classes have a slightly more positive skew as compared to the n_male_cat 1 classes, unlike what was seen in the n_female_cat analysis.
This distinction suggests that it is useful to keep the male and female class size features distinct.

There seems to be additional complexity in the class_size distribution with clusters forming at different sections of the grid. This implies there is an additional dimension to the data that is causing clustering of the class size data.

For male single-sex classes, the effect of the single-sex class is distinct enough that it shows up on the grid, implying that the male single-sex feature may give additional information about test performance on top of class size and gender distribution.

Single-sex female classes do not seem to have a distinct performance.

This is confirmed by the joint plot in scatter form, which shows single-sex female classes performing at different levels across the different class sizes. One point to note is that there is a clear positive trend as class sizes get smaller for the single-sex female classes; small classes are in fact a distinct feature of some of the better-performing single-sex schools.

Overall, the number of students from single-sex classes is not substantial and the trends are not clearly apparent, so the effect of the single-sex features will need to be determined during model validation.

Class gender ratio could also be a factor affecting performance, although this is unlikely.

Based on the plot, the gender_ratio feature is not going to be useful, as the test scores are spread roughly uniformly across different gender ratios.

Number of Siblings (number_of_siblings)

This was previously noted to have a negative correlation with test scores based on the profiling report.

The regression line confirms that there is a negative correlation between test scores and number of siblings.

The overlain density plots for each sibling category confirm that the distributions are in fact distinct and that this will be a good feature for predicting test scores. Of interest are the distinct triple peak for students with 2 siblings and the double peak for students with no siblings. This could be due to a feature related to resources (as resource distribution is affected when there are siblings in the family), most likely tuition.

Tuition does indeed seem to cause a rift in the test score distributions of students with 2 siblings, and the lack of tuition explains the peak at around 43 marks, indicating a ceiling on the performance of some of these students due to the lack of tuition. However, when comparing these distributions against the earlier plots of students with tuition versus those without, it is peculiar that the highest density for students with 2 siblings and no tuition is in the 73-mark region, whereas the peak for students with no tuition in general is at around 50 marks. This could mean that students with 2 siblings are in fact 'overcompensating' for their lack of tuition with additional effort, which is most likely captured by the hours studied per week.

As confirmed by the statistics above, there is a group of students with 2 siblings who study significantly more than their peers, which results in the mean being almost a full hour higher than the median.
It is likely that a large portion of this group belongs to the no-tuition group, explaining the unexpected spike at 73 for students with no tuition and 2 siblings.
The takeaway is that tuition status and number of siblings could be an indicator of students lacking in resources, and by using learning hours to distinguish this group of students, the model might be able to better predict that their performance is likely to be above average.

By reversing the previous logic, students with no siblings and with tuition are likely to be in a privileged position which allows them to perform exceptionally well. The plot above confirms that this theory holds, and the fact that the mean number of hours studied by students with 0 siblings is also almost a full hour higher than the median indicates that there is a group of 'overachievers' with no siblings who study an exceptional number of hours on top of their tuition. A 'privilege_rating' feature seems highly plausible at this point; it will be created first and tested with an actual model later to determine whether it helps with score prediction. This feature is ordinal, since privilege runs on a spectrum.
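One possible construction of this ordinal feature is sketched below; the exact formula is an assumption (with tuition assumed to be encoded as 'Yes'/'No'), chosen so that having tuition raises the rating and each additional sibling lowers it:

```python
# Higher values = more privileged: tuition adds to the rating, siblings subtract.
df["privilege"] = (df["tuition"] == "Yes").astype(int) - df["number_of_siblings"]

print(df.groupby("privilege")["final_test"].agg(["count", "mean", "median"]))
```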

Interestingly, even though the underprivileged and privileged groups each have a sub-group that spends more time studying and pulls up the average study hours of their respective categories, it is not these sub-groups who contribute to the high scores. Rather, it is the group that studies the statistically optimal 9h per week among the underprivileged, and the group that studies 5-10 hours per week among the privileged, that contribute to the high scores (based on the mean and median).

This concludes the focused feature analysis. In the next section, we will encode relevant features and begin making the difficult decisions for feature selection and imputation versus data removal before finally embarking on the model training.

Encoding, Feature Selection and Data Imputation or Removal

We will use both unsupervised and supervised feature selection.

Encoding

The columns are confirmed to be categorical and will be one-hot encoded.
Note that even though mode of transport seems ordinal (in terms of speed), the earlier analysis has shown that in relation to the test scores this ordinal relationship does not hold (faster/slower does not mean better/worse), so it will be treated as a nominal categorical feature.
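A sketch of the encoding step; the exact list of nominal columns is an assumption:

```python
# Assumed nominal columns; CCA could alternatively be reduced to a boolean
# as discussed earlier.
categorical_cols = ["CCA", "mode_of_transport", "learning_style", "gender",
                    "direct_admission", "tuition"]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print(df.columns.tolist())
```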

This leaves us with the wake and sleep times, which are datetime objects.
For this specific instance, since we are dealing with time one day at a time, it is not necessary to think cyclically; we will remap the times onto a continuous linear scale instead. It is not flawless, but it works for the time range the data is most likely to fall in.
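A sketch of one such remapping, assuming 'HH:MM' strings and an (assumed) noon cut-off so that early-morning times are treated as belonging to the next day:

```python
def to_linear_hours(series, cutoff=12):
    """Map 'HH:MM' strings onto a single linear hour scale.

    Times before the cut-off are shifted by 24h, so 23:00 -> 23.0,
    01:00 -> 25.0 and 06:30 -> 30.5, keeping evening-to-morning ordering.
    """
    t = pd.to_datetime(series, format="%H:%M")
    hours = t.dt.hour + t.dt.minute / 60
    return hours.where(hours >= cutoff, hours + 24)

df["sleep_time"] = to_linear_hours(df["sleep_time"])
df["wake_time"] = to_linear_hours(df["wake_time"])
```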

The output is what we expect, and establishes the linear relationship between sleep time and wake time.

The data looks almost ready to go. This problem has moderate dimensionality, a small dataset, features on different scales (which can be scaled if needed), many zeros, and around 5% missing values in non-target features which may or may not benefit from imputation; in the context of this problem it also requires both a regression model and a classification model as output.
XGBoost is a good candidate for dealing with these conditions.

Imputation

Only attendance rate still has missing values, so this will be the only feature dealt with in this sub-section. Since XGBoost is the highest-priority candidate model, all validation of the adjustments' effects will be based on XGBoost. A critical point to note is that XGBoost was designed to handle NaN values: at each split it learns a default direction for rows whose value for that feature is missing. This means a prediction can still be made reasonably accurately even with missing values; however, because it was previously established that attendance_rate has a strong correlation with test scores, imputing values into this feature is likely to improve the model's ability to predict scores overall.

First, we establish the baseline for both models. Since we want both models to perform well, imputations made to the data should (ideally) make both models perform better.

But before that, the labels for the classification model need to be determined. Since education is about equalizing, but resources are limited and resource allocation is about optimization, the students should be banded based on scoring percentiles, with those scoring below a certain percentile considered as 'requiring support/attention'. This makes more sense than setting a raw score threshold, because resources should be allocated in proportion to neediness, yet the number of needy students a school can support is in reality limited. Hence, schools should focus on those performing worst in their school rather than on everyone performing below some specific score (which might diffuse attention and resources away from those who need them most).

Creating a Target for Classification

The idea is that the weaker the student, the lower the score; this gives a classification target with an adaptive threshold. This index can also be converted directly into the priority of help that should be given to each student.
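A sketch of the banding; the percentile cut-offs are illustrative assumptions rather than the exact thresholds used, and rows with missing final_test are assumed to have been dropped already:

```python
import numpy as np

# Band students into Final Grades 1-4 by score percentiles (assumed cut-offs).
# Grade 1 = weakest band, i.e. the highest priority for support.
cutoffs = df["final_test"].quantile([0.10, 0.30, 0.60]).tolist()
df["final_grade"] = pd.cut(
    df["final_test"],
    bins=[-np.inf, *cutoffs, np.inf],
    labels=[1, 2, 3, 4],
).astype(int)

print(df["final_grade"].value_counts().sort_index())
```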

Conduct Validation of Regression Model with Simply Imputed Data

Conduct Validation of Regression Model with Iteratively Imputed Data
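A condensed sketch covering both imputation experiments above (the imputer settings, CV folds and feature-matrix construction are assumptions; swapping in XGBClassifier and accuracy scoring gives the classification variant used in the next two sub-sections):

```python
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Assumed construction of the feature matrix: numeric columns only,
# rows without a final_test score dropped.
data = df.dropna(subset=["final_test"])
y = data["final_test"]
X = (data.drop(columns=["final_test", "final_grade", "student_id"], errors="ignore")
         .select_dtypes(include=["number", "bool"]).astype(float))

for name, imputer in [("simple", SimpleImputer(strategy="mean")),
                      ("iterative", IterativeImputer(random_state=42))]:
    X_imp = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    scores = cross_val_score(XGBRegressor(random_state=42), X_imp, y,
                             scoring="neg_mean_squared_error", cv=5)
    print(f"{name} imputer, mean MSE: {-scores.mean():.2f}")
```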

There is a small difference between the performance of the simple imputer and the iterative imputer in this case: the simple imputer generates an average MSE of 32.49, while the iterative imputer generates an average MSE of 32.25. We will use the iteratively imputed data in this case.

Conduct Validation of Classification Model with Simply Imputed Data

Conduct Validation of Classification Model with Iteratively Imputed Data

We see that for classification, the iteratively imputed data actually does more poorly than the simply imputed data (a drop in accuracy from 0.7419 to 0.7399). This gives us an idea of how the pipeline configuration for each model should differ to optimize the data for that model's performance.
For this problem, and based on the above experiment: (i) simple imputation should be used when processing the data for the classification model, while (ii) iterative imputation should be used when processing the data for the regression model.

Feature Selection and Feature Engineering

This section removes features that are less obviously useless and creates features that can support prediction. There are many ways to do this, but for the scope of this EDA we will only use PCA to engineer new features and sequential feature selection to select features. Feature engineering will be attempted first in case useless features are engineered, in which case feature selection can remove them at a later stage.

First off, because PCA is a function of variance (a geometric attribute of the data), it is important to standardize the applicable columns (columns where the data are actual quantitative distributions, not some serial information) to bring them onto the same scale and prevent specific features from dominating simply due to their magnitude.

Feature Engineering with PCA

Explained variance is used to identify the key principal components, i.e. those along which the features vary most relative to one another. This can be useful for generating new features based on the relationships between features.
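A sketch of this step; quant_cols is an assumed list of the genuinely quantitative columns, and X_iter is the iteratively imputed feature matrix referred to later in the text:

```python
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Assumed subset of quantitative columns to standardize and decompose.
quant_cols = ["n_male", "n_female", "n_male_cat", "n_female_cat", "class_size",
              "attendance_rate", "hours_per_week", "number_of_siblings",
              "sleep_hours", "wake_time", "sleep_time", "privilege", "age"]

X_scaled = StandardScaler().fit_transform(X_iter[quant_cols])

pca = PCA()
pcs = pca.fit_transform(X_scaled)

# Share of variance explained by each component.
print(pca.explained_variance_ratio_.round(3))

# Loadings: how strongly each original feature contributes to each component.
loadings = pd.DataFrame(pca.components_.T, index=quant_cols,
                        columns=[f"PC{i + 1}" for i in range(len(quant_cols))])
print(loadings.round(2))
```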

Typically, the relationships between features in components with high explained variance can be tricky to decipher.
PC1: This component highlights the strong negative correlation between the numbers of students of opposite genders, and the high variance is compounded by the negative correlation between the class gender categories (i.e. a class with many males is likely to have few females).
PC2: The second component highlights the strong positive correlation between attendance rate and sleep hours. It also highlights that these features are negatively correlated with sleep time. This actually indicates that sleep hours may be a useful feature for score prediction, since we know it is quite a different feature from attendance rate, yet it supports the 'positive behavior' while being negatively correlated with a 'negative behavior'.
PC3: The third component simply highlights the negative correlation between the privilege feature that was included and the number of siblings. This is expected, since privilege was intended to be negatively correlated with the number of siblings.

Based on the mutual information, some PCs with high MI scores have surfaced as strong predictors, indicating that the combinations of features with high loadings in those PCs are useful for determining how a student will score. Conversely, PCs with low MI scores may indicate that the relationships between the features in those PCs are not useful for determining how well a student will score.
Note: the +/- signs indicate the direction of each feature's loading within the PC, which is critical to understanding what the PC means.
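A sketch of how the MI scores might be computed for the components (y_train is assumed to be the final_test target aligned with the rows of X_iter):

```python
from sklearn.feature_selection import mutual_info_regression

# Mutual information between each principal component and the test score.
mi_scores = mutual_info_regression(pcs, y_train, random_state=42)
mi = pd.Series(mi_scores, index=[f"PC{i + 1}" for i in range(pcs.shape[1])])
print(mi.sort_values(ascending=False).round(3))
```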

Features with High Loadings in High MI Score PCs
--PC8+ number_of_siblings
--PC8- privilege
--PC9+ n_female_cat
--PC9- n_male_cat
--PC11+ n_male
--PC11- n_female

Features with High Loadings in Low MI Score PCs
--PC13- n_male
--PC13- n_female
--PC13+ class_size
--PC12+ wake_time
--PC12- sleep_time
--PC10+ attendance_rate
--PC10- sleep_hours

PC8 highlights a condition that was noticed during feature analysis: privilege can be derived from some of the other features and is useful for predicting scores.
PC9 and PC11 would have been useful hints that single-sex classes or gender ratios might be features to take note of, but those have already been explored and implemented/removed.
Overall, it seems that the feature engineering in the earlier stage has been comprehensive enough to capture, and even create, the useful dynamics between features for score prediction; no additional features will be added.

Feature Selection

The sequential feature selector uses a greedy algorithm to choose the most useful features one by one, or to remove the least useful features one by one. Greedy algorithms produce local optima, so there may be different results depending on which direction is used; generally, running both and taking the intersection is a balanced approach to selecting the best features.
There are 30 features in X_iter at the moment. For the first cut, we will select the top 20 features using the greedy algorithm in both directions and remove the features that have been rejected by both.

Feature Selection for Regression Model
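A sketch of the two-direction selection for the regression model (selector settings are assumptions; the classification variant swaps in XGBClassifier and accuracy scoring):

```python
from sklearn.feature_selection import SequentialFeatureSelector
from xgboost import XGBRegressor

estimator = XGBRegressor(random_state=42)

# Greedy selection of 20 of the 30 features, run in both directions.
forward = SequentialFeatureSelector(estimator, n_features_to_select=20,
                                    direction="forward", cv=5,
                                    scoring="neg_mean_squared_error")
backward = SequentialFeatureSelector(estimator, n_features_to_select=20,
                                     direction="backward", cv=5,
                                     scoring="neg_mean_squared_error")
forward.fit(X_iter, y_train)
backward.fit(X_iter, y_train)

kept_fwd = set(X_iter.columns[forward.get_support()])
kept_bwd = set(X_iter.columns[backward.get_support()])

# Drop only the features rejected by both directions.
to_drop = set(X_iter.columns) - (kept_fwd | kept_bwd)
print(sorted(to_drop))
```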

Feature Selection for Classification Model

Compared against the K-fold validation score for the regression model conducted with the full X_iter, this score is actually worse, with the MSE increasing from 32.25 to 32.38, meaning that we may have removed too many features.
However, for the classification model, the removal of these features actually improved the accuracy from 0.7399 to 0.7468.

Adding back the sleep hours improved the error from 32.38 back to 32.25. This may be the sweet spot, considering that removing more features results in a worse score, while adding back female_class only brings the score back to the original.

For the classification model, adding back the sleep hours seems to have made the classification slightly worse (a drop of 0.02% in accuracy), suggesting either that there is room to reduce complexity to benefit the classification model's performance, or that a more thorough validation is required to more accurately determine the impact of removing sleep hours.

Further experimentation shows that it is better to remove age and wake time as opposed to sleep hours. This makes sense as earlier analysis has shown that sleep hours has some correlation to test scores, while age and wake time did not show such correlations. Realizing this highlights the importance of understanding the features, and also the potential weakness of sequential feature selection (greedy algorithms will not always produce the global optimum in results).

Expected Performance

Having validated the model that is going to be used, it is always good to understand its weaknesses (so that they can be addressed). We will use a confusion matrix to see which categories the classification model has difficulty getting right.
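A sketch of that check, reusing the fitted classifier (xgb_c) from the earlier experiments; X_val and y_val are assumed hold-out splits:

```python
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

y_pred = xgb_c.predict(X_val)

# Rows = true Final Grades, columns = predicted Final Grades.
cm = confusion_matrix(y_val, y_pred)
print(cm)

ConfusionMatrixDisplay(cm, display_labels=[1, 2, 3, 4]).plot()
```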

From the confusion matrix above, it is apparent that the model is showing a bias towards classifying students in higher Grades.

We can see that the model has difficulty classifying students into their exact final grade category (e.g. for Final Grade 1 students, only 104/147 (70.7%) were correctly categorized - it is not very sensitive to the characteristics of Grade 1 students). Given the number of students that fall into Grade 1 based on the percentiles we set for the grade thresholds, there is sufficient data in terms of volume (relative to the dataset given) to characterize a Final Grade 1 student.
The poor sensitivity could simply mean that it is harder to predict students who are going to perform poorly than those who will do well (Final Grade 4 prediction has a sensitivity of 538/612, or 87.9%). It could also mean that the quality of the data collected from students belonging to Final Grade 1 is poorer (e.g. false data on the number of hours studied per week).
Among all the metrics with which to analyze the confusion matrix, sensitivity is the most relevant, as the school's top priority is to prevent students from falling through the cracks (in this case, being falsely classified as negative). To improve the sensitivity towards the characteristics of Grade 1 students, it would be good to either increase the quantity of data from Grade 1 students in the dataset or improve the quality of the data collected from these students. It would also help to collect data specific to the identification of Final Grade 1 students - perhaps something like "detentions_received". From a model-side perspective, identifying the features that help differentiate a Grade 1 student from other students (if such features exist) and assigning them a larger weight would address this issue.

If more time was available, experimenting with the exact percentiles to best split the threshold would be useful as well. For example, if the Grade 1 percentile threshold is too high, the characteristics of students who truly need help will be mixed with those who are on the borderline, or perhaps even just average. For this dataset, the threshold corresponds to those who score 48 marks and below for the final exam which is reasonable.

However, knowing that the model has a bias towards giving students a higher grade allows the user of the model to do one simple thing to take advantage of this fact: Take both Grade 1 and Grade 2 students as those who should be focused on. Based on the split above, doing so will capture 93.2% (137/147) of the students who require assistance (based on our percentile assumption), which shows good potential for a model which has not been tuned.

Predicting scores can be extremely difficult as exams are not the best environment for consistency. Even if a model has successfully identified that a student was supposed to perform well, it is possible that in the final exam the student fumbles due to stress, carelessness or inability to focus. If the model is expected to capture this information as well, it will be good to take multiple test scores and consolidate their average and variance as a proxy for performance consistency.

Conclusion

Through this EDA, we have achieved the core objectives that were aimed for:

  1. Cleaned up the data set nicely, removing rows with negative entries and duplicate entries as required
  2. Understood the characteristics of each feature using the profile report in combination with additional visualization and statistical analysis
  3. Encoded features based on the understanding of what their data represents
  4. Used logic and domain knowledge to establish relationships between features to create new features
  5. Used different imputation methods to determine which methods work better for which model
  6. Conducted PCA to statistically check for any additional relationships between features which may have been missed out
  7. Gone through every feature and removed features that do not assist in score prediction using understanding of the problem and sequential feature selectors, while identifying features that may be beneficial to one type of model but not the other

Note: Although the data was split into train, validate and test sets, no testing was done in this EDA; only cross-validation within the train set. A mini-exercise in the annex shows that the performance on the test set is similar to the performance assessed using cross-validation.

Thank you for embarking on this exploration with me :]

Annex

Continuing from the last classification example, a test is run to determine whether cross-validation is a good indicator of actual performance. The tests are run on the classification model, but the conclusions apply to the regression model as well.

We can see that xgb_c, fitted on the training set, has cross-validation scores that are very similar to its test set scores. This means that the cross-validation scores are a reasonable indicator of how the model will perform on test sets.

In this second experiment, the test set data was included during training as well. As can be seen, the performance is only slightly better, indicating that there is no overfitting of the model.

In this last experiment, the model is allowed to overfit to the data before being tested on the data it was trained on. This shows that the model works well for this problem and can indeed be fitted to the data 'perfectly', implying that if given enough (well-processed) data, it should be able to make good predictions on test performance.