# accident.py

# -*- coding: utf-8 -*-
"""Untitled

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1T8m11IVM12L2jouu5GAoMP7L_I0Z-WGZ
"""

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

# Read the accident records from the mounted Google Drive folder.
df = pd.read_csv("/content/drive/MyDrive/SGTL-Accident_data1.csv")

# Quick look at the first rows and the frame dimensions (these bare
# expressions only display something in an interactive session).
df.head()
df.shape

# Summary statistics with pandas' default column selection.
df.describe()

# Remove the Count and SPV columns.
df = df.drop(columns=['Count', 'SPV'])

# Remove the MONTH column.
df = df.drop(columns=['MONTH'])

df.head()

# Summary statistics for every column, not only the default selection.
df.describe(include="all")

# Column dtypes and non-null counts.
df.info()

# df6 is an independent frame without the NON-INJURY column.
df6 = df.drop(columns='NON-INJURY')

# From here on df and df9 are two names for the same frame, which also
# lacks the NON-INJURY column.
df9 = df.drop(columns='NON-INJURY')
df = df9

print(df9)

# Number of fully duplicated rows (displayed only in an interactive session).
df.duplicated().sum()

# Class counts of the ACC_TYPE column.
df['ACC_TYPE'].value_counts()

# Bar chart of the same class counts.
sns.countplot(x=df['ACC_TYPE'])
plt.title('Distribution of Accident severity')

# Missing values per column before imputation.
df.isna().sum()

# Names of the object-dtype columns, treated as categorical from here on.
categorical = [col for col in df.columns if df[col].dtype == 'O']
print('The categorical variables are', categorical)

# Replace missing categorical values with the most frequent value of the column.
for col in categorical:
    modes = df[col].mode()
    if modes.empty:
        # The column holds no non-missing value, so there is nothing to fill with.
        continue
    # Assign the result back instead of calling fillna(..., inplace=True) on
    # df[col]: that form works on an intermediate object and leaves df
    # unchanged when pandas Copy-on-Write is active.
    df[col] = df[col].fillna(modes[0])

# Missing values per column after imputation.
df.isna().sum()

# Scatter plot of ACC_TYPE against DIR, with points coloured by WEATHER.
sns.scatterplot(x=df['DIR'], y=df['ACC_TYPE'], hue=df['WEATHER'])

# Correlation is computed on numeric/boolean columns only: pandas >= 2.0
# raises on object columns instead of silently skipping them.
numeric_df = df.select_dtypes(include=['number', 'bool'])
numeric_df.corr()

sns.heatmap(numeric_df.corr())

# Names of every column that is not object-dtype.
numerical = [col for col in df.columns if df[col].dtype != 'O']
print('The numerica variables are', numerical)

# Histogram of each non-object column, two per row. The grid grows with the
# number of columns so that more than four columns no longer overflow a
# fixed 2x2 layout.
plt.figure(figsize=(10, 10))
hist_rows = max(2, (len(numerical) + 1) // 2)
plotnumber = 1
for col in numerical:
    ax1 = plt.subplot(hist_rows, 2, plotnumber)
    # Missing values are left out so they cannot break the bin range.
    plt.hist(df[col].dropna(), color='red')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.title('frequency of ' + col, fontsize=10)
    plotnumber += 1

# Horizontal count plot of each categorical column, stacked vertically.
# At least 28 rows are kept so the original layout is unchanged; the grid
# only grows if there are more categorical columns than that.
plt.figure(figsize=(10, 200))
count_rows = max(28, len(categorical))
plotnumber = 1

for col in categorical:
    # 'Pedestrian_movement' is skipped but still consumes a grid slot.
    if col != 'Pedestrian_movement':
        ax1 = plt.subplot(count_rows, 1, plotnumber)
        sns.countplot(data=df, y=col, palette='muted')
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.title(col.title(), fontsize=14)
        plt.xlabel('')
        plt.ylabel('')
    plotnumber += 1

df.dtypes

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Integer-encode every categorical column except the ACC_TYPE target and
# collect the codes in a separate frame used for the chi-squared test.
df1 = pd.DataFrame(
    {col: le.fit_transform(df[col]) for col in categorical if col != 'ACC_TYPE'}
)

df1.info()

# Annotated correlation heatmap of the encoded columns.
plt.figure(figsize=(22, 17))
sns.set(font_scale=1)
encoded_corr = df1.corr()
sns.heatmap(encoded_corr, annot=True)

# First rows of the label-encoded frame.
df1.head()

# Chi-squared test of each encoded column against ACC_TYPE.
from sklearn.feature_selection import chi2
f_p_values = chi2(df1, df['ACC_TYPE'])

# chi2 returns a pair of arrays: the test statistics and the p-values.
f_p_values

# Put both arrays next to the feature names for easier reading. The first
# array is stored under 'Fscore' and the second under 'Pvalues'.
test_statistics, p_values = f_p_values
f_p_values1 = pd.DataFrame(
    {
        'features': df1.columns,
        'Fscore': test_statistics,
        'Pvalues': p_values,
    }
)
f_p_values1

# Features ordered from the lowest p-value to the highest.
f_p_values1.sort_values(by='Pvalues', ascending=True)

# No column is removed here: df2 is simply a second name for df, so any
# in-place change to one is visible through the other.
df2 = df

df2.head()

df2.shape

# Object-dtype columns of df2, listed to inspect their distinct values.
categorical_new = [col for col in df2.columns if df2[col].dtype == 'O']
print(categorical_new)

for col in categorical_new:
    print(df2[col].value_counts())


df2.head()

# First look at the features and the target before encoding.
df3 = df2
x = df3.drop(['ACC_TYPE'], axis=1)
x.shape

x.head()

# Select the target by name; a positional index silently picks a different
# column as soon as the column order changes.
y = df3['ACC_TYPE']
y.head()

# Class counts of the target column.
y.value_counts()

# Count plot of the target classes.
sns.countplot(x=y, palette='muted')

# One-hot encode the categorical predictors. ACC_TYPE is deliberately left
# out: encoding the target would put ACC_TYPE_* indicator columns into the
# feature matrix and leak the label to every model.
predictor_columns = [
    'PRIMARY_REASON',
    'ACCIDENT_CAUSE_TYPE',
    "LANE_TYPE",
    'ROAD_CONDITION',
    'SECONDARY_REASON',
    'DIR',
    'ROAD_FEATURE',
    'WEATHER',
]
dummy = pd.get_dummies(df2[predictor_columns], drop_first=True)
dummy.head()


df4 = pd.concat([df2, dummy], axis=1)
df4.head()

# Drop the raw columns that were one-hot encoded, plus CHAINAGE and Date.
df4.drop(predictor_columns + ['CHAINAGE', 'Date'], axis=1, inplace=True)
df4.head()

x = df4.drop(['ACC_TYPE'], axis=1)
x.shape

# Again select the target by name rather than by position.
y = df4['ACC_TYPE']
y.head()

# Class counts of the target column.
y.value_counts()

# Count plot of the target classes.
sns.countplot(x=y, palette='muted')

# Imported before first use; the original called train_test_split ahead of
# its import statement.
from sklearn.model_selection import train_test_split

columns_with_missing_values = df.columns[df.isnull().any()]

# Drop rows of df that still contain missing values.
df.dropna(inplace=True)

df4.isna().sum()

# Drop every column of df4 that still contains a missing value.
df4.dropna(axis=1, inplace=True)

print(df4.isnull().sum())

# Features and target of the encoded frame.
X = df4.drop('ACC_TYPE', axis=1)  # Features
y = df4['ACC_TYPE']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

df4.head()

# NOTE(review): the cell that originally produced xo / yo is missing from
# this file, so both names were undefined. Plain random oversampling is used
# here as a stand-in: each class keeps its rows and gains random duplicates
# until it matches the largest class. Replace with the intended method
# (e.g. SMOTE) if that is what the notebook used.
class_sizes = y.value_counts()
largest_class = class_sizes.max()
balanced_parts = []
for label, size in class_sizes.items():
    class_rows = df4[y == label]
    extra_rows = class_rows.sample(n=largest_class - size, replace=True, random_state=42)
    balanced_parts.append(pd.concat([class_rows, extra_rows]))
balanced = pd.concat(balanced_parts).reset_index(drop=True)
xo = balanced.drop('ACC_TYPE', axis=1)
yo = balanced['ACC_TYPE']

# Class counts after oversampling.
y1 = pd.DataFrame(yo)
y1.value_counts()

sns.countplot(x=yo, palette='muted')

# 70% of the oversampled data for training, 30% for testing.
# NOTE(review): oversampling before the split lets duplicated rows land in
# both partitions, which can inflate test scores — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(xo, yo, test_size=0.30, random_state=42)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def _fit_and_report(model):
    """Fit *model* on the training split and print its test-set metrics.

    Uses the module-level x_train / y_train / x_test / y_test. Prints the
    confusion matrix, draws the confusion-matrix plot, and prints the
    accuracy and the classification report.

    Returns a tuple (predictions, confusion matrix, accuracy, report).
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    matrix = confusion_matrix(y_test, predictions)
    print(matrix, '\n')
    print(ConfusionMatrixDisplay.from_predictions(y_test, predictions))
    accuracy = accuracy_score(y_test, predictions)
    print(accuracy, '\n')
    report = classification_report(y_test, predictions)
    print(report)
    return predictions, matrix, accuracy, report


# K-nearest neighbours
model_KNN = KNeighborsClassifier(n_neighbors=5)
y_KNN, matrix_KNN, accuracy_KNN, report_KNN = _fit_and_report(model_KNN)

print(xo)

# Multinomial naive Bayes
model_naive = MultinomialNB()
y_naive, matrix_naive, accuracy_naive, report_naive = _fit_and_report(model_naive)

# Support vector machine
model_SVC = SVC()
y_SVC, matrix_SVC, accuracy_SVC, report_SVC = _fit_and_report(model_SVC)

# Decision tree. The model is now created and fitted before it is used for
# prediction; the original predicted first and failed with a NameError.
# random_state matches the seed of the train/test split so runs are repeatable.
model_dec = DecisionTreeClassifier(criterion='entropy', random_state=42)
y_dec, matrix_dec, accuracy_dec, report_dec = _fit_and_report(model_dec)

# Random forest. Its confusion matrix gets its own name instead of
# overwriting the decision-tree matrix.
model_ran = RandomForestClassifier(n_estimators=25, criterion='entropy', random_state=42)
y_ran, matrix_ran, accuracy_ran, report_ran = _fit_and_report(model_ran)

# Accuracy of every model, as a percentage.
alg = ['KNN', 'Naive Bayes', 'SVM', 'Decision Tree', 'Random Forest']
acc = [accuracy_KNN, accuracy_naive, accuracy_SVC, accuracy_dec, accuracy_ran]
Accuracy_Scores = pd.DataFrame({'Algorithms': alg, 'Accuracy': acc})
Accuracy_Scores['Accuracy'] = Accuracy_Scores['Accuracy'] * 100
Accuracy_Scores

# Models ordered from the most to the least accurate.
ranked_scores = Accuracy_Scores.sort_values(by='Accuracy', ascending=False)
ranked_scores

# Bar chart of the ranked accuracies with the value printed on each bar.
ax = sns.barplot(x='Algorithms', y='Accuracy',
                 palette='muted', data=ranked_scores,
                 errwidth=0)
for i in ax.containers:
    ax.bar_label(i,)

# Both names are imported here so this section runs on its own;
# LinearRegression was never imported anywhere in the original file.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


def _fit_linear_regression(frame):
    """Fit a LinearRegression that predicts ACC_TYPE from the rest of *frame*.

    ACC_TYPE and any one-hot ACC_TYPE_* columns are removed from the
    features so the target cannot leak into them. The target is converted
    to integer codes because a regression cannot be fitted on strings.

    NOTE(review): regressing on arbitrary class codes treats the classes as
    ordered and evenly spaced; a classifier is probably what is wanted here.

    Returns (X, y, X_train, X_test, y_train, y_test, model) using an 80/20
    split with random_state=42.
    """
    target_columns = [
        col for col in frame.columns
        if col == 'ACC_TYPE' or str(col).startswith('ACC_TYPE_')
    ]
    features = frame.drop(columns=target_columns)
    codes, _ = pd.factorize(frame['ACC_TYPE'], sort=True)
    target = pd.Series(codes, index=frame.index, name='ACC_TYPE')
    f_train, f_test, t_train, t_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    fitted = LinearRegression()
    fitted.fit(f_train, t_train)
    return features, target, f_train, f_test, t_train, t_test, fitted


# First attempt: the already encoded df4 frame.
X, y, X_train, X_test, y_train, y_test, model = _fit_linear_regression(df4)

df4.head()

df5 = df4.drop('ACC_TYPE', axis=1)

df5.head()

df5.describe()

# dtypes is an attribute, not a method. The original also dropped a
# placeholder column named '...' and discarded the result; that call is
# removed because the column does not exist.
df5.dtypes

# The original next rebuilt X / y from df9 but fitted on the stale df4
# split, so that attempt only repeated the fit above; it is omitted.

# Second attempt: rebuild the one-hot encoding from df2. ACC_TYPE is left
# out of the encoded columns so the target does not leak into the features.
predictor_columns = [
    'PRIMARY_REASON',
    'ACCIDENT_CAUSE_TYPE',
    "LANE_TYPE",
    'ROAD_CONDITION',
    'SECONDARY_REASON',
    'DIR',
    'ROAD_FEATURE',
    'WEATHER',
]
dummy = pd.get_dummies(df2[predictor_columns], drop_first=True)
dummy.head()

df110 = pd.concat([df2, dummy], axis=1)
df110.head()

# Drop the raw columns that were one-hot encoded, plus CHAINAGE and Date.
df110.drop(predictor_columns + ['CHAINAGE', 'Date'], axis=1, inplace=True)
df110.head()

# MONTH, SPV and Count were already removed from df right after loading, so
# these drops ignore missing labels instead of raising KeyError.
df111 = df110.drop("MONTH", axis=1, errors='ignore')

df112 = df111.drop("SPV", axis=1, errors='ignore')

df112.describe()

df113 = df112.drop('Count', axis=1, errors='ignore')
df113.describe()

X, y, X_train, X_test, y_train, y_test, model = _fit_linear_regression(df113)

# 'column_name' is a leftover placeholder; a missing label is ignored, which
# makes df114 a copy of df113 unless such a column really exists.
df114 = df113.drop('column_name', axis=1, errors='ignore')

df114.head()

X, y, X_train, X_test, y_train, y_test, model = _fit_linear_regression(df114)