copy_of_sgtl_visualizations.py
# -*- coding: utf-8 -*-
"""Copy of SGTL-Visualizations.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ncASVA_HoeJ7bPCx0hhPXabxJk5CieDj
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Step 1: Load the Dataset (mount Google Drive first so the file path is reachable)
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/SGTL-Visualizations.csv')
# Step 2: Read and Understand the Data
df.head(5) # Preview the first few rows of the data
df.shape # Check the dimensions of the dataset
print(df.columns) # Check the column names
# Step 3: Clean the Data (if necessary)
# Handle missing values
print(df.isnull().sum()) # Check the number of missing values in each column
# Handle duplicates
df = df.drop_duplicates()
# Step 4: Explore Individual Variables
# Examine the distribution of numeric variables
print(df.describe())
df.hist(figsize=(10, 8))  # DataFrame.hist creates its own figure, so no separate plt.figure is needed
# Add count labels
for ax in plt.gcf().get_axes():
    for patch in ax.patches:
        height = patch.get_height()
        x = patch.get_x() + patch.get_width() / 2
        ax.annotate(f'{int(height)}', (x, height), ha='center', va='bottom', fontsize=8)
plt.tight_layout()
plt.show()
# Step 5: Analyze Relationships between Variables
# Calculate the correlation matrix for numeric variables
corr_matrix = df.corr()
# Visualize the correlation matrix using a heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
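# Portability note (sketch only): newer pandas versions may raise if DataFrame.corr()
# sees non-numeric columns, so selecting the numeric columns first is a safer way to
# build the same correlation matrix used above. Not required on older pandas releases.
numeric_corr = df.select_dtypes(include='number').corr()
print(numeric_corr)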
df.describe(include="all")
df.info()
#finding duplicate values
df.duplicated().sum()
df['Category'].value_counts()
plt.figure(figsize=(15, 8))
ax = sns.countplot(x=df['Category'])
# Add count labels
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height}', (p.get_x() + p.get_width() / 2, height), ha='center', va='bottom')
plt.title('Distribution of Accident Severity')
plt.show()
#checking missing values
df.isna().sum()
# Dropping columns that have more than 2500 missing values, plus the Time column (kept commented out)
#df.drop(['Service_year_of_vehicle','Defect_of_vehicle','Work_of_casuality', 'Fitness_of_casuality','Time'], axis = 1, inplace = True)
#df.head()
#storing categorical column names to a new variable
categorical=[i for i in df.columns if df[i].dtype=='O']
print('The categorical variables are',categorical)
# For categorical columns, replace the null values with the mode
for i in categorical:
    df[i].fillna(df[i].mode()[0], inplace=True)
#checking the current null values
df.isna().sum()
"""Visualizations"""
# Plotting the relationship between Category and SECONDARY_REASON, coloured by ROAD_FEATURE
sns.scatterplot(x=df['Category'], y=df['SECONDARY_REASON'], hue=df['ROAD_FEATURE'])
"""Observation
There is no visible correlation between the Category and SECONDARY_REASON columns.
"""
#plotting the correlation using heatmap
sns.heatmap(df.corr())
#storing numerical column names to a variable
numerical=[i for i in df.columns if df[i].dtype!='O']
print('The numerical variables are', numerical)
plt.figure(figsize=(10, 10))
plotnumber = 1
for i in numerical:
    if plotnumber <= df.shape[1]:
        ax1 = plt.subplot(2, 2, plotnumber)
        # Plot once and keep the bin counts so they can be used as labels
        counts, bins, patches = plt.hist(df[i], color='red')
        # Add count labels
        for count, patch in zip(counts, patches):
            x = patch.get_x() + patch.get_width() / 2
            y = patch.get_height()
            ax1.annotate(f'{int(count)}', (x, y), ha='center', va='bottom', fontsize=8)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.title('Frequency of ' + i, fontsize=10)
    plotnumber += 1
plt.tight_layout()
plt.show()
"""Observation
Most accidents are occured when 2 vehicles are involved and 1 casuality is happend mostly in the accidents.
"""
plt.figure(figsize=(10, 200))
plotnumber = 1
for col in categorical:
    if plotnumber <= df.shape[1] and (col != 'CHAINAGE' and col != 'Date'):
        ax1 = plt.subplot(28, 1, plotnumber)
        sns.countplot(data=df, y=col, palette='muted')
        # Add count labels
        for p in ax1.patches:
            width = p.get_width()
            height = p.get_height()
            x, y = p.get_xy()
            count = int(width)  # Use width instead of height for y-axis count
            ax1.annotate(f'{count}', (x + width, y + height/2), ha='left', va='center')
        plt.xticks(fontsize=10)
        plt.yticks(fontsize=10)
        plt.title(col.title(), fontsize=10)
        plt.xlabel('')
        plt.ylabel('')
        # Adjust the x-axis limits to avoid cutting off labels
        ax1.set_xlim(0, df[col].value_counts().max() + 1)
        plotnumber += 1
plt.tight_layout()
plt.show()
"""Handling Categorical values"""
df.dtypes
"""Since there are so many categorical values, we need to use feature selection We need to perform label encoding before applying chi 2 analysis"""
# Importing the label encoding module
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
# Creating a new data frame for the chi2 analysis
df1 = pd.DataFrame()
# Adding all the categorical columns except the output column to the new data frame
for i in categorical:
    if i != 'ACCIDENT_CAUSE_TYPE':
        df1[i] = le.fit_transform(df[i])
#confirming the data type
df1.info()
"""Correlation Graph"""
plt.figure(figsize=(22,17))
sns.set(font_scale=1)
sns.heatmap(df1.corr(), annot=True)
#label encoded data set
df1.head()
#import chi2 test
from sklearn.feature_selection import chi2
f_p_values=chi2(df1,df['ACCIDENT_CAUSE_TYPE'])
# f_p_values holds the chi2 F-scores and p-values
f_p_values
# For readability, collect the features and their scores in a new dataframe
f_p_values1=pd.DataFrame({'features':df1.columns, 'Fscore': f_p_values[0], 'Pvalues':f_p_values[1]})
f_p_values1
# Since lower p-values indicate more relevant features, sort the features by p-value
f_p_values1.sort_values(by='Pvalues',ascending=True)
"""we need higher Fscore and lower the Pvalues, so by evaluating, we can remove Owner_of_vehicle, Type_of_vehicle, Road_surface_conditions, Pedestrian_movement,Casualty_severity,Educational_level,Day_of_week,Sex_of_driver,Road_allignment, Sex_of_casualty"""
# After evaluating, remove the less important columns and store the result in a new data frame
df2=df.drop(['DIR', 'WEATHER', 'ROAD_FEATURE'],axis=1)
df2.head()
df2.shape
df2.info()
# To check the distinct values in each categorical column, store the categorical column names in a new variable
categorical_new=[i for i in df2.columns if df2[i].dtype=='O']
print(categorical_new)
for i in categorical_new:
    print(df2[i].value_counts())
#get_dummies
dummy=pd.get_dummies(df2[['ACCIDENT_CAUSE_TYPE']])
dummy.head()
x=df2.drop(['CHAINAGE','Date','Category','LANE_TYPE','PRIMARY_REASON','ROAD_CONDITION','SECONDARY_REASON'],axis=1)
x.shape
x.head()
# Concatenate the dummy columns and the reduced data frame
df3=pd.concat([dummy,x],axis=1)
df3.head()
df4=df3.drop(['ACCIDENT_CAUSE_TYPE'],axis=1)
df4.shape
df4.head()
y=df3.iloc[:,11]
y.head()
x.head()
#checking the count of each item in the output column
y.value_counts()
y = df3.iloc[:, 11]
x = df3.iloc[:, :11]
plt.figure(figsize=(8, 6))
sns.countplot(x=y, palette='muted')
# Add count labels
for p in plt.gca().patches:
    height = p.get_height()
    plt.gca().annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2, height), ha='center', va='bottom', fontsize=8)
plt.xticks(fontsize=10, rotation=90) # Rotate x-axis labels if needed
plt.yticks(fontsize=10)
plt.title('Incident- Accident Cause Type', fontsize=12)
plt.xlabel('Category', fontsize=5)
plt.ylabel('Count', fontsize=10)
plt.tight_layout()
plt.show()
from google.colab import drive
drive.mount('/content/drive')  # no-op here, since Drive was already mounted at the top
# Converting the data into training and testing sets
from sklearn.model_selection import train_test_split
# Splitting 70% of the data into training data and 30% into testing data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)
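# Optional (sketch only): if the target classes plotted above are imbalanced, a
# stratified split keeps the class proportions similar in the training and testing
# sets. This assumes every class has at least two rows; the rest of the script
# keeps the plain 70/30 split created above.
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x, y, test_size=0.30, random_state=42, stratify=y)
print(y_train_s.value_counts(normalize=True).round(2))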
x_test
y_test
x_train
y_train
x_test.isna().sum()
# Drop rows with missing values; align the labels to the surviving rows so x and y stay matched
x1_test = x_test.dropna()
x1_train = x_train.dropna()
y1_train = y_train.loc[x1_train.index]
y1_test = y_test.loc[x1_test.index]
print(x1_train.shape, x1_test.shape, y1_train.shape, y1_test.shape)
# K-Nearest Neighbors (KNN) model
from sklearn.neighbors import KNeighborsClassifier
model_KNN=KNeighborsClassifier(n_neighbors=5)
model_KNN.fit(x1_train,y1_train)
y_KNN=model_KNN.predict(x1_test)
y_KNN
y2_test = y1_test  # already aligned with x1_test, so no manual slicing is needed
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,ConfusionMatrixDisplay
matrix_KNN=confusion_matrix(y2_test,y_KNN)
print(matrix_KNN,'\n')
ConfusionMatrixDisplay.from_predictions(y2_test, y_KNN)
plt.show()
accuracy_KNN=accuracy_score(y2_test,y_KNN)
print(accuracy_KNN,'\n')
report_KNN=classification_report(y2_test,y_KNN)
print(report_KNN)
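# n_neighbors is fixed at 5 above; a quick cross-validation sweep (sketch only,
# run on the same cleaned training split) is one way to check whether a different
# k scores better before settling on the final model.
from sklearn.model_selection import cross_val_score
for k in (3, 5, 7, 9, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x1_train, y1_train, cv=5)
    print(f'k={k}: mean CV accuracy = {scores.mean():.3f}')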
"""naive bayes"""
#Decision Tree model alg
from sklearn.tree import DecisionTreeClassifier
model_dec=DecisionTreeClassifier(criterion='entropy')
model_dec.fit(x1_train,y1_train)
y_dec=model_dec.predict(x1_test)
y_dec
y3_test = y1_test  # aligned test labels, consistent with the KNN evaluation above
matrix_dec=confusion_matrix(y3_test,y_dec)
print(matrix_dec,'\n')
ConfusionMatrixDisplay.from_predictions(y3_test, y_dec)
plt.show()
accuracy_dec=accuracy_score(y3_test,y_dec)
print(accuracy_dec,'\n')
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
model_ran=RandomForestClassifier(n_estimators=25,criterion='entropy')
model_ran.fit(x_train,y_train)
y_ran=model_ran.predict(x_test)
y_ran
matrix_dec=confusion_matrix(y_test,y_ran)
print(matrix_dec,'\n')
ConfusionMatrixDisplay.from_predictions(y_test, y_ran)
plt.show()
accuracy_ran=accuracy_score(y_test,y_ran)
print(accuracy_ran,'\n')
report_ran=classification_report(y_test,y_ran)
print(report_ran)
report_dec=classification_report(y3_test,y_dec)
print(report_dec)
alg=['KNN','Decision Tree','Random Forest']
acc=[accuracy_KNN,accuracy_dec,accuracy_ran]
Accuracy_Scores=pd.DataFrame({'Algorithms':alg, 'Accuracy': acc})
Accuracy_Scores['Accuracy']=Accuracy_Scores['Accuracy']*100
Accuracy_Scores
#sorting models based on their accuracy score
Accuracy_Scores.sort_values(by='Accuracy',ascending=False)
# Specify the figure size
plt.figure(figsize=(10, 6))
# Create the barplot
ax = sns.barplot(x='Algorithms', y='Accuracy',
palette='muted', data=Accuracy_Scores.sort_values(by='Accuracy',ascending=False),
errwidth=0)
# Add labels to the bars
for i in ax.containers:
    ax.bar_label(i)
# Display the plot
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from google.colab import drive
drive.mount('/content/drive')
df=pd.read_csv('/content/drive/MyDrive/SGTL-Accident_data1.csv')
df.head()
df.describe()
df.duplicated().sum()
df=df.dropna()
df.isna().sum()
df.corr()
sns.heatmap(df.corr())
numerical=[i for i in df.columns if df[i].dtype!='O']
print('The numerical variables are', numerical)
df2=df.drop(['SPV'],axis=1)
df2.head()
df2.shape
categorical_new=[i for i in df2.columns if df2[i].dtype=='O']
print(categorical_new)
for i in categorical_new:
    print(df2[i].value_counts())
df3 = df2.copy()  # nothing else to concatenate here; keep a working copy under the name used below
df3.head()
y=df3.iloc[:,1]
y.head()
plt.figure(figsize=(20, 8))
ax =sns.countplot(x = y, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of accident-Acc_Type')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(20, 8))
ax = sns.countplot(x='MONTH', hue='ACC_TYPE', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('No of accidents on each month and their type')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(20, 8))
ax = sns.countplot(x='PRIMARY_REASON', hue='ACC_TYPE', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of Accident- Total Count')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(50, 50))
ax = sns.countplot(x='SECONDARY_REASON', hue='ACC_TYPE', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of Accident - Secondary Reason')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(50, 50))
ax = sns.countplot(x='SECONDARY_REASON', hue='MONTH', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of Accident - Month wise')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(50, 50))
ax = sns.countplot(x='ACC_TYPE', hue='ROAD_CONDITION', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of Accident-Category')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(20, 8))
ax = sns.countplot(x='ACC_TYPE', hue='DIR', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of Accident - Direction')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(20, 8))
ax = sns.countplot(x='ACC_TYPE', hue='LANE_TYPE', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of Accident- Lane Type')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(20, 8))
ax = sns.countplot(x='ACC_TYPE', hue='ACCIDENT_CAUSE_TYPE', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of Accident')
plt.show()
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
# Extract the month and year values from the 'DATE' column
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
# Filter the data for a specific year
year = 2022
filtered_df = df[df['Year'] == year]
# Group the data by month and calculate the number of accidents
monthly_accidents = filtered_df.groupby('Month').size()
# Map month numbers to month names
month_names = pd.Series(['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'])
monthly_accidents.index = month_names[monthly_accidents.index - 1]
# Generate a bar chart to represent the data
ax = monthly_accidents.plot(kind='bar', figsize=(15, 10))
plt.title(f'Accidents per Month - {year}')
plt.xlabel('Month')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=0)
# Add data labels to the bars
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.show()
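# The year-by-year blocks below repeat the same group-and-plot steps. A small helper
# like this sketch (assuming the same 'Date' column and the month_names series defined
# above) could replace the copy-pasted cells; it is not called by the code that follows.
def plot_monthly_accidents(data, year, hue=None):
    """Bar chart of accident counts per month for one year (sketch)."""
    d = data[data['Date'].dt.year == year]
    if hue is None:
        counts = d.groupby(d['Date'].dt.month).size()
    else:
        counts = d.groupby([d['Date'].dt.month, d[hue]]).size().unstack()
    counts.index = month_names[counts.index - 1]
    ax = counts.plot(kind='bar', figsize=(15, 8))
    ax.set_title(f'Accidents per Month - {year}')
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of Accidents')
    return ax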
# Convert the 'DATE' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
# Extract the month and year values from the 'DATE' column
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
# Filter the data for a specific year
year = 2021
filtered_df = df[df['Year'] == year]
# Group the data by month and calculate the number of accidents
monthly_accidents = filtered_df.groupby('Month').size()
# Map month numbers to month names
month_names = pd.Series(['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'])
monthly_accidents.index = month_names[monthly_accidents.index - 1]
# Generate a bar chart to represent the data
ax = monthly_accidents.plot(kind='bar', figsize=(15, 10))
plt.title(f'Accidents per Month - {year}')
plt.xlabel('Month')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=0)
# Add data labels to the bars
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.show()
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
# Extract the month and year values from the 'DATE' column
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
# Filter the data for a specific year
year = 2022
filtered_df = df[df['Year'] == year]
# Group the data by month and accident type, calculate the number of accidents
monthly_accidents = filtered_df.groupby(['Month', 'ACC_TYPE']).size().unstack()
# Map month numbers to month names
month_names = pd.Series(['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'])
monthly_accidents.index = month_names[monthly_accidents.index - 1]
# Generate a stacked bar chart to represent the data
ax = monthly_accidents.plot(kind='bar', stacked=False, figsize=(20, 10))
plt.title(f'Accidents per Month - {year}')
plt.xlabel('Month')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=0)
plt.legend(title='Accident Type')
# Add data labels to the bars
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.show()
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
# Extract the month and year values from the 'DATE' column
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
# Filter the data for a specific year
year = 2021
filtered_df = df[df['Year'] == year]
# Group the data by month and accident type, calculate the number of accidents
monthly_accidents = filtered_df.groupby(['Month', 'ACC_TYPE']).size().unstack()
# Map month numbers to month names
month_names = pd.Series(['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'])
monthly_accidents.index = month_names[monthly_accidents.index - 1]
# Generate a stacked bar chart to represent the data
ax = monthly_accidents.plot(kind='bar', stacked=False, figsize=(20, 10))
plt.title(f'Accidents per Month - {year}')
plt.xlabel('Month')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=0)
plt.legend(title='Accident Type')
# Add data labels to the bars
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.show()
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
# Extract the month and year values from the 'DATE' column
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
# Filter the data for a specific year
year = 2022
filtered_df = df[df['Year'] == year]
# Group the data by month and accident type, calculate the number of accidents
monthly_accidents = filtered_df.groupby(['Month', 'SECONDARY_REASON']).size().unstack()
# Map month numbers to month names
month_names = pd.Series(['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'])
monthly_accidents.index = month_names[monthly_accidents.index - 1]
# Generate a stacked bar chart to represent the data
ax = monthly_accidents.plot(kind='bar', stacked=True, figsize=(20, 20))
plt.title(f'Accidents per Month - {year}')
plt.xlabel('Month')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=0)
plt.legend(title='Cause of accident')
# Add data labels to the bars
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(20, 15))
ax = sns.countplot(x='SECONDARY_REASON', hue='FATAL', data=df3, palette='muted')
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Cause of Accident')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
# Extract the year from the 'Date' column
df['Year'] = df['Date'].dt.year
# Filter the data for a specific year
year = 2021
filtered_df = df[df['Year'] == year]
# Group the data by ACC_TYPE and CHAINAGE, calculate the count of accidents
accidents_chainage = filtered_df.groupby(['ACC_TYPE', 'CHAINAGE']).size().unstack()
# Reduce the number of displayed CHAINAGE values
num_displayed_chainage = 10
selected_chainage = accidents_chainage.columns[:num_displayed_chainage]
# Filter the data to include only the selected CHAINAGE values
filtered_accidents_chainage = accidents_chainage[selected_chainage]
# Generate a stacked bar chart to represent the data
ax = filtered_accidents_chainage.plot(kind='bar', stacked=False, figsize=(20, 10))
plt.title(f'Accidents per CHAINAGE - {year}')
plt.xlabel('CHAINAGE')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=0)
plt.legend(title='Accident Type')
# Add data labels to the bars
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.show()
plt.figure(figsize=(10, 200))
plotnumber = 1
for col in categorical_new:
    if plotnumber <= df.shape[1] and (col != 'CHAINAGE' and col != 'Date'):
        ax1 = plt.subplot(28, 1, plotnumber)
        sns.countplot(data=df, y=col, palette='muted')
        # Add count labels
        for p in ax1.patches:
            width = p.get_width()
            height = p.get_height()
            x, y = p.get_xy()
            count = int(width)  # Use width instead of height for y-axis count
            ax1.annotate(f'{count}', (x + width, y + height/2), ha='left', va='center')
        plt.xticks(fontsize=10)
        plt.yticks(fontsize=10)
        plt.title(col.title(), fontsize=10)
        plt.xlabel('')
        plt.ylabel('')
        # Adjust the x-axis limits to avoid cutting off labels
        ax1.set_xlim(0, df[col].value_counts().max() + 1)
        plotnumber += 1
plt.tight_layout()
plt.show()
# Plotting the final class
plt.figure(figsize=(15, 8))
ax = sns.countplot(x=df['ACC_TYPE'])  # capture the axes so the labels below go on this plot
for p in ax.patches:
    ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.title('Distribution of Accident severity')
plt.show()
plt.figure(figsize=(15, 8))
ax = sns.countplot(x=df['ACC_TYPE'])
# Add count labels
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height}', (p.get_x() + p.get_width() / 2, height), ha='center', va='bottom')
plt.title('Distribution of Accident Severity')
plt.show()