-
Notifications
You must be signed in to change notification settings - Fork 0
/
offense-cleaning.Rmd
295 lines (237 loc) · 11.9 KB
/
offense-cleaning.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(janitor)
library(lubridate)
```
getting the real codes and associating them with the offenses listed ... but the trouble is that the offenses listed are in a group and not one by one
```{r}
incidents_clean <- incidents %>%
select(incident_num, date, incident_month, officer, address_clean, desc_clean, starts_with("offense")) %>%
mutate_all(~str_replace(., "/", "/ ")) %>%
mutate_all(~(str_replace_all(.,'[\\.,]',''))) %>%
mutate_all(str_trim) %>%
group_by(incident_num, date, officer, address_clean) %>%
rowid_to_column()
incidents_clean
```
So, now I want a situation where I can look at the total number of offenses. The tricky part is differentiating a situation where it's one offense listed in multiple rows OR two different offenses.
I try to cheat the system by saying that if there are more than two words on the next line, leave it alone. If there are only two words, bring it to the previous row
That doesn't always work out super well. So, I've been guessing and checking the system. Sometimes, I alter the word count so a charge with just two words is seen as one charge. Other times, there are offenses with three or more words that belong with the charge above.
####My process
So I kinda start by altering the solo charges that are two words long and make them 10 words. Then the long-name charges that have more than one or two words on the next line, make that "next line" go back to the first line ... then I delete any phrase that matches that "next line"
So if whole charge were "failed to pick up" (first line) "TRASH WEEDS OR OTHER MATTER" (second line) I'd say that when the value equals TRASH WEEDS OR OTHER MATTER, have that phrase go up to the previous line, and then I'd filter it out.
THEN I eventually realized that I could just make that phrase equal to one word, and I thought that may be more efficient. So I did that, but sometimes it didn't cooperate, so I went back to the "old way"
```{r}
incidents_longer <- incidents_clean %>%
pivot_longer(cols = c(offenses_clean_p1, offenses_3, offenses_4, offenses_5, offenses_6, offenses_7, offenses_8, offenses_9), names_to = "offenses_gen") %>%
filter(value != "") %>%
#filter(value != offenses_clean) %>%
mutate(value = str_trim(value)) %>%
mutate(word_cnt = lengths(gregexpr("\\W+", value)) + 1) %>%
mutate(word_cnt = case_when(
offenses_clean == value ~ 10,
value == "PUBLIC INTOXICATION" ~ 10,
value == "IMPOUNDED VEHICLE" ~ 10,
value == "CRIMINAL TRESPASS" ~ 10,
value == "FORGERY" ~ 10,
value == "FORGERY/ GOV" ~ 10,
value == "AGGRAVATED ROBBERY" ~ 10,
value == "ROBBERY" ~ 10,
value == "DISTURBANCE" ~ 10,
value == "PUBLIC LEWDNESS" ~ 10,
value == "RECKLESS DRIVING" ~ 10,
value == "INVESTIGATION / INFORMATION" ~ 10,
value == "BURGLARY" ~ 10,
value == "EMERGENCY DETENTION" ~ 10,
value == "DEADLY CONDUCT" ~ 10,
value == "HARASSMENT" ~ 10,
value == "RECOVERED STOLEN" ~ 10,
value == "STALKING" ~ 10,
value == "RUNAWAY" ~ 10,
value == "WELFARE CHECK" ~ 10,
value == "ASSAULT" ~ 10,
value == "UNLAWFUL RESTRAINT" ~ 10,
value == "THEFT < $50" ~ 10,
value == "ILLEGAL DUMPING" ~ 10,
value == "DANGER BODILY INJ" ~ 1,
value == "EVIDENCE - ALTER/DESTROY/CONCEAL" ~ 1,
value == "OPERATOR/ DRIVERS LICENSE" ~ 1,
value == "MORE PREV CONVIC" ~ 1,
value == "VALID OPERATOR/ DRIVERS LICENSE" ~ 1,
value == "IN FEAR OF SBI" ~ 1,
value == "VEH W/ PREV CONVIC" ~ 1,
value == "TRASH WEEDS OR OTHER MATTER" ~ 1,
value == "LARGE VEHICLES BOATS AND OTHER" ~ 1,
value == "INSPECTION CERT" ~ 1,
value == "OBSCURED LICENSE PLATE" ~ 1,
value == "INJURY:FAMILY MEMBER" ~ 1,
value == "MONUMENT COMMUNITY CNTR" ~ 1,
value == "CONSUMP" ~ 1,
value == "TOUCH:FAMILY MEMBER" ~ 1,
value == "PROPERTY OF OTHERS" ~ 1,
value == "W/ INT INHALE" ~ 1,
value == "VACCINATION OF DOGS CATS AND FERRETS" ~ 1,
value == "VEH W/PREV CONVIC" ~ 1,
value == "THAN 4 MONTHS" ~ 1,
value == "UPON/ OVER STREETS/SIDEWALKS" ~ 1,
value == "REQUIRED" ~ 1,
value == "POSS FICT DL OR ID CERT" ~ 1,
value == "ARMOR BY FELON" ~ 1,
value == "REQUIREMENTS ON STRIKING UNATTENDED VEH" ~ 1,
value == "MORE PREV CONVIC" ~ 1,
value == "FIXTURE OR HIGHWAY LANDSCAPING" ~ 1,
value == "2/ MORE PREV CONV" ~ 1,
value == "EVIDENCE - ALTER/ DESTROY/CONCEAL" ~ 1,
grepl("INFORMATION", value) & grepl("INVESTIGATION", value) ~ 10,
TRUE ~ word_cnt)) %>%
mutate(value_clean = case_when(
lead(word_cnt) <= 2 & word_cnt > 2 & word_cnt != 10 ~ paste(value, lead(value)),
word_cnt <= 2 & lag(word_cnt) >2 & lag(word_cnt) != 10 ~ "",
TRUE ~ value
)) %>%
mutate(value_clean = case_when(
lead(value_clean) == "CONSUMP" ~ paste(value_clean, lead(value_clean)),
lead(value_clean) == "REQUIRED" ~ paste(value_clean, lead(value_clean)),
lead(value_clean) == "VACCINATION OF DOGS CATS AND FERRETS" ~ paste(value_clean, lead(value_clean)),
lead(value_clean) == "REQUIREMENTS ON STRIKING UNATTENDED VEH" ~ paste(value_clean, lead(value_clean)),
TRUE ~ value_clean
)) %>%
filter(value_clean != "CONSUMP" & value_clean != "REQUIRED" & value_clean != "VACCINATION OF DOGS CATS AND FERRETS" & value_clean != "REQUIREMENTS ON STRIKING UNATTENDED VEH") %>%
filter(value_clean != "") %>%
select(-word_cnt, -offenses_gen, -value, -offenses_2) %>%
mutate(date = as.Date(date)) %>%
mutate(incident_month = as.Date(incident_month))
incidents_longer
```
```{r}
clean_offenses <- incidents_longer %>%
mutate(value_clean = case_when(
grepl("OPERATING ALARM", value_clean) ~ "COV - OPERATING ALARM SYSTEM W/ O PERMIT",
value_clean == "OPRATING ALARM SYSTEM" | value_clean == "COV - OPERATINGING ALARM SYSTEM W/ O PERMIT" | value_clean == "COV - OPERATE ALARM SYSTEM W/ O PERMIT" | value_clean == "COV - OPERATION ALARM SYSTEM W/ O PERMIT" | value_clean == "COV-OPRATING ALARM SYSTEM W/ O PERMIT" ~ "COV - OPERATING ALARM SYSTEM W/ O PERMIT",
grepl("ACCIDENT", value_clean) & grepl("MOTOR VEHICLE", value_clean) ~ "ACCIDENT MOTOR VEHICLE",
grepl("CITY CODE VIOLATION", value_clean) ~ "CITY CODE VIOLATION",
value_clean == "COV - ACCUMULATION OF GARBAGE" ~ "COV - ACCUMULATION OF GARBAGE TRASH WEEDS OR OTHER MATTER",
value_clean == "TAMPER W/ GOVT RECORD DEFRAUD/ HARM" ~ "TAMPER W/ GOVT RECORD DEFRAUD/HARM",
value_clean == "COV - ANIMAL NUISANCE - ATTACK OTHER ANIMALS" ~ "ANIMAL NUISANCE",
value_clean == "COV - IMPOUNDING VEHICLES - ILLEGALLY PARKED OR ABANDONED" ~ "COV - IMPOUNDING VEHICLES -",
value_clean == "COV - SHRUBBERY/ VEGETATION UPON/ OVER STREETS/SIDEWALKS" ~ "COV - SHRUBBERY/ VEGETATION UPON/OVER STREETS/SIDEWALKS",
value_clean == "DISPLAY/ POSSESS ALTERED OPERATOR/ DRIVERS LICENSE" ~ "DISPLAY/ POSSESS ALTERED OPERATOR/DRIVERS LICENSE",
value_clean == "DISPLAY/ POSSESS FICTICIOUS OPERATOR/ DRIVERS LICENSE" ~ "DISPLAY/ POSSESS FICTICIOUS OPERATOR/DRIVERS LICENSE",
value_clean == "FRAUDULENT USE/ POSS OF IDENTIFYING" ~ "FRAUDULENT USE/ POSS OF IDENTIFYING INFO",
value_clean == "POSSESSION OF DRUG PARAPHERNALIA" | value_clean == "POSSESSION OR DELIVERY OF DRUG" | value_clean == "POSESSION OF DRUG PARAPHENALIA" ~ "POSSESSION OR DELIVERY OF DRUG PARAPHERNALIA",
value_clean == "RUNAWAY" ~ "RUNAWAY/ MISSING PERSON",
value_clean == "TAMPER/ FABRICATE PHYSICAL EVIDENCE - ALTER/ DESTROY/CONCEAL" ~ "TAMPER/ FABRICATE PHYSICAL EVIDENCE - ALTER/DESTROY/CONCEAL",
value_clean == "TRAFFIC ACCIDENT" ~ "ACCIDENT MOTOR VEHICLE",
value_clean == "VEHICLE ACCIDENT"~ "ACCIDENT MOTOR VEHICLE",
value_clean == "POSS USE ABUSABLE/ AEROSOL PAINT W/ INT INHALE" ~ "POSS USE ABUSABLE/ AEROSOL PAINT W/INT INHALE",
value_clean == "WARRANT ARREST - OUT OF" ~ "WARRANT ARREST - OUT OF JURISDICTION",
value_clean == "investigation/ information" ~ "INVESTIGATION / INFORMATION",
TRUE ~ value_clean))
# I could attempt to do something to limit two instances where investigation/information are in the thing
# mutate(value_clean_2 == case_when(
# grepl("INVESTIGATION", value_clean) & ("INFORMATION", value_clean) ~
# ))
```
## Prepping to make codes for the police
Here is the list of offenses I have in my clean dataset
```{r}
types_of_offenses <- clean_offenses %>%
mutate(year = floor_date(date, unit = "year")) %>%
#filter(year == as.Date("2021-01-01"))%>%
group_by(value_clean) %>%
tally()
types_of_offenses
```
Here is are all of the incidents joined to alamo_codes, the list of codes given by the police department
```{r}
full_incident_code_desc <- clean_offenses %>%
mutate(year = floor_date(date, unit = "year")) %>%
#filter(year == as.Date("2021-01-01")) %>%
inner_join(alamo_codes, by = c("value_clean" = "clean_offenses")) %>%
select(-rowid)
#mutate(street = substr(address_clean, 1, nchar(address_clean)-22)) %>%
#mutate(city_state_zip = str_sub(address_clean, -22, -1)) %>%
#mutate(city = substr(city_state_zip, 1, 13)) %>%
#mutate(state = substr(city_state_zip, 15, 16)) %>%
#mutate(zip = str_sub(city_state_zip, -5, -1)) %>%
#select(-city_state_zip)
full_incident_code_desc
```
```{r}
all_incident_code_desc <- clean_offenses %>%
mutate(year = floor_date(date, unit = "year")) %>%
#filter(year == as.Date("2021-01-01")) %>%
inner_join(alamo_codes, by = c("value_clean" = "clean_offenses")) %>%
select(-rowid) %>%
group_by(value_clean, priority, type, class, nibr, ucr) %>%
tally()
#mutate(street = substr(address_clean, 1, nchar(address_clean)-22)) %>%
#mutate(city_state_zip = str_sub(address_clean, -22, -1)) %>%
#mutate(city = substr(city_state_zip, 1, 13)) %>%
#mutate(state = substr(city_state_zip, 15, 16)) %>%
#mutate(zip = str_sub(city_state_zip, -5, -1)) %>%
#select(-city_state_zip)
all_incident_code_desc
```
```{r}
missing_incident_code_desc <- clean_offenses %>%
mutate(year = floor_date(date, unit = "year")) %>%
#filter(year == as.Date("2021-01-01")) %>%
anti_join(alamo_codes, by = c("value_clean" = "clean_offenses")) %>%
select(-rowid) %>%
group_by(value_clean) %>%
tally() %>%
mutate(priority = "UNKNOWN") %>%
mutate(class = "UNKNOWN") %>%
mutate(type = "UNKNOWN") %>%
mutate(nibr = "UNKNOWN") %>%
mutate(ucr = "UNKNOWN")
#mutate(street = substr(address_clean, 1, nchar(address_clean)-22)) %>%
#mutate(city_state_zip = str_sub(address_clean, -22, -1)) %>%
#mutate(city = substr(city_state_zip, 1, 13)) %>%
#mutate(state = substr(city_state_zip, 15, 16)) %>%
#mutate(zip = str_sub(city_state_zip, -5, -1)) %>%
#select(-city_state_zip)
missing_incident_code_desc
```
```{r}
missing_incident_code_desc_FULL <- clean_offenses %>%
mutate(year = floor_date(date, unit = "year")) %>%
#filter(year == as.Date("2021-01-01")) %>%
anti_join(alamo_codes, by = c("value_clean" = "clean_offenses"))
#filter(grepl("DISPLAY FICTITIOUS / ALTERED", value_clean))
missing_incident_code_desc_FULL
```
## um so i kinda messed up with some documents so this is me reworking some shit so i can get categories approved by the chief
```{r}
#police_proof <- rbind(all_incident_code_desc, missing_incident_code_desc) %>%
# ungroup() %>%
# select(-priority, -type, -ucr, -nibr, -class)
```
```{r}
codes_b4_messup <- read_excel("Data/Exports/Alamo_codes_2011_thru_2021_LS_labeled_categories.xls")
```
```{r}
codes_after_messup <- read_excel("Data/Exports/FINAL Offcial_SAEN_AHPD_codes_PD REVIEW.xls") %>%
select(-priority, -ls_class)
```
```{r}
codes_omg_what_did_i_do <- read_excel("Data/Exports/Offcial_SAEN_AHPD_codes.xlsx") %>%
select(value_clean, "property crime")
```
```{r}
police_proof <- codes_after_messup %>%
inner_join(codes_omg_what_did_i_do, by = "value_clean")
police_proof
```
```{r}
#police_proof <- police_proof %>%
# left_join(codes_b4_messup, by = c("value_clean"))
```
```{r}
#write_csv(police_proof, "Data/Exports/Offcial_SAEN_AHPD_codes.csv")
```