Skip to content

Commit c522b7b

Browse files
authored
Updating code base to reflect changes to private model repo (#107)
v1.7.0 - Data file names now mirror the script names that created the files - Features on food inspections are now calculated separately - Features on business inspections are now calculated separately - The model code merges in the features, does not calculate features - Added script to adjust the public sanitarian data to match the schema of the private sanitarian file - More aggressive filtering functions - Separates out the violation matrix calculation into the parsing step and classification step (which, as it turns out will be useful for the new inspection format) - Refactoring model result / evaluation steps to accommodate future analysis * adding prefix number to code and data, closes #100 * syncing and updating startup script, closes #101 * split violation matrix calculation into two steps, closes #102 * updated help example to remove unused variable * adding nokey function, needed for new violation matrix calculation * guard against too few categories in GenerateOtherLicenseInfo, closes 103 * updating filter functions to match model * starting work described in #104 to split feature creation * refactoring code for model compatibility * simplifying initialization
1 parent d6188d2 commit c522b7b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+711
-670
lines changed

CODE/00_Startup.R

+53-13
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,55 @@
1-
## INSTALL THESE DEPENDENCIES
2-
install.packages("devtools",
3-
dependencies = TRUE,
4-
repos='http://cran.us.r-project.org')
5-
install.packages("Rcpp",
6-
dependencies = TRUE,
7-
repos='http://cran.us.r-project.org')
8-
9-
## Update two packages not on CRAN using the devtools package.
10-
devtools::install_github(repo = 'geneorama/geneorama')
11-
devtools::install_github(repo = 'yihui/printr')
1+
##------------------------------------------------------------------------------
2+
## INSTALL DEPENDENCIES IF MISSING
3+
##------------------------------------------------------------------------------
4+
5+
if(!"devtools" %in% rownames(installed.packages())){
6+
install.packages("devtools",
7+
dependencies = TRUE,
8+
repos = "https://cloud.r-project.org/")
9+
}
10+
11+
if(!"Rcpp" %in% rownames(installed.packages())){
12+
install.packages("Rcpp",
13+
dependencies = TRUE,
14+
repos = "https://cloud.r-project.org/")
15+
}
16+
17+
if(!"RSocrata" %in% rownames(installed.packages())){
18+
install.packages("RSocrata",
19+
dependencies = TRUE,
20+
repos = "https://cloud.r-project.org/")
21+
}
22+
23+
if(!"data.table" %in% rownames(installed.packages())){
24+
install.packages("data.table",
25+
dependencies = TRUE,
26+
repos = "https://cloud.r-project.org/")
27+
}
28+
29+
if(!"geneorama" %in% rownames(installed.packages())){
30+
devtools::install_github('geneorama/geneorama')
31+
}
32+
33+
if(!"printr" %in% rownames(installed.packages())){
34+
devtools::install_github(repo = 'yihui/printr')
35+
}
36+
37+
##------------------------------------------------------------------------------
38+
## UPDATE DEPENDENCIES IF OUTDATED
39+
##------------------------------------------------------------------------------
1240

1341
## Update to RSocrata 1.7.2-2 (or later)
14-
## which is only on github as of March 8, 2016
15-
devtools::install_github(repo = 'chicago/RSocrata')
42+
if(utils::packageVersion("RSocrata") < "1.7.2-2"){
43+
install.packages("RSocrata",
44+
repos = "https://cloud.r-project.org/")
45+
}
46+
47+
## Needs recent version for foverlaps (compare as versions, not as strings)
48+
if(utils::packageVersion("data.table") < "1.10.0"){
49+
install.packages("data.table",
50+
repos = "https://cloud.r-project.org/")
51+
}
52+
53+
if(utils::packageVersion("geneorama") < "1.5.0"){
54+
devtools::install_github('geneorama/geneorama')
55+
}

CODE/11_business_download.R

+8-11
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
if(interactive()){
2-
##==========================================================================
3-
## INITIALIZE
4-
##==========================================================================
5-
## Remove all objects; perform garbage collection
6-
rm(list=ls())
7-
gc(reset=TRUE)
8-
## Detach any non-standard libraries
9-
geneorama::detach_nonstandard_packages()
10-
}
1+
##==============================================================================
2+
## INITIALIZE
3+
##==============================================================================
4+
## Remove all objects; perform garbage collection
5+
rm(list=ls())
6+
gc(reset=TRUE)
7+
118
## Load libraries & project functions
129
geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
1310
geneorama::sourceDir("CODE/functions/")
@@ -38,4 +35,4 @@ business[ , LICENSE_TERM_START_DATE := as.IDate(LICENSE_TERM_START_DATE, "%m/%d/
3835
business[ , LICENSE_TERM_EXPIRATION_DATE := as.IDate(LICENSE_TERM_EXPIRATION_DATE, "%m/%d/%Y")]
3936

4037
## SAVE RESULT
41-
saveRDS(business, "DATA/bus_license.Rds")
38+
saveRDS(business, "DATA/11_bus_license.Rds")

CODE/12_crime_download.R

+8-11
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
if(interactive()){
2-
##==========================================================================
3-
## INITIALIZE
4-
##==========================================================================
5-
## Remove all objects; perform garbage collection
6-
rm(list=ls())
7-
gc(reset=TRUE)
8-
## Detach any non-standard libraries
9-
geneorama::detach_nonstandard_packages()
10-
}
1+
##==============================================================================
2+
## INITIALIZE
3+
##==============================================================================
4+
## Remove all objects; perform garbage collection
5+
rm(list=ls())
6+
gc(reset=TRUE)
7+
118
## Load libraries & project functions
129
geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
1310
geneorama::sourceDir("CODE/functions/")
@@ -38,4 +35,4 @@ crime[ , Arrest := as.logical(Arrest)]
3835
crime[ , Domestic := as.logical(Domestic)]
3936

4037
## SAVE RESULT
41-
saveRDS(crime , "DATA/crime.Rds")
38+
saveRDS(crime , "DATA/12_crime.Rds")

CODE/13_food_inspection_download.R

+9-12
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
if(interactive()){
2-
##==========================================================================
3-
## INITIALIZE
4-
##==========================================================================
5-
## Remove all objects; perform garbage collection
6-
rm(list=ls())
7-
gc(reset=TRUE)
8-
## Detach any non-standard libraries
9-
geneorama::detach_nonstandard_packages()
10-
}
1+
##==========================================================================
2+
## INITIALIZE
3+
##==========================================================================
4+
## Remove all objects; perform garbage collection
5+
rm(list=ls())
6+
gc(reset=TRUE)
7+
118
## Load libraries & project functions
129
geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
1310
geneorama::sourceDir("CODE/functions/")
@@ -34,5 +31,5 @@ setnames(foodInspect, gsub("_+$","",colnames(foodInspect)))
3431
geneorama::convert_datatable_IntNum(foodInspect)
3532
geneorama::convert_datatable_DateIDate(foodInspect)
3633

37-
## SAVE ANSWER
38-
saveRDS(foodInspect , "DATA/food_inspections.Rds")
34+
## SAVE RESULT
35+
saveRDS(foodInspect , "DATA/13_food_inspections.Rds")

CODE/14_garbage_download.R

+9-11
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
if(interactive()){
2-
##==========================================================================
3-
## INITIALIZE
4-
##==========================================================================
5-
## Remove all objects; perform garbage collection
6-
rm(list=ls())
7-
gc(reset=TRUE)
8-
## Detach any non-standard libraries
9-
geneorama::detach_nonstandard_packages()
10-
}
1+
##==============================================================================
2+
## INITIALIZE
3+
##==============================================================================
4+
## Remove all objects; perform garbage collection
5+
rm(list=ls())
6+
gc(reset=TRUE)
7+
118
## Load libraries & project functions
129
geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
1310
geneorama::sourceDir("CODE/functions/")
@@ -34,4 +31,5 @@ geneorama::convert_datatable_IntNum(garbageCarts)
3431
geneorama::convert_datatable_DateIDate(garbageCarts)
3532

3633
## SAVE RESULT
37-
saveRDS(garbageCarts , "DATA/garbage_carts.Rds")
34+
saveRDS(garbageCarts , "DATA/14_garbage_carts.Rds")
35+

CODE/15_sanitation_download.R

+8-35
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
if(interactive()){
2-
##==========================================================================
3-
## INITIALIZE
4-
##==========================================================================
5-
## Remove all objects; perform garbage collection
6-
rm(list=ls())
7-
gc(reset=TRUE)
8-
## Detach any non-standard libraries
9-
geneorama::detach_nonstandard_packages()
10-
}
1+
##==============================================================================
2+
## INITIALIZE
3+
##==============================================================================
4+
## Remove all objects; perform garbage collection
5+
rm(list=ls())
6+
gc(reset=TRUE)
7+
118
## Load libraries & project functions
129
geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
1310
geneorama::sourceDir("CODE/functions/")
@@ -37,29 +34,5 @@ geneorama::convert_datatable_IntNum(sanitationComplaints)
3734
geneorama::convert_datatable_DateIDate(sanitationComplaints)
3835

3936
## SAVE RESULT
40-
saveRDS(sanitationComplaints , "DATA/sanitation_code.Rds")
41-
42-
# ## Quick fix to download creation date, which is needed for the heat map calc
43-
# ## The following block can be removed after issue 68 is resolved in RSocrata
44-
# ## https://github.com/Chicago/RSocrata/issues/68
45-
# crdate <- list()
46-
# i <- 0
47-
# while(length(crdate)==0 || length(crdate[[length(crdate)]]) == 50000 ){
48-
# i <- i + 1
49-
# url <- paste0("https://data.cityofchicago.org/resource/me59-5fac.csv",
50-
# "?$select=creation_date&$LIMIT=50000",
51-
# "&$OFFSET=", (i - 1) * 50000)
52-
# crdate[[i]] <- httr::content(httr::GET(url), as = "text")
53-
# crdate[[i]] <- strsplit(crdate[[i]], "\n")[[1]][-2]
54-
# print(i)
55-
# print(length(crdate[[i]]))
56-
# }
57-
# crdate <- do.call(c, crdate)
58-
# crdate <- crdate[-1]
59-
#
60-
# length(crdate) == nrow(sanitationComplaints)
61-
#
62-
# crdate <- as.IDate(crdate, "%m/%d/%Y")
63-
# sanitationComplaints$Creation_Date <- crdate
64-
37+
saveRDS(sanitationComplaints , "DATA/15_sanitation_code.Rds")
6538

CODE/21_calculate_violation_matrix.R

+16-8
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,33 @@
88
## Remove all objects; perform garbage collection
99
rm(list=ls())
1010
gc(reset=TRUE)
11-
## Detach libraries that are not used
12-
geneorama::detach_nonstandard_packages()
13-
## Load libraries that are used
11+
12+
## Load libraries & project functions
1413
geneorama::loadinstall_libraries(c("data.table", "MASS"))
15-
## Load custom functions
1614
geneorama::sourceDir("CODE/functions/")
1715

1816
##==============================================================================
1917
## LOAD CACHED RDS FILES
2018
##==============================================================================
21-
foodInspect <- readRDS("DATA/food_inspections.Rds")
19+
foodInspect <- readRDS("DATA/13_food_inspections.Rds")
20+
foodInspect <- filter_foodInspect(foodInspect)
2221

2322
##==============================================================================
2423
## CALCULATE FEATURES BASED ON FOOD INSPECTION DATA
2524
##==============================================================================
2625

2726
## Calculate violation matrix and put into data.table with inspection id as key
27+
vio_mat <- calculate_violation_matrix(foodInspect[ , Violations])
28+
29+
## Add key column to vio_mat
30+
vio_mat <- data.table(vio_mat,
31+
Inspection_ID = foodInspect[ , Inspection_ID],
32+
key = "Inspection_ID")
33+
2834
## calculate_violation_types calculates violations by categories:
2935
## Critical, serious, and minor violations
30-
violation_dat <- calculate_violation_types(foodInspect$Violations,
31-
Inspection_ID = foodInspect$Inspection_ID)
32-
saveRDS(violation_dat, "DATA/violation_dat.Rds")
36+
violation_dat <- calculate_violation_types(violation_mat =vio_mat)
37+
38+
## Save results
39+
saveRDS(vio_mat, "DATA/21_food_inspection_violation_matrix_nums.Rds")
40+
saveRDS(violation_dat, "DATA/21_food_inspection_violation_matrix.Rds")

CODE/22_calculate_heat_map_values.R

+9-12
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,23 @@
1-
21
##==============================================================================
32
## INITIALIZE
43
##==============================================================================
54
## Remove all objects; perform garbage collection
65
rm(list=ls())
76
gc(reset=TRUE)
8-
## Detach libraries that are not used
9-
geneorama::detach_nonstandard_packages()
10-
## Load libraries that are used
7+
8+
## Load libraries & project functions
119
geneorama::loadinstall_libraries(c("data.table", "MASS"))
12-
## Load custom functions
1310
geneorama::sourceDir("CODE/functions/")
1411

1512
##==============================================================================
1613
## LOAD CACHED RDS FILES
1714
##==============================================================================
1815

1916
## Import the key data sets used for prediction
20-
foodInspect <- readRDS("DATA/food_inspections.Rds")
21-
crime <- readRDS("DATA/crime.Rds")
22-
garbageCarts <- readRDS("DATA/garbage_carts.Rds")
23-
sanitationComplaints <- readRDS("DATA/sanitation_code.Rds")
17+
foodInspect <- readRDS("DATA/13_food_inspections.Rds")
18+
crime <- readRDS("DATA/12_crime.Rds")
19+
garbageCarts <- readRDS("DATA/14_garbage_carts.Rds")
20+
sanitationComplaints <- readRDS("DATA/15_sanitation_code.Rds")
2421

2522
## Apply filters by omitting rows that are not used in the model
2623
foodInspect <- filter_foodInspect(foodInspect)
@@ -58,8 +55,8 @@ sanitationComplaints_heat <-
5855
##==============================================================================
5956
## SAVE HEAT MAP VALUES
6057
##==============================================================================
61-
saveRDS(burglary_heat, "DATA/burglary_heat.Rds")
62-
saveRDS(garbageCarts_heat, "DATA/garbageCarts_heat.Rds")
63-
saveRDS(sanitationComplaints_heat, "DATA/sanitationComplaints_heat.Rds")
58+
saveRDS(burglary_heat, "DATA/22_burglary_heat.Rds")
59+
saveRDS(garbageCarts_heat, "DATA/22_garbageCarts_heat.Rds")
60+
saveRDS(sanitationComplaints_heat, "DATA/22_sanitationComplaints_heat.Rds")
6461

6562

CODE/23_food_insp_features.R

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
##==============================================================================
2+
## INITIALIZE
3+
##==============================================================================
4+
## Remove all objects; perform garbage collection
5+
rm(list=ls())
6+
gc(reset=TRUE)
7+
8+
## Load libraries & project functions
9+
geneorama::loadinstall_libraries(c("data.table", "MASS"))
10+
geneorama::sourceDir("CODE/functions/")
11+
## Import shift function
12+
shift <- geneorama::shift
13+
14+
##==============================================================================
15+
## LOAD CACHED RDS FILES
16+
##==============================================================================
17+
foodInspect <- readRDS("DATA/13_food_inspections.Rds")
18+
19+
## Apply row filter to remove invalid data
20+
foodInspect <- filter_foodInspect(foodInspect)
21+
22+
## Remove violations from food inspection, violations are captured in the
23+
## violation matrix data
24+
foodInspect$Violations <- NULL
25+
26+
## Import violation matrix which lists violations by categories:
27+
## Critical, serious, and minor violations
28+
violation_dat <- readRDS("DATA/21_food_inspection_violation_matrix.Rds")
29+
30+
##==============================================================================
31+
## CALCULATE FEATURES
32+
##==============================================================================
33+
34+
## Facility_Type_Clean: Anything that is not "restaurant" or "grocery" is "other"
35+
foodInspect[ , Facility_Type_Clean :=
36+
categorize(x = Facility_Type,
37+
primary = list(Restaurant = "restaurant",
38+
Grocery_Store = "grocery"),
39+
ignore.case = TRUE)]
40+
## Join in the violation matrix
41+
foodInspect <- merge(x = foodInspect,
42+
y = violation_dat,
43+
by = "Inspection_ID")
44+
## Create pass / fail flags
45+
foodInspect[ , pass_flag := ifelse(Results=="Pass",1, 0)]
46+
foodInspect[ , fail_flag := ifelse(Results=="Fail",1, 0)]
47+
## Set key to ensure that records are treated CHRONOLOGICALLY...
48+
setkey(foodInspect, License, Inspection_Date)
49+
## Then find previous info by "shifting" the columns (grouped by License)
50+
foodInspect[ , pastFail := shift(fail_flag, -1, 0), by = License]
51+
foodInspect[ , pastCritical := shift(criticalCount, -1, 0), by = License]
52+
foodInspect[ , pastSerious := shift(seriousCount, -1, 0), by = License]
53+
foodInspect[ , pastMinor := shift(minorCount, -1, 0), by = License]
54+
55+
## Calculate time since last inspection.
56+
## If the time is NA, this means it's the first inspection; add an indicator
57+
## variable to indicate that it's the first inspection.
58+
foodInspect[i = TRUE ,
59+
j = timeSinceLast := as.numeric(
60+
Inspection_Date - shift(Inspection_Date, -1, NA)) / 365,
61+
by = License]
62+
foodInspect[ , firstRecord := 0]
63+
foodInspect[is.na(timeSinceLast), firstRecord := 1]
64+
foodInspect[is.na(timeSinceLast), timeSinceLast := 2]
65+
foodInspect[ , timeSinceLast := pmin(timeSinceLast, 2)]
66+
67+
##==============================================================================
68+
## SAVE RDS
69+
##==============================================================================
70+
setkey(foodInspect, Inspection_ID)
71+
saveRDS(foodInspect, file.path("DATA/23_food_insp_features.Rds"))
72+
73+
74+

0 commit comments

Comments
 (0)