From 0a159fb326e6305fe32798daca2728210c0e6d96 Mon Sep 17 00:00:00 2001
From: Jan Overgoor <janovergoor@gmail.com>
Date: Thu, 22 Jun 2017 13:49:08 -0700
Subject: [PATCH 1/2] Other is probable cause

---
 DATA-README.md                 | 1 -
 src/processing/states/CT.R     | 4 ++--
 src/util/constants_and_paths.R | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/DATA-README.md b/DATA-README.md
index 5faa982..f0755dc 100644
--- a/DATA-README.md
+++ b/DATA-README.md
@@ -248,7 +248,6 @@ http://ctrp3.ctdata.org/rawdata/
 **Data notes**:
 - Counties were mapped by running the cities in the `Intervention Location Name` field through Google's geocoder.
 - Rows appear to represent violations, not individual stops, because a small proportion of rows (1%) report the same officer making multiple stops at the same location at the same time. We grouped the data to combine these duplicates. We don't want to be overly aggressive in grouping together stops, so we only group if the other fields are the same. 
-- While there is some search type data, a high fraction of searches are marked as "Other", so we exclude Connecticut from our consent search analysis.
 - While there is some violation data, we exclude Connecticut from the speeding analysis because it has too much missing data in the violation field.
 - The Connecticut state patrol created another website ([link](http://ctrp3.ctdata.org/)), where new data will get uploaded going forward. We haven't processed this yet.
 
diff --git a/src/processing/states/CT.R b/src/processing/states/CT.R
index 82fe9f8..6b8c017 100755
--- a/src/processing/states/CT.R
+++ b/src/processing/states/CT.R
@@ -29,8 +29,8 @@ race_vals_clean <- c("White", "Black", "Asian", "Hispanic", "Other")
 outcome_keys <- c("I","M","N","U","V","W")
 outcome_vals <- c("Ticket","Summons",NA,"Arrest","Verbal Warning","Written Warning") 
 search_type_keys <- c("C","I","N","O")
-search_type_vals <- c("Consent", "Inventory", NA, "Other")
-search_type_vals_clean <- c("Consent", "Inventory", NA, "Other")
+search_type_vals <- c("Consent", "Inventory", NA, "Probable Cause")
+search_type_vals_clean <- c("Consent", "Inventory", NA, "Probable Cause")
 
 # Rename and extract columns
 print(sprintf("[%s] extracting columns", this_state))
diff --git a/src/util/constants_and_paths.R b/src/util/constants_and_paths.R
index 50c1a19..d077e75 100644
--- a/src/util/constants_and_paths.R
+++ b/src/util/constants_and_paths.R
@@ -34,7 +34,7 @@ STATES_PROCESSED_BUT_NOT_USED_IN_PAPER = c('IA','MI','MS','ND','NH','NV','OR','S
 ALL_PROCESSED_STATES = c(FINAL_STATE_LIST, STATES_PROCESSED_BUT_NOT_USED_IN_PAPER)
 # states with sufficiently high-quality data to include in specific analyses. 
 GOOD_SEARCH_CONDUCTED_DATA = c('AZ', 'CA', 'CO', 'CT', 'FL', 'IL', 'MA', 'MD', 'MO', 'MT', 'NC', 'NE', 'OH', 'RI', 'SC', 'TX', 'VT', 'WA', 'WI')
-GOOD_CONSENT_DATA = c('CO', 'FL', 'MA', 'MD', 'NC', 'TX', 'WA')
+GOOD_CONSENT_DATA = c('CO', 'CT', 'FL', 'MA', 'MD', 'NC', 'TX', 'WA')
 GOOD_ARREST_DATA = c('AZ', 'CA', 'CO', 'CT', 'FL',  'MA', 'MD', 'MT', 'NC', 'OH', 'RI', 'SC', 'VT', 'WI')
 GOOD_SPEEDING_CITATION_DATA =  c('CO', 'FL', 'IL', 'MT', 'NC', 'RI',  'TX', 'WI')
 GOOD_COUNTY_LEVEL_CONTRABAND_DATA = c('CO', 'CT', 'IL', 'NC', 'RI', 'SC', 'TX', 'WA', 'WI' )

From 7cd878a927140ae60782abe98ad263e0f6c31f97 Mon Sep 17 00:00:00 2001
From: Jan Overgoor <janovergoor@gmail.com>
Date: Thu, 22 Jun 2017 15:44:05 -0700
Subject: [PATCH 2/2] process CT dataset 2

---
 DATA-README.md             |  9 +++---
 src/processing/states/CT.R | 65 +++++++++++++++++++++++---------------
 2 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/DATA-README.md b/DATA-README.md
index f0755dc..d11939f 100644
--- a/DATA-README.md
+++ b/DATA-README.md
@@ -238,10 +238,9 @@ Below are notes on the data for each state. They are not intended to be a compre
 
 ### Connecticut
 
-**Original format**: online, downloaded from https://data.ct.gov/view/baka-5j97
-http://ctrp3.ctdata.org/rawdata/
+**Original format**: online, downloaded from http://ctrp3.ctdata.org/
 
-**Time period**: 2013-10 to 2015-03
+**Time period**: 2013-10 to 2015-09
 
 **Columns with no data**: none
 
@@ -249,9 +248,11 @@ http://ctrp3.ctdata.org/rawdata/
 - Counties were mapped by running the cities in the `Intervention Location Name` field through Google's geocoder.
 - Rows appear to represent violations, not individual stops, because a small proportion of rows (1%) report the same officer making multiple stops at the same location at the same time. We grouped the data to combine these duplicates. We don't want to be overly aggressive in grouping together stops, so we only group if the other fields are the same. 
 - While there is some violation data, we exclude Connecticut from the speeding analysis because it has too much missing data in the violation field.
-- The Connecticut state patrol created another website ([link](http://ctrp3.ctdata.org/)), where new data will get uploaded going forward. We haven't processed this yet.
+- The provided latitude/longitude coordinates sometimes fall outside the state, so use the `lat` and `lon` fields with caution.
 
 **Extra fields**:
+- `lat`
+- `lon`
 - `officer_id`
 - `stop_duration`
 
diff --git a/src/processing/states/CT.R b/src/processing/states/CT.R
index 6b8c017..94cdb39 100755
--- a/src/processing/states/CT.R
+++ b/src/processing/states/CT.R
@@ -6,19 +6,23 @@ change_path(this_state)
 
 # Read in and combine data
 print(sprintf("[%s] reading in the data", this_state))
-d <- read_csv("Connecticut_DownloadedMarch27_2016_Racial_Profiling_Prohibition_Project_Traffic_Stop_Data.csv", 
-              col_types = cols(`Intervention Time` = col_character()))
+d1 <- read_csv("connecticut-r1.csv", col_types=cols(InterventionTime=col_character(), InterventionIdentificationID=col_character()))  # time and ID columns are read as text
+d2 <- read_csv("connecticut-r2.csv", col_types=cols(InterventionTime=col_character(), InterventionIdentificationID=col_character()))
+colnames(d2)[26:27] <- c("StatutoryReasonForStop","StatutatoryCitationPostStop")  # NOTE(review): positional rename, presumably so d2's names match d1's for rbind(); spelling kept as-is -- confirm against the CSV headers
+d <- rbind(d1, d2) %>%
+       # split InterventionDateTime at the space into `date` and `time`, keeping the original column
+       separate(InterventionDateTime, c("date","time"), sep=' ', remove=FALSE)
 
 # Combine duplicates
 message(sprintf("Prior to combining duplicates, %i rows.", nrow(d)))
 d <- d %>% group_by(
-  `Intervention Date`, `Intervention Time`, `Intervention Location Name`, `Subject Age`, 
-  `Intervention Location Description Text`, `Department Name`, `Subject Ethnicity Code`, 
-  `Subject Sex Code`, `Search Authorization Code`, `Vehicle Searched Indicator`,
-  `Contraband Indicator`, `Subject Race Code`,
-  `Custodial Arrest Indicator`, `Reporting Officer Identification ID`, `Intervention Duration Code`) %>%
-  summarize(`Statute Code Description` = paste(sort(unique(`Statute Code Description`)), collapse=','), 
-            `Intervention Disposition Code` = paste(`Intervention Disposition Code`, collapse = ',')) %>%
+  date, InterventionTime, InterventionLocationName, `Department Name`, InterventionLocationDescriptionText,
+  SubjectEthnicityCode, SubjectAge, SubjectRaceCode, SubjectSexCode,
+  SearchAuthorizationCode, VehicleSearchedIndicator, ContrabandIndicator, CustodialArrestIndicator,
+  ReportingOfficerIdentificationID, InterventionDurationCode,
+  InterventionLocationLatitude, InterventionLocationLongitude) %>%
+  summarize(StatutoryReasonForStop = paste(sort(unique(StatutoryReasonForStop)), collapse=','), 
+            InterventionDispositionCode = paste(InterventionDispositionCode, collapse = ',')) %>%
   ungroup()
 message(sprintf("After combining duplicates, %i rows.", nrow(d)))
 
@@ -35,42 +39,51 @@ search_type_vals_clean <- c("Consent", "Inventory", NA, "Probable Cause")
 # Rename and extract columns
 print(sprintf("[%s] extracting columns", this_state))
 d$state                 <- this_state
-d$stop_date             <- make_date(substr(d$`Intervention Date`, 1, 10), format='%m/%d/%Y')
-d$stop_time             <- strftime(strptime(d$`Intervention Time`, "%H:%M"), format = '%H:%M')
+d$stop_date             <- make_date(d$date)
+d$stop_time             <- strftime(strptime(d$InterventionTime, "%H:%M"), format = '%H:%M')
 d$stop_time[as.character(d$stop_time) == '00:00'] = NA  # We have an overdensity of stops at midnight; this probably indicates unreliable data, setting these to NA. 
 d$id                    <- make_row_id(d)
-d$location_raw          <- tolower(d$`Intervention Location Name`)
+d$location_raw          <- tolower(d$InterventionLocationName)
 counties_clean          <- normalize_county(d)
 d$county_name           <- counties_clean$county_name
 d$county_fips           <- counties_clean$fips
-d$fine_grained_location <- d$`Intervention Location Description Text`
+d$fine_grained_location <- d$InterventionLocationDescriptionText
 d$state_patrol          <- d$`Department Name` == 'State Police'
 d$police_department     <- d$`Department Name`
-d$driver_gender         <- d$`Subject Sex Code`
-d$driver_age_raw        <- d$`Subject Age`
+d$driver_gender         <- d$SubjectSexCode
+d$driver_age_raw        <- d$SubjectAge
 d$driver_age            <- get_age(d, type='age')
-d$driver_race_tmp       <- ifelse(d$`Subject Ethnicity Code` == 'H', d$`Subject Ethnicity Code`, d$`Subject Race Code`)
+d$driver_race_tmp       <- ifelse(d$SubjectEthnicityCode == 'H', d$SubjectEthnicityCode, d$SubjectRaceCode)
 d$driver_race_raw       <- map(d$driver_race_tmp, race_keys, race_vals)
 d$driver_race           <- map(d$driver_race_tmp, race_keys, race_vals_clean)
-d$violation_raw         <- d$`Statute Code Description`
-d$violation             <- normalize_violation_multiple(d, d$`Statute Code Description`, clean=TRUE)
-d$search_conducted      <- d$`Vehicle Searched Indicator` | (d$`Search Authorization Code` != 'N')
-d$search_type_raw       <- map(d$`Search Authorization Code`, search_type_keys, search_type_vals)
-d$search_type           <- map(d$`Search Authorization Code`, search_type_keys, search_type_vals_clean)
-d$contraband_found      <- d$`Contraband Indicator`
+d$violation_raw         <- d$StatutoryReasonForStop
+d$violation             <- normalize_violation_multiple(d, d$StatutoryReasonForStop, clean=TRUE)
+d$search_conducted      <- (d$VehicleSearchedIndicator == 'True') | (d$SearchAuthorizationCode != 'N')
+d$search_type_raw       <- map(d$SearchAuthorizationCode, search_type_keys, search_type_vals)
+d$search_type           <- map(d$SearchAuthorizationCode, search_type_keys, search_type_vals_clean)
+d$contraband_found      <- d$ContrabandIndicator == 'True'
 d$contraband_found[!d$search_conducted] <- FALSE  # Keep search_conducted + contraband_found consistent: if no search is conducted, contraband cannot be found as result of search. 
-d$stop_outcome          <- multimap(d$id, d$`Intervention Disposition Code`, outcome_keys, outcome_vals, sep = ',')
 # If a stop has multiple outcomes, report most severe outcome, consistent with other states. 
-d$stop_outcome          <- ifelse(grepl('Arrest', d$stop_outcome) | d$`Custodial Arrest Indicator`, 'Arrest', 
+d$stop_outcome          <- multimap(d$id, d$InterventionDispositionCode, outcome_keys, outcome_vals, sep = ',')
+d$stop_outcome          <- ifelse(grepl('Arrest', d$stop_outcome) | d$CustodialArrestIndicator == 'True', 'Arrest', 
                            ifelse(grepl('Summons', d$stop_outcome), 'Summons', 
                            ifelse(grepl('Ticket', d$stop_outcome), 'Ticket', 
                            ifelse(grepl('Written Warning', d$stop_outcome), 'Written Warning', 
                            ifelse(grepl('Verbal Warning', d$stop_outcome), 'Verbal Warning', NA)))))
 d$is_arrested           <- d$stop_outcome == 'Arrest'
 
+# Temporary helper: rebuilds each raw coordinate string as "<sign+2 digits>.<remaining digits>" (callers pass the result to as.numeric). NOTE(review): assumes raw values look like "<dd> <rest>" -- confirm against the CSVs.
+fix_latlong <- function(ll) {
+  paste(str_extract(ll, '-?\\d\\d'),  # first match of an optional minus sign followed by two digits
+        str_replace_all(str_extract(ll, '(?<=\\s).*'), '\\.|\\s', ''), sep='.')  # everything after the first whitespace, with dots and whitespace removed
+}
+
 # Extra fields
-d$officer_id            <- d$`Reporting Officer Identification ID`
-d$stop_duration         <- map(d$`Intervention Duration Code`, 1:3, c("1-15 min", "16-30 min", "30+ min"))
+d$lat                   <- as.numeric(fix_latlong(d$InterventionLocationLatitude))  # strings that cannot be parsed as numbers become NA
+d$lon                   <- as.numeric(fix_latlong(d$InterventionLocationLongitude))
+d$lon                   <- ifelse(d$lon > 0, -1*d$lon, d$lon)  # flip any positive longitude to negative; NOTE(review): presumably because the state lies in the western hemisphere -- confirm
+d$officer_id            <- d$ReportingOfficerIdentificationID
+d$stop_duration         <- map(d$InterventionDurationCode, 1:3, c("1-15 min", "16-30 min", "30+ min"))  # presumably translates duration codes 1-3 into these labels via the project's map() helper
 
 # Close-up
-write_cleaned_state(d, extra_cols=c('officer_id', 'stop_duration'))
+write_cleaned_state(d, extra_cols=c('lat', 'lon', 'officer_id', 'stop_duration'))  # list every extra field documented for CT in DATA-README.md, including the new lat/lon