5harad · janovergoor · Jun 22, 2017 · Jun 22, 2017
diff --git a/DATA-README.md b/DATA-README.md
@@ -238,21 +238,21 @@ Below are notes on the data for each state. They are not intended to be a compre
 
 ### Connecticut
 
-**Original format**: online, downloaded from https://data.ct.gov/view/baka-5j97
-http://ctrp3.ctdata.org/rawdata/
+**Original format**: online, downloaded from http://ctrp3.ctdata.org/
 
-**Time period**: 2013-10 to 2015-03
+**Time period**: 2013-10 to 2015-09
 
 **Columns with no data**: none
 
 **Data notes**:
 - Counties were mapped by running the cities in the `Intervention Location Name` field through Google's geocoder.
 - Rows appear to represent violations, not individual stops, because a small proportion of rows (1%) report the same officer making multiple stops at the same location at the same time. We grouped the data to combine these duplicates. We don't want to be overly aggressive in grouping together stops, so we only group if the other fields are the same. 
-- While there is some search type data, a high fraction of searches are marked as "Other", so we exclude Connecticut from our consent search analysis.
 - While there is some violation data, we exclude Connecticut from the speeding analysis because it has too much missing data in the violation field.
-- The Connecticut state patrol created another website ([link](http://ctrp3.ctdata.org/)), where new data will get uploaded going forward. We haven't processed this yet.
+- The latitude/longitude values provided in the raw data sometimes fall outside of the state, so use the `lat` and `lon` fields with caution.
 
 **Extra fields**:
+- `lat`
+- `lon`
 - `officer_id`
 - `stop_duration`
 

diff --git a/src/processing/states/CT.R b/src/processing/states/CT.R
@@ -6,19 +6,23 @@ change_path(this_state)
 
 # Read in and combine data
 print(sprintf("[%s] reading in the data", this_state))
-d <- read_csv("Connecticut_DownloadedMarch27_2016_Racial_Profiling_Prohibition_Project_Traffic_Stop_Data.csv", 
-              col_types = cols(`Intervention Time` = col_character()))
+d1 <- read_csv("connecticut-r1.csv", col_types=cols(InterventionTime=col_character(), InterventionIdentificationID=col_character()))
+d2 <- read_csv("connecticut-r2.csv", col_types=cols(InterventionTime=col_character(), InterventionIdentificationID=col_character()))
+colnames(d2)[26:27] <- c("StatutoryReasonForStop","StatutatoryCitationPostStop")  # NOTE(review): positional rename, presumably so d2's headers line up with d1's for rbind -- confirm
+d <- rbind(d1, d2) %>%
+       # split InterventionDateTime on ' ' into date and time; remove=FALSE keeps the original column
+       separate(InterventionDateTime, c("date","time"), sep=' ', remove=FALSE)
 
 # Combine duplicates
 message(sprintf("Prior to combining duplicates, %i rows.", nrow(d)))
 d <- d %>% group_by(
-  `Intervention Date`, `Intervention Time`, `Intervention Location Name`, `Subject Age`, 
-  `Intervention Location Description Text`, `Department Name`, `Subject Ethnicity Code`, 
-  `Subject Sex Code`, `Search Authorization Code`, `Vehicle Searched Indicator`,
-  `Contraband Indicator`, `Subject Race Code`,
-  `Custodial Arrest Indicator`, `Reporting Officer Identification ID`, `Intervention Duration Code`) %>%
-  summarize(`Statute Code Description` = paste(sort(unique(`Statute Code Description`)), collapse=','), 
-            `Intervention Disposition Code` = paste(`Intervention Disposition Code`, collapse = ',')) %>%
+  date, InterventionTime, InterventionLocationName, `Department Name`, InterventionLocationDescriptionText,
+  SubjectEthnicityCode, SubjectAge, SubjectRaceCode, SubjectSexCode,
+  SearchAuthorizationCode, VehicleSearchedIndicator, ContrabandIndicator, CustodialArrestIndicator,
+  ReportingOfficerIdentificationID, InterventionDurationCode,
+  InterventionLocationLatitude, InterventionLocationLongitude) %>%
+  summarize(StatutoryReasonForStop = paste(sort(unique(StatutoryReasonForStop)), collapse=','), 
+            InterventionDispositionCode = paste(InterventionDispositionCode, collapse = ',')) %>%
   ungroup()
 message(sprintf("After combining duplicates, %i rows.", nrow(d)))
 
@@ -29,48 +33,57 @@ race_vals_clean <- c("White", "Black", "Asian", "Hispanic", "Other")
 outcome_keys <- c("I","M","N","U","V","W")
 outcome_vals <- c("Ticket","Summons",NA,"Arrest","Verbal Warning","Written Warning") 
 search_type_keys <- c("C","I","N","O")
-search_type_vals <- c("Consent", "Inventory", NA, "Other")
-search_type_vals_clean <- c("Consent", "Inventory", NA, "Other")
+search_type_vals <- c("Consent", "Inventory", NA, "Probable Cause")
+search_type_vals_clean <- c("Consent", "Inventory", NA, "Probable Cause")
 
 # Rename and extract columns
 print(sprintf("[%s] extracting columns", this_state))
 d$state                 <- this_state
-d$stop_date             <- make_date(substr(d$`Intervention Date`, 1, 10), format='%m/%d/%Y')
-d$stop_time             <- strftime(strptime(d$`Intervention Time`, "%H:%M"), format = '%H:%M')
+d$stop_date             <- make_date(d$date)
+d$stop_time             <- strftime(strptime(d$InterventionTime, "%H:%M"), format = '%H:%M')
 d$stop_time[as.character(d$stop_time) == '00:00'] = NA  # We have an overdensity of stops at midnight; this probably indicates unreliable data, setting these to NA. 
 d$id                    <- make_row_id(d)
-d$location_raw          <- tolower(d$`Intervention Location Name`)
+d$location_raw          <- tolower(d$InterventionLocationName)
 counties_clean          <- normalize_county(d)
 d$county_name           <- counties_clean$county_name
 d$county_fips           <- counties_clean$fips
-d$fine_grained_location <- d$`Intervention Location Description Text`
+d$fine_grained_location <- d$InterventionLocationDescriptionText
 d$state_patrol          <- d$`Department Name` == 'State Police'
 d$police_department     <- d$`Department Name`
-d$driver_gender         <- d$`Subject Sex Code`
-d$driver_age_raw        <- d$`Subject Age`
+d$driver_gender         <- d$SubjectSexCode
+d$driver_age_raw        <- d$SubjectAge
 d$driver_age            <- get_age(d, type='age')
-d$driver_race_tmp       <- ifelse(d$`Subject Ethnicity Code` == 'H', d$`Subject Ethnicity Code`, d$`Subject Race Code`)
+d$driver_race_tmp       <- ifelse(d$SubjectEthnicityCode == 'H', d$SubjectEthnicityCode, d$SubjectRaceCode)
 d$driver_race_raw       <- map(d$driver_race_tmp, race_keys, race_vals)
 d$driver_race           <- map(d$driver_race_tmp, race_keys, race_vals_clean)
-d$violation_raw         <- d$`Statute Code Description`
-d$violation             <- normalize_violation_multiple(d, d$`Statute Code Description`, clean=TRUE)
-d$search_conducted      <- d$`Vehicle Searched Indicator` | (d$`Search Authorization Code` != 'N')
-d$search_type_raw       <- map(d$`Search Authorization Code`, search_type_keys, search_type_vals)
-d$search_type           <- map(d$`Search Authorization Code`, search_type_keys, search_type_vals_clean)
-d$contraband_found      <- d$`Contraband Indicator`
+d$violation_raw         <- d$StatutoryReasonForStop
+d$violation             <- normalize_violation_multiple(d, d$StatutoryReasonForStop, clean=TRUE)
+d$search_conducted      <- (d$VehicleSearchedIndicator == 'True') | (d$SearchAuthorizationCode != 'N')
+d$search_type_raw       <- map(d$SearchAuthorizationCode, search_type_keys, search_type_vals)
+d$search_type           <- map(d$SearchAuthorizationCode, search_type_keys, search_type_vals_clean)
+d$contraband_found      <- d$ContrabandIndicator == 'True'
 d$contraband_found[!d$search_conducted] <- FALSE  # Keep search_conducted + contraband_found consistent: if no search is conducted, contraband cannot be found as result of search. 
-d$stop_outcome          <- multimap(d$id, d$`Intervention Disposition Code`, outcome_keys, outcome_vals, sep = ',')
 # If a stop has multiple outcomes, report most severe outcome, consistent with other states. 
-d$stop_outcome          <- ifelse(grepl('Arrest', d$stop_outcome) | d$`Custodial Arrest Indicator`, 'Arrest', 
+d$stop_outcome          <- multimap(d$id, d$InterventionDispositionCode, outcome_keys, outcome_vals, sep = ',')
+d$stop_outcome          <- ifelse(grepl('Arrest', d$stop_outcome) | d$CustodialArrestIndicator == 'True', 'Arrest', 
                            ifelse(grepl('Summons', d$stop_outcome), 'Summons', 
                            ifelse(grepl('Ticket', d$stop_outcome), 'Ticket', 
                            ifelse(grepl('Written Warning', d$stop_outcome), 'Written Warning', 
                            ifelse(grepl('Verbal Warning', d$stop_outcome), 'Verbal Warning', NA)))))
 d$is_arrested           <- d$stop_outcome == 'Arrest'
 
+# Temporary helper: rebuild a coordinate string as "<first two digits, optionally signed>.<text after the first whitespace, dots and whitespace stripped>"
+fix_latlong <- function(ll) {
+  frac <- str_replace_all(str_extract(ll, '(?<=\\s).*'), '\\.|\\s', '')
+  paste(str_extract(ll, '-?\\d\\d'), frac, sep='.')
+}
+
 # Extra fields
-d$officer_id            <- d$`Reporting Officer Identification ID`
-d$stop_duration         <- map(d$`Intervention Duration Code`, 1:3, c("1-15 min", "16-30 min", "30+ min"))
+d$lat                   <- as.numeric(fix_latlong(d$InterventionLocationLatitude))
+d$lon                   <- as.numeric(fix_latlong(d$InterventionLocationLongitude))
+d$lon                   <- ifelse(d$lon > 0, -1*d$lon, d$lon)
+d$officer_id            <- d$ReportingOfficerIdentificationID
+d$stop_duration         <- map(d$InterventionDurationCode, 1:3, c("1-15 min", "16-30 min", "30+ min"))
 
 # Close-up
-write_cleaned_state(d, extra_cols=c('officer_id', 'stop_duration'))
+write_cleaned_state(d, extra_cols=c('lat', 'lon', 'officer_id', 'stop_duration'))  # pass the lat/lon columns computed above along with the other extra fields

diff --git a/src/util/constants_and_paths.R b/src/util/constants_and_paths.R
@@ -34,7 +34,7 @@ STATES_PROCESSED_BUT_NOT_USED_IN_PAPER = c('IA','MI','MS','ND','NH','NV','OR','S
 ALL_PROCESSED_STATES = c(FINAL_STATE_LIST, STATES_PROCESSED_BUT_NOT_USED_IN_PAPER)
 # states with sufficiently high-quality data to include in specific analyses. 
 GOOD_SEARCH_CONDUCTED_DATA = c('AZ', 'CA', 'CO', 'CT', 'FL', 'IL', 'MA', 'MD', 'MO', 'MT', 'NC', 'NE', 'OH', 'RI', 'SC', 'TX', 'VT', 'WA', 'WI')
-GOOD_CONSENT_DATA = c('CO', 'FL', 'MA', 'MD', 'NC', 'TX', 'WA')
+GOOD_CONSENT_DATA = c('CO', 'CT', 'FL', 'MA', 'MD', 'NC', 'TX', 'WA')
 GOOD_ARREST_DATA = c('AZ', 'CA', 'CO', 'CT', 'FL',  'MA', 'MD', 'MT', 'NC', 'OH', 'RI', 'SC', 'VT', 'WI')
 GOOD_SPEEDING_CITATION_DATA =  c('CO', 'FL', 'IL', 'MT', 'NC', 'RI',  'TX', 'WI')
 GOOD_COUNTY_LEVEL_CONTRABAND_DATA = c('CO', 'CT', 'IL', 'NC', 'RI', 'SC', 'TX', 'WA', 'WI' )