-
Notifications
You must be signed in to change notification settings - Fork 0
/
CCMDs.Rmd
136 lines (89 loc) · 3.65 KB
/
CCMDs.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
---
title: "Building and cleaning the Colorado Counties Marijuana Dataset"
author: "David Martinez"
date: "April 12, 2019"
output:
html_document: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# CO Counties Marijuana Dataset<br>
## Purpose
## Data Collection and Processing Methodology<br>
```{r message=FALSE, warning=FALSE}
#dependencies - if needed, these packages can be installed using the install.packages() function
library("readxl")
library("formattable")
library("tidyverse")
library("tidyr")
library("ggplot2")
```
```{r}
#gather sales and tax data into tibbles
sales_mj <- read_xlsx("CO_County_Sales_2014_2018.xlsx", sheet = "aggregate", range = NULL, col_names = TRUE)
taxes_mj <- read_xlsx("CO_County_Taxes_2014_2018.xlsx", sheet = "aggregate", range = NULL, col_names = TRUE)
#merge the two tibbles - {base} merge returns a dataframe
CCMDs <- merge(sales_mj, taxes_mj)
#temp - remove 2018 values until both spreadsheets are populated
CCMDs <- subset(CCMDs, Year < "2018")
#create a list of column names for columns 8 through 14 - these are the columns related to sales and taxes
cashcol <- colnames(CCMDs[8:14])
#loop to convert sales/tax columns to numeric/currency - this will introduce NAs for each of the 7 columns (resulting in a warning for each column)
for (i in cashcol) {
CCMDs[[i]] <- as.numeric(CCMDs[[i]]) #character to numeric
CCMDs[[i]] <- currency(CCMDs[[i]], digits = 0L) #numeric but with currency symbology
}
#clean up unneeded files
rm(cashcol, i)
rm(sales_mj, taxes_mj)
#write dataframe to disk
write_excel_csv(CCMDs, "CCMDs.csv", na = "NA", append = FALSE)
```
##EDA
```{r}
summary(CCMDs)
#count the NAs - (~9.5% of the dataset are NA's)
colSums(is.na(CCMDs))
sum(is.na(CCMDs))
#corrplot the value features
#m <- cor(CCMDs[, c(8:14)], use = "complete.obs", method = "spearman")
#require("corrplot")
#corrplot(m, type = "upper", order = "hclust", tl.srt = 45)
```
```{r}
#retrieve file from https://storage.googleapis.com/co-publicdata/profiles-county.csv
#housing_demog <- data.frame(read_csv("https://storage.googleapis.com/co-publicdata/profiles-county.csv", col_names = TRUE, col_types = NULL))
#subset to retain only 2014-2017 data
#housing_demog <- subset(housing_demog, year >= "2014" & year <= "2017")
#subset to retain only county information - colorado state information is stripped out
#housing_demog <- subset(housing_demog, countyfips > "0")
#remove first column as it provides no value for this exercise
#housing_demog <- housing_demog[,-1]
#CCMDs <- cbind(co_mj, housing_demog)
#rm(co_mj)
#rm(housing_demog)
```
```{r}
#https://storage.googleapis.com/co-publicdata/household-county.csv
#pop_demog <- read_csv("https://storage.googleapis.com/co-publicdata/household-county.csv", col_names = TRUE, col_types = NULL)
#pop_demog <- subset(pop_demog, year >= "2014" & year <= "2017")
#subset to retain only county information - colorado state information is stripped out
#pop_demog <- subset(pop_demog, area_code > "0")
#pop_demog <- subset(pop_demog, household_type_id <= "0")
#pop_demog <- subset(pop_demog, age_group_id != "0")
#pop_demog <- spread(pop_demog, key = 'age_group_description', value= 'total_households')
#pop_demog <- pop_demog[,-c(1,4,6)]
#require(data.table)
#pop_demog <- setDT(pop_demog)[, lapply(.SD, function(x) unique(na.omit(x))), by = "area_code"]
#CCMDs <- cbind(CCMDs, pop_demog)
#require(gtools)
#CCMDs <- smartbind(CCMDs, nr_mj)
#rm(pop_demog, nr_mj)
```
```{r}
#str(CCMDs)
#CCMDs <- CCMDs[,-c(15:16,30:31)]
#str(CCMDs)
#plot(CCMDs)
```