CCMDs.Rmd

---
title: "Building and cleaning the Colorado Counties Marijuana Dataset"
author: "David Martinez"
date: "April 12, 2019"
output:
  html_document: default
---

```{r setup, include=FALSE}
# Set the default chunk option for the whole document: echo = TRUE includes
# each chunk's source code in the rendered output alongside its results.
knitr::opts_chunk$set(echo = TRUE)
```
# CO Counties Marijuana Dataset<br>

## Purpose

## Data Collection and Processing Methodology<br>




```{r message=FALSE, warning=FALSE}
#dependencies - if needed, these packages can be installed using the install.packages() function
library("readxl")
library("formattable")
library("tidyverse")
library("tidyr")
library("ggplot2")
```

```{r}
# Build the Colorado Counties Marijuana Dataset (CCMDs): read the sales and
# tax workbooks, merge them, drop the 2018 rows, convert the sales/tax
# columns to currency values, and write the result to "CCMDs.csv".

# Read the "aggregate" sheet of each workbook into a tibble.
sales_mj <- read_xlsx(
  "CO_County_Sales_2014_2018.xlsx",
  sheet = "aggregate", range = NULL, col_names = TRUE
)
taxes_mj <- read_xlsx(
  "CO_County_Taxes_2014_2018.xlsx",
  sheet = "aggregate", range = NULL, col_names = TRUE
)

# Merge the two tibbles. No `by` is supplied, so base merge() joins on every
# column name the two tables have in common; the result is a data frame.
CCMDs <- merge(sales_mj, taxes_mj)

# Temporary: remove 2018 values until both spreadsheets are populated.
# Year is compared as a number. Comparing against the string "2018" would
# coerce Year to character and order it by locale collation, which only
# happens to work for four-digit years.
CCMDs <- subset(CCMDs, as.numeric(as.character(Year)) < 2018)

# Columns 8 through 14 are the columns related to sales and taxes.
cash_cols <- 8:14

# Convert each sales/tax column from character to numeric, then wrap it in
# formattable's currency class (whole units, no decimals). Entries that
# cannot be parsed as numbers become NA, and as.numeric() emits a coercion
# warning for every column where that happens.
CCMDs[cash_cols] <- lapply(
  CCMDs[cash_cols],
  function(col) currency(as.numeric(col), digits = 0L)
)

# Drop intermediates so only CCMDs is left in the workspace.
rm(cash_cols, sales_mj, taxes_mj)

# Write the data frame to disk (overwriting, with missing values as "NA").
write_excel_csv(CCMDs, "CCMDs.csv", na = "NA", append = FALSE)

```


## EDA
```{r}
# Per-column summary statistics for the merged dataset.
summary(CCMDs)

# Locate the missing values once, then report them per column and in total.
# (Author's note: roughly 9.5% of the dataset was NA when this was written.)
na_flags <- is.na(CCMDs)
colSums(na_flags)
sum(na_flags)
rm(na_flags)


# Disabled: correlation plot of the sales/tax value columns.
#m <- cor(CCMDs[, c(8:14)], use = "complete.obs", method = "spearman")
#require("corrplot")
#corrplot(m, type = "upper", order = "hclust", tl.srt = 45)

```


```{r}
#retrieve file from https://storage.googleapis.com/co-publicdata/profiles-county.csv
#housing_demog <- data.frame(read_csv("https://storage.googleapis.com/co-publicdata/profiles-county.csv", col_names = TRUE, col_types = NULL))

#subset to retain only 2014-2017 data
#housing_demog <- subset(housing_demog, year >= "2014" & year <= "2017")

#subset to retain only county information - colorado state information is stripped out
#housing_demog <- subset(housing_demog, countyfips > "0")

#remove first column as it provides no value for this exercise
#housing_demog <- housing_demog[,-1]

#CCMDs <- cbind(co_mj, housing_demog)

#rm(co_mj)
#rm(housing_demog)

```

```{r}
#https://storage.googleapis.com/co-publicdata/household-county.csv
#pop_demog <- read_csv("https://storage.googleapis.com/co-publicdata/household-county.csv", col_names = TRUE, col_types = NULL)

#pop_demog <- subset(pop_demog, year >= "2014" & year <= "2017")

#subset to retain only county information - colorado state information is stripped out
#pop_demog <- subset(pop_demog, area_code > "0")
#pop_demog <- subset(pop_demog, household_type_id <= "0")
#pop_demog <- subset(pop_demog, age_group_id != "0")

#pop_demog <- spread(pop_demog, key = 'age_group_description', value= 'total_households')

#pop_demog <- pop_demog[,-c(1,4,6)]

#require(data.table)
#pop_demog <- setDT(pop_demog)[, lapply(.SD, function(x) unique(na.omit(x))), by = "area_code"]

#CCMDs <- cbind(CCMDs, pop_demog)

#require(gtools)
#CCMDs <- smartbind(CCMDs, nr_mj)

#rm(pop_demog, nr_mj)

```

```{r}

#str(CCMDs)

#CCMDs <- CCMDs[,-c(15:16,30:31)]

#str(CCMDs)

#plot(CCMDs)
```