-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_manipulation.Rmd
169 lines (129 loc) · 5.74 KB
/
data_manipulation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
---
title: "ABS data notes"
output: html_notebook
---
# Introduction
This is just a notebook to demonstrate how to load and manipulate Census datasets in R.
## Markdown notes
This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code.
Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).
# Load libraries
We are going to use the following:
* [tidyverse](https://www.tidyverse.org/) - this is a package of packages including `dplyr` which is used for data manipulation.
* [sf](http://r-spatial.github.io/sf/) - this is the spatial features package. This makes it easy to work with spatial data using tidyverse syntax.
* [janitor](https://garthtarr.github.io/meatR/janitor.html) - this package makes it easy to remove empty rows and columns as well as clean variable names.
```{r}
#### the first time you use a package you will need to install it.
# install.packages("tidyverse")
# install.packages("sf")
# install.packages("janitor")
# load the libraries
library(tidyverse)
library(sf)
library(janitor)
# if you want, you can load packages like pacman, which make it easier to load lots of packages.
```
Note: this (%>%) is a pipe and is used in the tidyverse syntax to join processes together. the shortcut is Ctrl+Shift+M
# Load datasets
```{r}
# sa1 data
sa1_shp_raw <- st_read("data_in/shp/SA1_2016_MV.shp") %>%
clean_names()
# languages spoken at home - this file has been cleaned up previously
lang_raw <- read_csv("data_in/SA1_2016_LANP.csv") %>%
clean_names()
# household composition - this one has extra rows on the top and bottom that needs to be deleted - this has been downloaded from tablebuilder
hhcd_raw <- read_csv("data_in/SA1_2016_hhcd2.csv", skip = 9) %>% # skip the top nine rows
clean_names() %>%
filter(!is.na(count)) %>% # this gets rid of the bottom rows
filter(hhcd_2_digit_level != "Not applicable") %>%
select(-x5) # sometimes an extra column gets read in when the data is in this format from Tablebuilder
# view them
sa1_shp_raw
lang_raw
hhcd_raw
```
# Manipulate data
## Languages spoken at home
```{r}
lang <- lang_raw %>%
select(-counting) %>% # drop this column
rename(sa1 = sa1_ur) %>% # rename the column
filter(!lanp_4_digit_level %in% c("English", "Not stated")) %>% #filter out these
arrange(sa1, desc(count)) %>% # arrange by these columns
mutate(sa1 = factor(sa1)) %>% # converts the sa1 varialbe to factor (a type of character field) to match the shapefile
mutate(count = as.integer(count)) # convert to integer
lang # just to have the output below
```
```{r}
# pivot the data wider
lang_wide <- lang %>%
pivot_wider(names_from = "lanp_4_digit_level", values_from = "count") %>%
arrange(sa1) %>%
replace(is.na(.), 0L) # replace the empty cells with 0 as an integer
write_csv(lang_wide, "data_out/lang_wide_2016.csv") # export to csv
lang_wide
```
```{r}
# top languages
lang_1 <- lang %>%
group_by(sa1) %>%
slice(1) %>% # takes the top language by the group
rename(lang1 = lanp_4_digit_level, lang1_count = count)
lang_2 <- lang %>%
group_by(sa1) %>%
slice(2) %>%
rename(lang2 = lanp_4_digit_level, lang2_count = count)
lang_3 <- lang %>%
group_by(sa1) %>%
slice(3) %>%
rename(lang3 = lanp_4_digit_level, lang3_count = count)
lang_top3 <- left_join(lang_1, lang_2, by = "sa1") %>% # join everything together
left_join(lang_3, by = "sa1")
write_csv(lang_top3, "data_out/lang_top3_2016.csv")
lang_top3
```
## Lone person households
```{r}
# select the columns that I want
hhcd_ <- hhcd_raw %>%
select(sa1 = sa1, hhcd_2_digit_level, count)
```
```{r}
# caluculate the lone person households
lone_pers_hh <- hhcd_raw %>%
group_by(sa1) %>% # group by sa1
select(-counting) %>% # drops this column
mutate(pct = round(count/sum(count)*100, 1)) %>% #calculate a percentage rounded to one decimal place - this is for all groups
filter(hhcd_2_digit_level == "Lone person household") %>% # we only want lone person households
filter(!is.na(pct)) %>% # get rid of the ones where there isn't a value
filter(sa1 != "Total") %>% # there might have been totals in the original input file
ungroup() %>% # need to ungroup first as sa1 is used as the grouping variable so can't be mutated in the next step
mutate(sa1 = factor(sa1)) # converts the sa1 varialbe to factor (a type of character field) to match the shapefile
write_csv(lone_pers_hh, "data_out/lone_pers_hh_sa1_2016.csv")
lone_pers_hh
```
# Spatial joins
```{r}
sa1_shp <- sa1_shp_raw %>%
select(sa1 = sa1_7digit) # just select sa1 and rename it. it will also keep the geometry column too unless we drop it
sa1_shp
```
## Join to languages spoken at home
```{r}
sa1_lang_shp <- sa1_shp %>%
left_join(lang_1, by = "sa1")
st_write(sa1_lang_shp, "data_out/shp/sa1_lang.shp", delete_layer = TRUE) # output - the delete layer just overwrites anything
sa1_lang_shp
plot(sa1_lang_shp) # this is just a very quick plot for the demo. there are plenty of ways to make better maps using other packages such tmap, mapview or cartography
```
## Join to lone person households
```{r}
# join then export - note that functions using sf start with 'st'
sa1_lone_pers_hh_shp <- sa1_shp %>%
left_join(lone_pers_hh, by = "sa1")
st_write(sa1_lone_pers_hh_shp, "data_out/shp/sa1_lone_pers_hh.shp", delete_layer = TRUE) # output - the delete layer just overwrites anything
sa1_lone_pers_hh_shp
plot(sa1_lone_pers_hh_shp) # this is just a very quick plot for the demo. there are plenty of ways to make better maps using other packages such tmap, mapview or cartography
```