-
Notifications
You must be signed in to change notification settings - Fork 1
/
09-survey.R
152 lines (127 loc) · 5.97 KB
/
09-survey.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Install ggplot and lubridate (this is only required once)
install.packages('ggplot2', dependencies = TRUE)
install.packages('lubridate', dependencies = TRUE)
install.packages('data.table', dependencies = TRUE)
# Load ggplot into current session
library(ggplot2)
# Install lubridate
library(lubridate)
# Load data.table
library(data.table)
# Read in MTA SF Survey Results
raw_survey_2017 <- read.csv(file="./TDS_202017_20Data-WEBPAGE.csv", head=TRUE,sep=",")
# Create a working dataframe
survey_2017 <- raw_survey_2017
View(survey_2017)
# Select the columns for Rideshare useage, age, and income
rideshare <- survey_2017[c("Q21A","Q27","Q29")]
# Keep only the resposents with missing answers
rideshare <- rideshare[complete.cases(rideshare), ]
# Rename columns to meaningful names.
colnames(rideshare)[1] = "USAGE"
colnames(rideshare)[2] = "AGE"
colnames(rideshare)[3] = "INCOME"
View(rideshare)
# Separate the rows into age groups
rideshare$AGE_RANGE <- cut(rideshare$AGE,
breaks = 7,
labels = c("18-24 yrs","25-34 yrs", "35-44 yrs", "45-54 yrs", "55-64 yrs", "65+ yrs", "NA"),
right = FALSE)
# Separate the rows into income groups
rideshare$INCOME_RANGE <- cut(rideshare$INCOME,
breaks = 8,
labels = c("$15,000 or less","$15001-$25,000", "$25001-$35,000", "$35001-$75,000",
"$75001-$100,000", "$100,001-$200,000", "Over $200,000", "NA"),
right = FALSE)
# Reformat the usage score coding to go from increasing ridership
rideshare$USAGE_SCORE <- 0
rideshare$USAGE_SCORE <- ((rideshare$USAGE == 2) * 5) + rideshare$USAGE_SCORE
rideshare$USAGE_SCORE <- ((rideshare$USAGE == 3) * 4) + rideshare$USAGE_SCORE
rideshare$USAGE_SCORE <- ((rideshare$USAGE == 4) * 3) + rideshare$USAGE_SCORE
rideshare$USAGE_SCORE <- ((rideshare$USAGE == 5) * 2) + rideshare$USAGE_SCORE
rideshare$USAGE_SCORE <- ((rideshare$USAGE == 6) * 1) + rideshare$USAGE_SCORE
# Find average usage score by age range
rideshare_mean_byage <- aggregate(USAGE_SCORE~AGE,mean, data=rideshare)
rideshare_mean_byage$USAGE_SCORE <- round(rideshare_mean_byage$USAGE_SCORE, digits = 2)
rideshare_mean_byage$AGE <- cut(rideshare_mean_byage$AGE,
breaks = 7,
labels = c("18-24 yrs","25-34 yrs", "35-44 yrs", "45-54 yrs", "55-64 yrs", "65+ yrs", "NA"),
right = FALSE)
# Find average usage score by income range
rideshare_mean_byincome <- aggregate(USAGE_SCORE~INCOME,mean, data=rideshare)
rideshare_mean_byincome$USAGE_SCORE <- round(rideshare_mean_byincome$USAGE_SCORE, digits = 2)
rideshare_mean_byincome$INCOME <- cut(rideshare_mean_byincome$INCOME,
breaks = 8,
labels = c("$15,000 or less","$15001-$25,000", "$25001-$35,000", "$35001-$75,000",
"$75001-$100,000", "$100,001-$200,000", "Over $200,000", "NA"),
right = FALSE)
# Plot usage score by age
usagebyage <- ggplot(rideshare_mean_byage, aes(x=AGE, y=USAGE_SCORE, label=USAGE_SCORE)) +
geom_point(stat='identity', fill='black', size=10) +
geom_segment(aes(y = 0,
x = AGE,
yend = USAGE_SCORE,
xend = AGE),
color = "black") +
geom_text(color="white", size=3) +
labs(title="Rideshare Usage by Age",
subtitle="SFMTA Transit Decision Survey") +
ylim(0, 6) +
coord_flip()
plot(usagebyage)
# Plot usage score by income
usagebyincome <- ggplot(rideshare_mean_byincome, aes(x=INCOME, y=USAGE_SCORE, label=USAGE_SCORE)) +
geom_point(stat='identity', fill='black', size=10) +
geom_segment(aes(y = 0,
x = INCOME,
yend = USAGE_SCORE,
xend = INCOME),
color = "black") +
geom_text(color="white", size=3) +
labs(title="Rideshare Usage by Income",
subtitle="SFMTA Transit Decision Survey") +
ylim(0, 6) +
coord_flip()
plot(usagebyincome)
# Find the mean by age and income
rideshare_mean <- aggregate(USAGE2~AGE+INCOME,mean, data=rideshare)
#fixed bins
bins <- ggplot(rideshare, aes(age)) + scale_fill_brewer(palette = "Spectral")
bins + geom_bar(aes(fill=USAGE), width = 0.5) +
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
labs(title="Histogram on Categorical Variable",
subtitle="Manufacturer across Vehicle Classes")
# Scatterplot
gg <- ggplot(rideshare_mean, aes(AGE2, INCOME2))
gg + geom_point(aes(size=USAGE2)) +
labs(subtitle="AGE vs INCOME",
y="Income",
x="Age",
title="Usage of Ridesharing by Age and Income",
caption = "Source: SFfghfgh")
plot(gg)
data("midwest")
midwest
usageplot <- ggplot(rideshare, aes(AGE))
usageplot + geom_density(aes(fill=factor(USAGE)), alpha=0.8) +
labs(title="Density plot",
subtitle="City Mileage Grouped by Number of cylinders",
caption="Source: mpg",
x="AGE",
fill="USAGE")
chart
rideshare$AGE2 <- cut(rideshare$AGE,
breaks = c(1, 2, 3, 4, 5, 6, 7, Inf),
labels = c("18-24 yrs","25-34 yrs", "35-44 yrs", "45-54 yrs", "55-64 yrs", "65+ yrs", "NA"),
right = FALSE)
# Find the mean by income
rideshare_mean$INCOME_RANGE <- cut(rideshare_mean$INCOME,
breaks = 8,
labels = c("$15,000 or less","$15001-$25,000", "$25001-$35,000", "$35001-$75,000",
"$75001-$100,000", "$100,001-$200,000", "Over $200,000", "NA"),
right = FALSE)
# Find the mean by age
rideshare_mean$AGE_RANGE <- cut(rideshare_mean$AGE,
breaks = 7,
labels = c("18-24 yrs","25-34 yrs", "35-44 yrs", "45-54 yrs", "55-64 yrs", "65+ yrs", "NA"),
right = FALSE)