-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstatistics.R
172 lines (140 loc) · 6.29 KB
/
statistics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
library(tidyverse)
library(argparse)
library(ggplot2)
library(patchwork)
library(car)
library(effects)
library(performance)
# HELPER FUNCTIONS
# Read data from a given file path
read_csv_file <- function(path) {
data <- read.csv(path)
return(data)
}
# Check for linearity
check_linearity <- function(data, x_var, y_var) {
ggplot(data, aes_string(x = x_var, y = y_var)) +
geom_point() +
geom_smooth(method = 'lm') +
geom_smooth(method = 'loess', colour='firebrick4') +
labs(y = expression(italic('L')), x = expression(paste("log"[10], "(", italic("N"), ")"))) +
theme_minimal()
}
# Check for normality
check_normality <- function(mdl) {
residuals <- resid(mdl)
empirical_density <- density(residuals)
theoretical_density <- dnorm(empirical_density$x, mean = 0, sd = sd(residuals))
ggplot() +
geom_density(data = data.frame(residuals = residuals), aes(x = residuals, y = ..density..), fill = "lightblue", alpha = 0.6) +
geom_line(data = data.frame(x = empirical_density$x, y = empirical_density$y), aes(x = x, y = y), color = "blue") +
geom_line(data = data.frame(x = empirical_density$x, y = theoretical_density), aes(x = x, y = y), color = "red", linetype = "dashed") +
labs(x = "Residuals", y = "Density") +
theme_minimal()
}
# Check for homoscedasticity
check_homoscedasticity <- function(mdl) {
residuals <- resid(mdl)
fitted_values <- predict(mdl)
ggplot(data.frame(fitted = fitted_values, residuals = residuals), aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = TRUE, color = "blue", linetype = "solid") +
labs(x = "Fitted",
y = "Residuals") +
theme_minimal()
}
# LOAD DATA
# generated under neutral change, replicator selection, and interactor selection
nc_data <- read_csv_file("path_to_neutral_change_output.csv")
rs_data <- read_csv_file("path_to_replicator_selection_output.csv")
is_data <- read_csv_file("path_to_interactor_selection_output.csv")
head(nc_data)
head(rs_data)
head(is_data)
# DATA TRANSFORMATION
# neutral change
nc_data <- mutate(nc_data,
LogPopulationSize = log10(population_size),
RewProbFactor = factor(rewiring_probability))
# replicator selection
rs_data <- mutate(rs_data,
LogPopulationSize = log10(population_size),
RewProbFactor = factor(rewiring_probability),
OrdinalPressure = factor(selection_pressure, ordered = TRUE))
contrasts(rs_data$OrdinalPressure) <- contr.treatment(10)
# interactor selection
is_data <- mutate(is_data,
LogPopulationSize = log10(population_size),
RewProbFactor = factor(rewiring_probability),
OrdinalPressure = factor(selection_pressure, ordered = TRUE),
Leaders = factor(n, ordered = TRUE))
contrasts(is_data$OrdinalPressure) <- contr.treatment(10)
contrasts(is_data$Leaders) <- contr.treatment(2)
# MODEL RUN
# neutral change
M_nc <- lm(final_A ~ LogPopulationSize + RewProbFactor, data = nc_data)
# replicator selection
M_rs <- lm(final_A ~ LogPopulationSize + RewProbFactor + OrdinalPressure, data = rs_data)
# interactor selection
M_is <- lm(final_A ~ LogPopulationSize + RewProbFactor + OrdinalPressure + Leaders, data = is_data)
# MODEL OUPUTS
summary(M_nc)
summary(M_rs)
summary(M_is)
# ASSUMPTIONS
# 1. Linearity
lin_nc <- check_linearity(nc_data, "LogPopulationSize", "final_A")
lin_rs <- check_linearity(rs_data, "LogPopulationSize", "final_A")
lin_is <- check_linearity(is_data, "LogPopulationSize", "final_A")
empty_plot <- plot_spacer()
combined_lin <- (lin_nc | lin_rs) / (lin_is | empty_plot)
print(combined_lin)
# 2. Normality
norm_nc <- check_normality(M_nc)
norm_rs <- check_normality(M_rs)
norm_is <- check_normality(M_is)
empty_plot <- plot_spacer()
combined_lin <- (norm_nc | norm_rs) / (norm_is | empty_plot)
print(combined_lin)
# 3. Homoscedasticity
hom_nc <- check_homoscedasticity(M_nc)
hom_rs <- check_homoscedasticity(M_rs)
hom_is <- check_homoscedasticity(M_is)
empty_plot <- plot_spacer()
combined_lin <- (hom_nc | hom_rs) / (hom_is | empty_plot)
print(combined_lin)
# 4. Multicollinearity
vif_nc <- vif(M_nc) # low correlation
vif_rs <- vif(M_rs) # low correlation
vif_is <- vif(M_is) # low correlation
# OUTLIERS
check_outliers(M_nc) # 2 outliers detected: cases 151, 302
check_outliers(M_rs) # No outliers detected
check_outliers(M_is) # No outliers detected
# EFFECT SIZES
plot(predictorEffects(M_nc))
plot(predictorEffects(M_rs))
plot(predictorEffects(M_is))
# EFFECT SIZE Cohen's f2
# neutral change
M_nc_rp_exluded <- lm(final_A ~ LogPopulationSize, data = nc_data)
M_nc_ps_excluded <- lm(final_A ~ RewProbFactor, data = nc_data)
nc_rewprob <- (summary(M_nc)$r.squared - summary(M_nc_rp_exluded)$r.squared) / 1 - summary(M_nc)$r.squared
nc_ps <- (summary(M_nc)$r.squared - summary(M_nc_ps_exluded)$r.squared) / 1 - summary(M_nc)$r.squared
# replicator selection
M_rs_rp_exluded <- lm(final_A ~ LogPopulationSize + OrdinalPressure, data = rs_data)
M_rs_ps_excluded <- lm(final_A ~ RewProbFactor + OrdinalPressure, data = rs_data)
M_rs_op_excluded <- lm(final_A ~ LogPopulationSize + RewProbFactor, data = rs_data)
rs_rewprob <- (summary(M_rs)$r.squared - summary(M_rs_rp_exluded)$r.squared) / 1 - summary(M_rs)$r.squared
rs_ps <- (summary(M_rs)$r.squared - summary(M_rs_ps_exluded)$r.squared) / 1 - summary(M_rs)$r.squared
rs_op <- (summary(M_rs)$r.squared - summary(M_rs_op_exluded)$r.squared) / 1 - summary(M_rs)$r.squared
# interactor selection
M_is_rp_exluded <- lm(final_A ~ LogPopulationSize + OrdinalPressure + Leaders, data = is_data)
M_is_ps_excluded <- lm(final_A ~ RewProbFactor + OrdinalPressure + Leaders, data = is_data)
M_is_op_excluded <- lm(final_A ~ LogPopulationSize + RewProbFactor + Leaders, data = is_data)
M_is_l_excluded <- lm(final_A ~ LogPopulationSize + RewProbFactor + OrdinalPressure, data = is_data)
is_rewprob <- (summary(M_is)$r.squared - summary(M_is_rp_exluded)$r.squared) / 1 - summary(M_is)$r.squared
is_ps <- (summary(M_is)$r.squared - summary(M_is_ps_exluded)$r.squared) / 1 - summary(M_is)$r.squared
is_op <- (summary(M_is)$r.squared - summary(M_is_op_exluded)$r.squared) / 1 - summary(M_is)$r.squared
is_l <- (summary(M_is)$r.squared - summary(M_is_l_exluded)$r.squared) / 1 - summary(M_is)$r.squared