forked from ghurault/IRR-eczema-images
-
Notifications
You must be signed in to change notification settings - Fork 0
/
03_estimate_performance.R
125 lines (101 loc) · 5.1 KB
/
03_estimate_performance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Notes -------------------------------------------------------------------
#
# Initialisation ----------------------------------------------------------
rm(list = ls()) # Clear workspace (better to restart the session)
set.seed(2020)
source(here::here("analysis", "00_init.R"))
df <- load_masks() %>%
filter(Skin == 1) # Remove background to make results more generalisable to other images
img <- unique(df[["filename"]])
n_rater <- 4
# Clinician and naive performance ---------------------------------------------------------------------
perf <- lapply(1:n_rater,
function(k) {
tested_rater <- paste0("rater_", k)
consensus_raters <- paste0("rater_", setdiff(1:n_rater, k))
tmp <- df %>%
mutate(Consensus = apply(df[, consensus_raters], 1, function(x) {as.numeric(mean(x) > 0.5)})) %>%
rename(Rater = all_of(tested_rater))
# Performance of clinician for each image
perf_clinician <- tmp %>%
group_by(filename) %>%
summarise(TP = sum(Consensus == 1 & Rater == 1),
TN = sum(Consensus == 0 & Rater == 0),
FP = sum(Consensus == 0 & Rater == 1),
FN = sum(Consensus == 1 & Rater == 0))
perf_clinician <- compute_metrics(perf_clinician)
# Performance when predicting all skin to be eczema for each image
perf_naive <- tmp %>%
group_by(filename) %>%
summarise(TP = sum(Consensus == 1 & Skin == 1),
TN = sum(Consensus == 0 & Skin == 0),
FP = sum(Consensus == 0 & Skin == 1),
FN = sum(Consensus == 1 & Skin == 0))
perf_naive <- compute_metrics(perf_naive)
bind_rows(perf_clinician %>% mutate(Type = "Clinician"),
perf_naive %>% mutate(Type = "Naive")) %>%
mutate(Rater = k)
}) %>%
bind_rows()
# Average performance across raters
avg_perf <- perf %>%
pivot_longer(cols = !all_of(c("Type", "filename", "Rater")), names_to = "Metric", values_to = "Value") %>%
group_by(Type, filename, Metric) %>%
summarise(Mean_Rater = mean(Value)) %>%
ungroup()
# Difference in performance for each image between clinician and naive
diff_perf <- avg_perf %>%
pivot_wider(names_from = "Type", values_from = "Mean_Rater") %>%
mutate(Difference = Clinician - Naive)
# Plot difference in performance ------------------------------------------
# Distribution of difference between clinician and naive
diff_perf %>%
filter(!(Metric %in% c("TP", "TN", "FP", "FN"))) %>%
ggplot(aes(x = Difference)) +
facet_grid(rows = vars(Metric)) +
geom_histogram(binwidth = 0.1) + # geom_density()
coord_cartesian(xlim = c(-1, 1)) +
labs(x = "Difference between clinician and naive performance") +
theme_bw(base_size = 15)
# Mean and SE of difference between clinician and naive
diff_perf %>%
group_by(Metric) %>%
summarise(Mean = mean(Difference), SD = sd(Difference), SE = SD / sqrt(n())) %>%
filter(!(Metric %in% c("TP", "TN", "FP", "FN"))) %>%
ggplot(aes(x = Metric, y = Mean, ymin = Mean - SE, ymax = Mean + SE)) +
geom_pointrange() +
coord_flip(ylim = c(-0.5, 0.5)) +
labs(y = "Average difference between\nclinician and naive performance", x = "") +
theme_bw(base_size = 15)
# Plot clinician and naive performance ------------------------------------
# Average performance across images
avg_perf %>%
group_by(Type, Metric) %>%
summarise(Mean_Image = mean(Mean_Rater), SD_Image = sd(Mean_Rater), SE_Image = SD_Image / sqrt(n())) %>%
ungroup() %>%
filter(!(Metric %in% c("TP", "TN", "FP", "FN"))) %>%
mutate(Type = factor(Type, levels = c("Naive", "Clinician"), labels = c("Naive rater", "Average rater"))) %>%
ggplot(aes(x = Type, y = Mean_Image, ymin = Mean_Image - SE_Image, ymax = Mean_Image + SE_Image)) +
facet_grid(rows = vars(Metric)) +
geom_pointrange() +
coord_flip(ylim = c(0.5, 1)) +
labs(x = "", y = "") +
theme_bw(base_size = 15) +
theme(strip.text.y = element_text(angle = 0))
# ggsave(here("results", "baseline_performance.jpg"), width = 13, height = 8, units = "cm", dpi = 300, scale = 2)
if (FALSE) {
# Colour instead of facet
avg_perf %>%
group_by(Type, Metric) %>%
summarise(Mean_Image = mean(Mean_Rater), SD_Image = sd(Mean_Rater), SE_Image = SD_Image / sqrt(n())) %>%
ungroup() %>%
filter(!(Metric %in% c("TP", "TN", "FP", "FN"))) %>%
mutate(Type = replace(Type, Type == "Naive", "All Eczema")) %>%
ggplot(aes(x = Metric, y = Mean_Image, ymin = Mean_Image - SE_Image, ymax = Mean_Image + SE_Image, colour = Type)) +
geom_pointrange(position = position_dodge(width = 0.4)) +
coord_flip(ylim = c(0.5, 1)) +
scale_colour_manual(values = c("#000000", "#E69F00")) +
labs(x = "", y = "", colour = "") +
theme_bw(base_size = 15) +
theme(legend.position = "top")
}