Skip to content

Commit a1d2d98

Browse files
committed
adding binomial solutions
1 parent 9b92267 commit a1d2d98

8 files changed

+801
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
data {
2+
int n; // sample size
3+
int p; // number of coefficients
4+
matrix[n, p] X;
5+
int y[n];
6+
}
7+
8+
parameters {
9+
vector[p] beta;
10+
}
11+
12+
model {
13+
beta ~ normal(0, 1);
14+
y ~ bernoulli_logit(X * beta);
15+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
data {
2+
int n; // sample size
3+
int p; // number of coefficients
4+
matrix[n, p] X;
5+
int y[n];
6+
int n_test;
7+
int y_test[n_test];
8+
matrix[n_test, p] X_test;
9+
}
10+
11+
parameters {
12+
vector[p] beta;
13+
}
14+
15+
model {
16+
beta ~ normal(0, 1);
17+
y ~ bernoulli_logit(X * beta);
18+
}
19+
20+
21+
generated quantities {
22+
// I wrote this section for you as a hint
23+
real loglik_test;
24+
vector[n_test] logit_p_test;
25+
26+
logit_p_test <- X_test * beta;
27+
loglik_test <- bernoulli_logit_log(y_test, logit_p_test);
28+
//returns the sum of the log likelihoods (the joint log-likelihood)
29+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
data {
2+
// integer inputs
3+
int n; // the number of samples
4+
int n_pred; // the number of predictors
5+
int n_cohort; // the number of cohorts
6+
int n_test; // number of individuals in the test data
7+
8+
9+
// integer vector inputs
10+
int<lower=0, upper=1> y[n]; // observed malignancies
11+
int<lower=0, upper=1> y_test[n_test]; // observed for test data
12+
13+
// design matrix
14+
matrix[n,n_pred + n_cohort] X;
15+
matrix[n_test, n_pred + n_cohort] X_test;
16+
}
17+
18+
parameters {
19+
20+
// vector intercept, betas for predictors and cohort means
21+
vector [n_pred + n_cohort] beta; //
22+
}
23+
24+
model {
25+
26+
27+
// define priors for continuous predictors
28+
beta[1:n_pred] ~ cauchy(0, 3);
29+
30+
// define priors for cohort effects
31+
beta[n_pred + 1:11] ~ cauchy(0,5);
32+
beta[12] ~ normal(0,5); #shrinks estimate for cohort 4 towards 50%
33+
beta[13:n_pred+n_cohort] ~ cauchy(0,5);
34+
35+
// define the likelihood
36+
y ~ bernoulli_logit(X*beta);
37+
38+
}
39+
40+
generated quantities {
41+
real loglik_test;
42+
vector[n_test] logit_p_test;
43+
44+
logit_p_test <- X_test * beta;
45+
loglik_test <- bernoulli_logit_log(y_test, logit_p_test);
46+
//returns the sum of the log likelihoods (the joint log-likelihood)
47+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
data {
2+
// integer inputs
3+
int n; // the number of samples
4+
int n_pred; // the number of predictors
5+
int n_cohort; // the number of cohorts
6+
7+
// integer vector inputs
8+
int<lower=0, upper=1> y[n]; // observed malignancies
9+
10+
// design matrix
11+
matrix[n,n_pred + n_cohort] X;
12+
}
13+
14+
parameters {
15+
16+
// vector intercpet, betas for predictors and cohort means
17+
vector [n_pred + n_cohort] beta; //
18+
}
19+
20+
model {
21+
22+
// define priors for continuous predictors
23+
beta[1:n_pred] ~ cauchy(0, 3);
24+
25+
// define priors for cohort effects
26+
beta[n_pred + 1:11] ~ cauchy(0,5);
27+
beta[12] ~ normal(0,5); #shrinks estimate for cohort 4 towards 50%
28+
beta[13:n_pred+n_cohort] ~ cauchy(0,5);
29+
30+
// define the likelihood
31+
y ~ bernoulli_logit(X*beta);
32+
33+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
---
2+
title: 'Week 5 assignment: Binomial models'
3+
author: "Example solutions"
4+
date: "`r format(Sys.time(), '%d %B, %Y')`"
5+
output: pdf_document
6+
---
7+
8+
About one out of eight women in the U.S. will develop breast cancer at some point in her lifetime.
9+
Early diagnoses help with treatment of this potentially fatal disease, and these diagnoses can be made based on a variety of cytological metrics evaluated via biopsy.
10+
Your job today is to develop a model that classifies tumors as malignant or benign based on these metrics.
11+
The student(s) with the most predictive model will get a prize.
12+
13+
The data are in the `breast_cancer.csv` file.
14+
Details for this dataset can be found [on the UCI machine learning data repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)), which is useful if you ever need data to play with.
15+
I split the data into two groups at random: the *training* data, which you'll use to estimate parameters, and the *test* data, which we'll use to evaluate the predictive power of the model.
16+
There is a column in the data called `group`, which indicates whether an observation is part of the training or test set.
17+
18+
## Data exploration
19+
20+
As usual, you will want to explore the data before constructing any statistcal models.
21+
Only explore the training data, and do not use the test data for data exploration/visualization.
22+
We will pretend that we don't have access to the test data yet.
23+
24+
```{r, message = FALSE, fig.width = 12, fig.height = 7}
25+
d <- read.csv('data/breast_cancer.csv')
26+
library(dplyr)
27+
str(d)
28+
# center the continuous explanatory variables
29+
d[, 2:10] <- apply(d[, 2:10], 2, FUN = function(x) unlist(scale(x)))
30+
# create a subsetted data frame with just the training data
31+
d_train <- subset(d, group == 'train')
32+
d_test <- subset(d, group == 'test')
33+
par(mfrow=c(2, 5))
34+
for (i in 2:10){
35+
plot(x = jitter(d_train[, i]),
36+
y = jitter(d_train[, 'malignant']),
37+
xlab = names(d_train)[i], ylab = 'malignant')
38+
}
39+
plot(malignant ~ cohort, data = d_train)
40+
```
41+
42+
43+
## Model structure
44+
45+
What is your model? Write it out in \LaTeX. Hint: you will want to use a design matrix.
46+
47+
*LaTeX here*
48+
49+
What is your Stan model statement?
50+
51+
```
52+
data {
53+
int n; // sample size
54+
int p; // number of coefficients
55+
matrix[n, p] X;
56+
int y[n];
57+
}
58+
59+
parameters {
60+
vector[p] beta;
61+
}
62+
63+
model {
64+
beta ~ normal(0, 1);
65+
y ~ bernoulli_logit(X * beta);
66+
}
67+
68+
```
69+
70+
## Building and understanding the design matrix
71+
72+
We mentioned that you would want to use a design matrix.
73+
Specifically, your model should be of the form:
74+
75+
$y \sim Bernoulli(p)$
76+
77+
And the probability of malignancy $p$ is modeled using a logit-link:
78+
79+
$log \Big(\dfrac{p}{1 - p} \Big) = X \beta$
80+
81+
The design matrix $X$ contains the tumor features, and also dictates the interpretation of the coefficients $\beta$.
82+
In the code block below, construct your design matrix, creating an object called `X`.
83+
The included code will make an image plot of your design matrix with a horrendous color scheme.
84+
Once you fill in your code, set the argument `eval = TRUE` inside of the curly braces at the beginning of the code chuck (this is a chunk option), otherwise the code chunk will not be evaluated when you're knitting your pdf.
85+
86+
```{r}
87+
# define your design matrix below
88+
X <- model.matrix(~ 0 + clump_thickness +
89+
size_uniformity +
90+
shape_uniformity +
91+
marginal_adhesion +
92+
epithelial_size +
93+
bare_nuclei +
94+
bland_chromatin +
95+
normal_nucleoli +
96+
mitoses +
97+
cohort,
98+
data = d_train)
99+
100+
# the code below will plot your design matrix
101+
library(reshape2)
102+
library(ggplot2)
103+
mX <- melt(X)
104+
ggplot(mX, aes(x = Var2, y = Var1)) +
105+
geom_raster(aes(fill = value)) +
106+
scale_y_reverse() +
107+
xlab('Design matrix column') +
108+
ylab('Design matrix row') +
109+
scale_fill_gradientn(colors = rainbow(20)) +
110+
theme_minimal() +
111+
theme(axis.text.x = element_text(angle=90))
112+
```
113+
114+
For each column of $X$ you will get a coefficient, one element in $\beta$.
115+
For instance, the coefficient $\beta_1$ will be associated with the first column in $X$, which we might denote $X[, 1]$, to borrow some R syntax.
116+
There's no sense in estimating parameters if you don't know what they mean (Abraham Lincoln said that), so below, list each element in $\beta$ and briefly describe what it represents/how you would interpret it:
117+
118+
1. $\beta_1$ represents the increase in the logit probability of malignance for an increase of one standard deviation in clump thickness
119+
120+
2. $\beta_2$ represents the increase in the logit probability of malignance for an increase of one standard deviation in size uniformity
121+
122+
3. $\beta_3$ represents the increase in the logit probability of malignance for an increase of one standard deviation in shape uniformity
123+
124+
4. $\beta_4$ represents the increase in the logit probability of malignance for an increase of one standard deviation in marginal adhesion
125+
126+
5. $\beta_5$ represents the increase in the logit probability of malignance for an increase of one standard deviation in epithelial size
127+
128+
6. $\beta_6$ represents the increase in the logit probability of malignance for an increase of one standard deviation in bare nuclei
129+
130+
7. $\beta_7$ represents the increase in the logit probability of malignance for an increase of one standard deviation in bland chromatin
131+
132+
8. $\beta_8$ represents the increase in the logit probability of malignance for an increase of one standard deviation in normal nucleoli
133+
134+
9. $\beta_9$ represents the increase in the logit probability of malignance for an increase of one standard deviation in mitoses
135+
136+
The remaining columns (10 through 17) are group-level intercepts, whose coefficients will represent the logit probability of malignance for an average tumor.
137+
138+
## Parameter estimation
139+
140+
Use the **training** data to estimate your model's parameters (`group == 'test'`).
141+
Do not use the **test** data yet.
142+
Make sure that the MCMC algorithm has converged before moving forward.
143+
144+
```{r, message = FALSE}
145+
library(rstan)
146+
rstan_options(auto_write = TRUE)
147+
stan_d <- list(n = nrow(X),
148+
p = ncol(X),
149+
X = X,
150+
y = d_train$malignant)
151+
m_fit <- stan('bernoulli_glm.stan',
152+
data = stan_d, cores = 2)
153+
m_fit
154+
traceplot(m_fit, inc_warmup = TRUE, 'beta')
155+
```
156+
157+
158+
## Out of sample predictive power
159+
160+
One measure of a model's ability to predict new data is the log likelihood of new data, given the parameters of the model $[\tilde{y} \mid \theta]$, where $\tilde{y}$ is the new data (the **test** or **validation** data), and the parameters $\theta$ have been estimated from other data (e.g., the **training** data).
161+
162+
Hints:
163+
164+
- this is done most easily via a new design matrix $X_{test}$, which can be multiplied by the vector of model parameters, and must be declared in the `data` block
165+
- make sure that if you used any feature scaling or centering in the training data, that the exact same scaling/centering schemes are applied to the test set
166+
- you'll use the `generated quantities` block to calculate the log-likelihood of the test data
167+
- you can obtain the joint log likelihood with the `bernoulli_logit_log` function in Stan, and I wrote a generated quantities model block for you below, which should be the last block in your new Stan model statement
168+
169+
What is your updated Stan model?
170+
171+
```
172+
data {
173+
int n; // sample size
174+
int p; // number of coefficients
175+
matrix[n, p] X;
176+
int y[n];
177+
int n_test;
178+
int y_test[n_test];
179+
matrix[n_test, p] X_test;
180+
}
181+
182+
parameters {
183+
vector[p] beta;
184+
}
185+
186+
model {
187+
beta ~ normal(0, 1);
188+
y ~ bernoulli_logit(X * beta);
189+
}
190+
191+
192+
generated quantities {
193+
// I wrote this section for you as a hint
194+
real loglik_test;
195+
vector[n_test] logit_p_test;
196+
197+
logit_p_test <- X_test * beta;
198+
loglik_test <- bernoulli_logit_log(y_test, logit_p_test);
199+
//returns the sum of the log likelihoods (the joint log-likelihood)
200+
}
201+
202+
```
203+
204+
Acquire the posterior distribution of the model parameters and the holdout log likelihood.
205+
206+
```{r}
207+
X_test <- model.matrix(~ 0 + clump_thickness +
208+
size_uniformity +
209+
shape_uniformity +
210+
marginal_adhesion +
211+
epithelial_size +
212+
bare_nuclei +
213+
bland_chromatin +
214+
normal_nucleoli +
215+
mitoses +
216+
cohort,
217+
data = d_test)
218+
219+
stan_d <- list(n = nrow(X),
220+
p = ncol(X),
221+
X = X,
222+
y = d_train$malignant,
223+
n_test = nrow(X_test),
224+
y_test = d_test$malignant,
225+
X_test = X_test)
226+
m_fit <- stan('bernoulli_glm_test.stan',
227+
data = stan_d, cores = 2)
228+
print(m_fit, pars = c('beta', 'loglik_test'))
229+
traceplot(m_fit, inc_warmup = TRUE, c('beta', 'loglik_test'))
230+
```
231+
232+
Make a histogram of the holdout log likelihood and report the posterior mean along with a 95% credible interval.
233+
234+
```{r}
235+
post <- rstan::extract(m_fit)
236+
par(mfrow=c(1, 1))
237+
hist(post$loglik_test, breaks=40)
238+
c(mean = mean(post$loglik_test), quantile(post$loglik_test, c(0.025, 0.975)))
239+
```
240+
241+
242+
## Showing predictions
243+
244+
The whole point of building this model is to diagnose whether a tumor is malignant based on some features.
245+
Plot the posterior probability of tumor malignance for each holdout tumor, and show the true tumor status in the same graph.
246+
Multiple graph types are possible here, but we do not recommend simply copying and pasting code from another example (so far about a quarter of plots made in this way have made sense).
247+
Instead, think hard about what sort of data display would be effective, and make that plot!
248+
249+
```{r}
250+
library(reshape2)
251+
p_df <- melt(post$logit_p_test, varnames = c('iter', 'obs'))
252+
253+
subset(d, group == 'test') %>%
254+
mutate(obs = 1:n()) %>%
255+
full_join(p_df) %>%
256+
ggplot(aes(x = value,
257+
group = obs,
258+
fill = factor(malignant))) +
259+
geom_density(alpha = .1) +
260+
facet_wrap(~ cohort, nrow = 2) +
261+
theme_bw() +
262+
xlab('Predicted logit probability of tumor malignance') +
263+
ylab('Posterior density') +
264+
theme(legend.position = "top")
265+
```
266+
1.94 MB
Binary file not shown.

0 commit comments

Comments
 (0)