Fitting odin models with monty

A pragmatic introduction

On your laptop:

library(odin2)
library(dust2)
library(monty)

In Posit Cloud:

posit.cloud/content/9998065

Previously, on “Introduction to odin”

  • We created some simple compartmental models
  • We ran these and observed trajectories over time
  • We saw that stochastic models produce a family of trajectories

The data

We have some data on the daily incidence of cases

data <- read.csv("data/incidence.csv")
head(data)
#>   time cases
#> 1    1    12
#> 2    2    23
#> 3    3    25
#> 4    4    36
#> 5    5    30
#> 6    6    57

The data

plot(data, pch = 19, col = "red")

Our model

Let’s fit a model to these data

sir <- odin({
  update(S) <- S - n_SI
  update(I) <- I + n_SI - n_IR
  update(R) <- R + n_IR
  update(incidence) <- incidence + n_SI
  
  p_SI <- 1 - exp(-beta * I / N * dt)
  p_IR <- 1 - exp(-gamma * dt)
  n_SI <- Binomial(S, p_SI)
  n_IR <- Binomial(I, p_IR)
  
  initial(S) <- N - I0
  initial(I) <- I0
  initial(R) <- 0
  initial(incidence, zero_every = 1) <- 0
  
  N <- parameter(1000)
  I0 <- parameter(10)
  beta <- parameter(0.2)
  gamma <- parameter(0.1)
})

We will link cases in the data to incidence in the model, and we will treat beta and gamma as unknown parameters to be estimated

Adding likelihood to the model

sir <- odin({
  update(S) <- S - n_SI
  update(I) <- I + n_SI - n_IR
  update(R) <- R + n_IR
  update(incidence) <- incidence + n_SI
  
  initial(S) <- N - I0
  initial(I) <- I0
  initial(R) <- 0
  initial(incidence, zero_every = 1) <- 0
  
  p_SI <- 1 - exp(-beta * I / N * dt)
  p_IR <- 1 - exp(-gamma * dt)
  n_SI <- Binomial(S, p_SI)
  n_IR <- Binomial(I, p_IR)
  
  N <- parameter(1000)
  I0 <- parameter(10)
  beta <- parameter(0.2)
  gamma <- parameter(0.1)
  
  cases <- data()
  cases ~ Poisson(incidence)
})
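
The comparison expression cases ~ Poisson(incidence) means that each observed daily case count contributes a Poisson log-density, given the modelled incidence at that time. A base-R sketch of a single data point’s contribution (the numbers here are illustrative):

# e.g. 23 observed cases against a modelled incidence of 20
dpois(23, lambda = 20, log = TRUE)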

Calculating likelihood: particle filtering

Calculating likelihood

filter <- dust_filter_create(sir, data = data, time_start = 0,
                             n_particles = 200, dt = 0.25)
dust_likelihood_run(filter, list(beta = 0.4, gamma = 0.2))
#> [1] -92.89566

The system runs stochastically, and the likelihood is different each time:

dust_likelihood_run(filter, list(beta = 0.4, gamma = 0.2))
#> [1] -93.37538
dust_likelihood_run(filter, list(beta = 0.4, gamma = 0.2))
#> [1] -88.08544
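
To get a sense of the Monte Carlo noise in this estimator (a sketch), run the filter repeatedly at fixed parameters and look at the spread:

ll <- replicate(30, dust_likelihood_run(filter, list(beta = 0.4, gamma = 0.2)))
sd(ll)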

Filtered trajectories

dust_likelihood_run(filter, list(beta = 0.4, gamma = 0.2),
                    save_trajectories = TRUE)
#> [1] -87.92554
y <- dust_likelihood_last_trajectories(filter)
y <- dust_unpack_state(filter, y)
matplot(data$time, t(y$incidence), type = "l", col = "#00000044", lty = 1,
        xlab = "Time", ylab = "Incidence")
points(data, pch = 19, col = "red")

Particle MCMC

So we have a marginal likelihood estimator from our particle filter

How do we sample from the posterior of beta and gamma?

We need:

  • to tidy up our parameters
  • to create a prior
  • to create a posterior
  • to create a sampler

“Parameters”

  • Our filter takes a named list, pars, containing beta and gamma
    • it could take all sorts of other things, not all of which are to be estimated
    • some of the inputs might be vectors or matrices
  • Our MCMC works on an unstructured vector \(\theta\)
    • we propose a new \(\theta^*\) via some kernel, say a multivariate normal, which requires a variance-covariance matrix corresponding to \(\theta\)
    • we need a prior over \(\theta\), but not necessarily over every element of pars
  • Smoothing over this mismatch is a massive nuisance; we need
    • some way of mapping from \(\theta\) to pars (and back again)

Parameter packers

Our solution, “packers”

packer <- monty_packer(c("beta", "gamma"))
packer
#> 
#> ── <monty_packer> ──────────────────────────────────────────────────────────────
#> ℹ Packing 2 parameters: 'beta' and 'gamma'
#> ℹ Use '$pack()' to convert from a list to a vector
#> ℹ Use '$unpack()' to convert from a vector to a list
#> ℹ See `?monty_packer()` for more information

We can transform from \(\theta\) to a named list:

packer$unpack(c(0.2, 0.1))
#> $beta
#> [1] 0.2
#> 
#> $gamma
#> [1] 0.1

and back the other way:

packer$pack(c(beta = 0.2, gamma = 0.1))
#> [1] 0.2 0.1

Parameter packers

Bind additional data

packer <- monty_packer(c("beta", "gamma"), fixed = list(I0 = 5))
packer$unpack(c(0.2, 0.1))
#> $beta
#> [1] 0.2
#> 
#> $gamma
#> [1] 0.1
#> 
#> $I0
#> [1] 5

Parameter packers

Cope with vector-valued parameters in \(\theta\)

packer <- monty_packer(array = c(beta = 3, gamma = 3))
packer
#> 
#> ── <monty_packer> ──────────────────────────────────────────────────────────────
#> ℹ Packing 6 parameters: 'beta[1]', 'beta[2]', 'beta[3]', 'gamma[1]', 'gamma[2]', and 'gamma[3]'
#> ℹ Use '$pack()' to convert from a list to a vector
#> ℹ Use '$unpack()' to convert from a vector to a list
#> ℹ See `?monty_packer()` for more information
packer$unpack(c(0.2, 0.21, 0.22, 0.1, 0.11, 0.12))
#> $beta
#> [1] 0.20 0.21 0.22
#> 
#> $gamma
#> [1] 0.10 0.11 0.12

Priors

Another DSL, similar to odin’s:

prior <- monty_dsl({
  beta ~ Exponential(mean = 0.5)
  gamma ~ Exponential(mean = 0.3)
})

This is a “monty model”

prior
#> 
#> ── <monty_model> ───────────────────────────────────────────────────────────────
#> ℹ Model has 2 parameters: 'beta' and 'gamma'
#> ℹ This model:
#> • can compute gradients
#> • can be directly sampled from
#> • accepts multiple parameters
#> ℹ See `?monty_model()` for more information
monty_model_density(prior, c(0.2, 0.1))
#> [1] 1.163787

We can compute this density by hand (monty densities are on the log scale):

dexp(0.2, 1 / 0.5, log = TRUE) + dexp(0.1, 1 / 0.3, log = TRUE)
#> [1] 1.163787

From a dust filter to a monty model

filter
#> 
#> ── <dust_likelihood (odin_system)> ─────────────────────────────────────────────
#> ℹ 4 state x 200 particles
#> ℹ The likelihood is stochastic
#> ℹ This system runs in discrete time with dt = 0.25
#> ℹ Use coef() (`?stats::coef()`) to get more information on parameters

Combine a filter and a packer

packer <- monty_packer(c("beta", "gamma"))
likelihood <- dust_likelihood_monty(filter, packer)
likelihood
#> 
#> ── <monty_model> ───────────────────────────────────────────────────────────────
#> ℹ Model has 2 parameters: 'beta' and 'gamma'
#> ℹ This model:
#> • is stochastic
#> ℹ See `?monty_model()` for more information

Posterior from likelihood and prior

Combine a likelihood and a prior to make a posterior

\[ \underbrace{\Pr(\theta | \mathrm{data})}_{\mathrm{posterior}} \propto \underbrace{\Pr(\mathrm{data} | \theta)}_{\mathrm{likelihood}} \times \underbrace{\Pr(\theta)}_{\mathrm{prior}} \]

posterior <- likelihood + prior
posterior
#> 
#> ── <monty_model> ───────────────────────────────────────────────────────────────
#> ℹ Model has 2 parameters: 'beta' and 'gamma'
#> ℹ This model:
#> • can be directly sampled from
#> • is stochastic
#> ℹ See `?monty_model()` for more information

(remember that addition is multiplication on a log scale)
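
As a sketch of what this means in practice: the posterior log-density at a point is the prior log-density plus the (stochastic) log-likelihood estimate:

monty_model_density(prior, c(0.4, 0.2)) +
  dust_likelihood_run(filter, list(beta = 0.4, gamma = 0.2))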

Create a sampler

A diagonal variance-covariance matrix (uncorrelated parameters)

vcv <- diag(2) * 0.2
vcv
#>      [,1] [,2]
#> [1,]  0.2  0.0
#> [2,]  0.0  0.2

Use this to create a “random walk” sampler:

sampler <- monty_sampler_random_walk(vcv)
sampler
#> 
#> ── <monty_sampler: Random walk (monty_random_walk)> ────────────────────────────
#> ℹ Use `?monty_sample()` to use this sampler
#> ℹ See `?monty_random_walk()` for more information
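
Under the hood, a proposal is a multivariate normal step centred on the current point. A base-R sketch of a single proposal (monty uses its own random number generator internally):

theta <- c(beta = 0.4, gamma = 0.2)
theta_star <- theta + drop(rnorm(2) %*% chol(vcv))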

Let’s sample!

samples <- monty_sample(posterior, sampler, 1000, n_chains = 3)
samples
#> 
#> ── <monty_samples: 2 parameters x 1000 samples x 3 chains> ─────────────────────
#> ℹ Parameters: 'beta' and 'gamma'
#> ℹ Conversion to other types is possible:
#> → ! posterior::as_draws_array() [package installed, but not loaded]
#> → ! posterior::as_draws_df() [package installed, but not loaded]
#> → ! coda::as.mcmc.list() [package installed, but not loaded]
#> ℹ See `?monty_sample()` and `vignette("samples")` for more information

The result: diagnostics

You can use diagnostics from the posterior package

## Note: as_draws_df converts samples$pars, and drops anything else in samples
samples_df <- posterior::as_draws_df(samples)
posterior::summarise_draws(samples_df)
#> # A tibble: 2 × 10
#>   variable  mean median     sd    mad    q5   q95  rhat ess_bulk ess_tail
#>   <chr>    <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
#> 1 beta     0.873  0.855 0.132  0.112  0.701 1.10   1.07     67.4     64.9
#> 2 gamma    0.553  0.547 0.0967 0.0933 0.442 0.720  1.11     67.9     81.9

The result: parameters

You can use the posterior package in conjunction with bayesplot (and then also ggplot2)

bayesplot::mcmc_scatter(samples_df)

The result: traceplots

bayesplot::mcmc_trace(samples_df)

The result: density over time

matplot(drop(samples$density), type = "l", lty = 1)

The result: density over time

matplot(drop(samples$density[-(1:100), ]), type = "l", lty = 1)

Better mixing

vcv <- matrix(c(0.01, 0.005, 0.005, 0.005), 2, 2)
sampler <- monty_sampler_random_walk(vcv)
samples <- monty_sample(posterior, sampler, 2000, initial = samples,
                        n_chains = 4)
matplot(samples$density, type = "l", lty = 1)
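
The vcv above was tuned by hand. A common heuristic (a sketch, not monty’s adaptive machinery) is to estimate the proposal from a pilot run, applying the classic 2.38^2 / d scaling for d parameters:

# pool the pilot chains, then scale their empirical covariance
pars_pilot <- t(array(samples$pars, c(2, prod(dim(samples$pars)[-1]))))
vcv_pilot <- cov(pars_pilot) * 2.38^2 / 2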

Better mixing: the results

samples_df <- posterior::as_draws_df(samples)
posterior::summarise_draws(samples_df)
#> # A tibble: 2 × 10
#>   variable  mean median     sd    mad    q5   q95  rhat ess_bulk ess_tail
#>   <chr>    <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
#> 1 beta     0.843  0.830 0.121  0.113  0.664 1.07   1.01     458.     556.
#> 2 gamma    0.529  0.516 0.0953 0.0852 0.393 0.711  1.01     431.     579.

Better mixing: the results

bayesplot::mcmc_scatter(samples_df)

Better mixing: the results

bayesplot::mcmc_trace(samples_df)

Parallelism

Two places to parallelise

  • among particles in your filter
  • between chains in the sample

e.g., 4 threads per filter x 2 workers = 8 total cores in use

Configure the filter

Use the n_threads argument, here for 4 threads

filter <- dust_filter_create(sir, data = data, time_start = 0,
                             n_particles = 200, dt = 0.25, n_threads = 4)

This requires that you have OpenMP; getting this working is very annoying on macOS

Configure a parallel runner

Use monty_runner_callr, here for 2 workers

runner <- monty_runner_callr(2)

Pass runner through to monty_sample:

samples <- monty_sample(posterior, sampler, 1000,
                        runner = runner, n_chains = 4)

Run chains on different cluster nodes

monty_sample_manual_prepare(posterior, sampler, 10000, "mypath",
                            n_chains = 10)

Then run these chains in parallel on your cluster:

monty_sample_manual_run(1, "mypath")
monty_sample_manual_run(2, "mypath")
monty_sample_manual_run(3, "mypath")
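
Each call is independent, so on a cluster each chain would typically be its own job (or one element of an array job); locally you could equally run them in a loop:

for (i in seq_len(10)) {
  monty_sample_manual_run(i, "mypath")
}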

And retrieve the result

samples <- monty_sample_manual_collect("mypath")

Saving history

  • Save your trajectories at every collected sample
  • Save the final state at every sample (for onward simulation)
  • Save snapshots of the state at intermediate time points, at every sample (for counterfactuals)

Trajectories

likelihood <- dust_likelihood_monty(filter, packer,
                                    save_trajectories = TRUE)
posterior <- likelihood + prior
samples <- monty_sample(posterior, sampler, 1000, n_chains = 4)

Trajectories

trajectories <- dust_unpack_state(filter,
                                  samples$observations$trajectories)
matplot(data$time, trajectories$incidence[, , 1], type = "l", lty = 1,
        col = "#00000044", xlab = "Time", ylab = "Infection incidence")
points(data, pch = 19, col = "red")

Trajectories

Trajectories are 4-dimensional

# (4 states x 20 time points x 1000 samples x 4 chains)
dim(samples$observations$trajectories)
#> [1]    4   20 1000    4
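
A rough sense of the memory cost (doubles are 8 bytes each):

# 4 x 20 x 1000 x 4 doubles is about 2.6 MB here, but this grows rapidly
# with more states, time points or samples
prod(dim(samples$observations$trajectories)) * 8 / 1e6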

These can get very large quickly; there are two main ways to reduce their size:

  • Saving only a subset of the states
  • Thinning

Saving a subset of trajectories

You can save a subset by specifying a character vector of state names

likelihood <- dust_likelihood_monty(filter, packer, 
                                    save_trajectories = c("I", "incidence"))
posterior <- likelihood + prior
samples2 <- monty_sample(posterior, sampler, 100, initial = samples)
dim(samples2$observations$trajectories)
#> [1]   2  20 100   1

Thinning

While running

samples <- monty_sample(...,
                        burnin = 100,
                        thinning_factor = 2)

After running

samples <- monty_samples_thin(samples,
                              burnin = 500,
                              thinning_factor = 2)

  • Thinning while running is faster and uses less memory
  • Thinning after running is more flexible (e.g. you can inspect the full chains of parameters before deciding how much to thin)

Deterministic models from stochastic

  • Stochastic models written in odin can be run deterministically
  • They run by taking the expectation of each random draw, as sketched below
  • This gives two models for the price of one
  • However, this might not be suitable for all models
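
For example, the binomial draws in our SIR model are replaced by their means (the expectation of Binomial(n, p) is n * p):

# Stochastic:                     Deterministic expectation:
# n_SI <- Binomial(S, p_SI)  -->  n_SI <- S * p_SI
# n_IR <- Binomial(I, p_IR)  -->  n_IR <- I * p_IR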

Fitting in deterministic mode

The key difference is to use dust_unfilter_create

unfilter <- dust_unfilter_create(sir, data = data, time_start = 0, dt = 0.25)

Note that, as this is deterministic, it produces the same likelihood every time

dust_likelihood_run(unfilter, list(beta = 0.4, gamma = 0.2))
#> [1] -371.9752
dust_likelihood_run(unfilter, list(beta = 0.4, gamma = 0.2))
#> [1] -371.9752

Fitting in deterministic mode

likelihood <- dust_likelihood_monty(unfilter, packer, save_trajectories = TRUE)
posterior <- likelihood + prior
samples_det <- monty_sample(posterior, sampler, 1000, n_chains = 4)
samples_det <- monty_samples_thin(samples_det,
                                  burnin = 500,
                                  thinning_factor = 2)

Stochastic v deterministic comparison

y <- dust2::dust_unpack_state(filter, samples$observations$trajectories)
incidence <- array(y$incidence, c(20, 1000))
matplot(data$time, incidence, type = "l", lty = 1, col = "#00000044",
        xlab = "Time", ylab = "Infection incidence", ylim = c(0, 75))
points(data, pch = 19, col = "red")

Stochastic v deterministic comparison

y <- dust2::dust_unpack_state(filter, samples_det$observations$trajectories)
incidence <- array(y$incidence, c(20, 1000))
matplot(data$time, incidence, type = "l", lty = 1, col = "#00000044",
        xlab = "Time", ylab = "Infection incidence", ylim = c(0, 75))
points(data, pch = 19, col = "red")

Stochastic v deterministic comparison

pars_stochastic <- array(samples$pars, c(2, 500))
pars_deterministic <- array(samples_det$pars, c(2, 500))
plot(pars_stochastic[1, ], pars_stochastic[2, ], ylab = "gamma", xlab = "beta",
     pch = 19, col = "blue")
points(pars_deterministic[1, ], pars_deterministic[2, ], pch = 19, col = "red")
legend("bottomright", c("stochastic fit", "deterministic fit"), pch = c(19, 19), 
       col = c("blue", "red"))

Projections and counterfactuals

Let’s use some new data

data <- read.csv("data/schools.csv")
plot(data, pch = 19, col = "red")

Projections and counterfactuals

We’ll fit these data using an SIS model that incorporates schools opening and closing

sis <- odin({
  update(S) <- S - n_SI + n_IS
  update(I) <- I + n_SI - n_IS
  update(incidence) <- incidence + n_SI
  
  initial(S) <- N - I0
  initial(I) <- I0
  initial(incidence, zero_every = 1) <- 0
  
  schools <- interpolate(schools_time, schools_open, "constant")
  schools_time <- parameter()
  schools_open <- parameter()
  dim(schools_time, schools_open) <- parameter(rank = 1)
  
  beta <- ((1 - schools) * (1 - schools_modifier) + schools) * beta0
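  # i.e. transmission is beta0 while schools are open (schools == 1) and is
  # multiplied by (1 - schools_modifier) while they are closed (schools == 0)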
  
  p_SI <- 1 - exp(-beta * I / N * dt)
  p_IS <- 1 - exp(-gamma * dt)
  n_SI <- Binomial(S, p_SI)
  n_IS <- Binomial(I, p_IS)
  
  N <- parameter(1000)
  I0 <- parameter(10)
  beta0 <- parameter(0.2)
  gamma <- parameter(0.1)
  schools_modifier <- parameter(0.6)
  
  cases <- data()
  cases ~ Poisson(incidence)
})

Projections and counterfactuals

schools_time <- c(0, 50, 60, 120, 130, 170, 180)
schools_open <- c(1,  0,  1,   0,   1,   0,   1)
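
A base-R sketch of what the "constant" interpolation mode does with these vectors (each value is held until the next time point):

schools <- approx(schools_time, schools_open, 0:200,
                  method = "constant", rule = 2)$y
plot(0:200, schools, type = "s", xlab = "Time", ylab = "Schools open")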

We will

  • project forward from the end of the fits (day 150) to day 200
  • run a counterfactual in which schools did not reopen on day 60, instead remaining closed until day 130

Fitting to the SIS model

packer <- monty_packer(c("beta0", "gamma", "schools_modifier"),
                       fixed = list(schools_time = schools_time,
                                    schools_open = schools_open))

filter <- dust_filter_create(sis, time_start = 0, dt = 1,
                             data = data, n_particles = 200)

prior <- monty_dsl({
  beta0 ~ Exponential(mean = 0.3)
  gamma ~ Exponential(mean = 0.1)
  schools_modifier ~ Uniform(0, 1)
})

vcv <- diag(c(2e-4, 2e-4, 4e-4))
sampler <- monty_sampler_random_walk(vcv)

Fitting to the SIS model

We want to save the end state, and a snapshot at day 60 (where the counterfactual will diverge)

likelihood <- dust_likelihood_monty(filter, packer, save_trajectories = TRUE,
                                    save_state = TRUE, save_snapshots = 60)

posterior <- likelihood + prior

samples <- monty_sample(posterior, sampler, 500, initial = c(0.3, 0.1, 0.5),
                        n_chains = 4)
samples <- monty_samples_thin(samples, burnin = 100, thinning_factor = 8)

Fit to data

y <- dust_unpack_state(filter, samples$observations$trajectories)
incidence <- array(y$incidence, c(150, 200))
matplot(data$time, incidence, type = "l", col = "#00000044", lty = 1,
        xlab = "Time", ylab = "Incidence")
points(data, pch = 19, col = "red")

Running projection using the end state

state <- array(samples$observations$state, c(3, 200))
pars <- array(samples$pars, c(3, 200))
pars <- lapply(seq_len(200), function(i) packer$unpack(pars[, i]))

sys <- dust_system_create(sis, pars, n_groups = length(pars), dt = 1)

dust_system_set_state(sys, state)
t <- seq(150, 200)
y <- dust_system_simulate(sys, t)
y <- dust_unpack_state(sys, y)

Running projection using the end state

matplot(data$time, incidence, type = "l", col = "#00000044", lty = 1,
        xlab = "Time", ylab = "Incidence", xlim = c(0, 200))
matlines(t, t(y$incidence), col = "blue")
points(data, pch = 19, col = "red")

Running counterfactual using the snapshot

snapshot <- array(samples$observations$snapshots, c(3, 200))
pars <- array(samples$pars, c(3, 200))
f <- function(i) {
  p <- packer$unpack(pars[, i])
  p$schools_time <- c(0, 50, 130, 170, 180)
  p$schools_open <- c(1, 0, 1, 0, 1)
  p
}
pars <- lapply(seq_len(200), f)
sys <- dust_system_create(sis, pars, n_groups = length(pars), dt = 1)

dust_system_set_state(sys, snapshot)
t <- seq(60, 150)
y <- dust_system_simulate(sys, t)
y <- dust_unpack_state(sys, y)

Running counterfactual using the snapshot

matplot(data$time, incidence, type = "l", col = "#00000044", lty = 1,
        xlab = "Time", ylab = "Incidence")
matlines(t, t(y$incidence), col = "blue")
points(data, pch = 19, col = "red")

Next steps

  • forward time predictions
  • posterior predictive checks
  • rerun filter in MCMC
  • multi-parameter models
  • deterministic (expectation) models as starting points
  • adaptive fitting (deterministic models only)
  • HMC