But…
../
path fragments
Suppose we’re working on a data analysis pipeline, starting with “incoming data”:
Our setup:
I’ve copied some data in as data.xlsx
into src/incoming
.
incoming.R
to tidy that up for consumption using your favourite packages.
src/incoming
and just edit things as usual
My attempt at cleaning:
d <- readxl::read_excel("data.xlsx", sheet = 2, skip = 2)
names(d) <- gsub(" ", "_", tolower(names(d)))
d$date <- as.Date(d$date)
write.csv(d, "data.csv", row.names = FALSE)
janitor
)csv
id <- orderly_run("incoming")
## ℹ Starting packet 'incoming' `20241024-123851-9432dd2b` at 2024-10-24 12:38:51.587109
## > d <- readxl::read_excel("data.xlsx", sheet = 2, skip = 2)
## > names(d) <- gsub(" ", "_", tolower(names(d)))
## > d$date <- as.Date(d$date)
## > write.csv(d, "data.csv", row.names = FALSE)
## ✔ Finished running 'incoming.R'
## ℹ Finished 20241024-123851-9432dd2b at 2024-10-24 12:38:51.655659 (0.06854963 secs)
Our generated metadata (this output box scrolls)
orderly_metadata(id)
## $schema_version
## [1] "0.1.1"
##
## $name
## [1] "incoming"
##
## $id
## [1] "20241024-123851-9432dd2b"
##
## $time
## $time$start
## [1] "2024-10-24 12:38:51 UTC"
##
## $time$end
## [1] "2024-10-24 12:38:51 UTC"
##
##
## $parameters
## NULL
##
## $files
## path size
## 1 data.csv 466
## 2 data.xlsx 10851
## 3 incoming.R 174
## hash
## 1 sha256:9117700f079b786812cc20904f4c34f5659f2986a923d2771216551cf378e86f
## 2 sha256:149445ecc545eb987ecc0d5255a48f21165ee1c9b6b54c84b882ce1fbda066b7
## 3 sha256:8ce9dc59614d62b39cd4ff8cfba08e48f993a9be6d3c29357118df386bf7688f
##
## $depends
## [1] packet query files
## <0 rows> (or 0-length row.names)
##
## $git
## $git$sha
## [1] "d184c8b2d3c0f223b6592b841d8b9622e552f0c5"
##
## $git$branch
## [1] "main"
##
## $git$url
## [1] "https://github.com/mrc-ide/orderly-tutorial"
##
##
## $custom
## $custom$orderly
## $custom$orderly$artefacts
## [1] description paths
## <0 rows> (or 0-length row.names)
##
## $custom$orderly$role
## path role
## 1 incoming.R orderly
##
## $custom$orderly$description
## $custom$orderly$description$display
## NULL
##
## $custom$orderly$description$long
## NULL
##
## $custom$orderly$description$custom
## NULL
##
##
## $custom$orderly$shared
## [1] here there
## <0 rows> (or 0-length row.names)
##
## $custom$orderly$session
## $custom$orderly$session$platform
## $custom$orderly$session$platform$version
## [1] "R version 4.4.1 (2024-06-14)"
##
## $custom$orderly$session$platform$os
## [1] "Ubuntu 22.04.5 LTS"
##
## $custom$orderly$session$platform$system
## [1] "x86_64, linux-gnu"
##
##
## $custom$orderly$session$packages
## package version attached
## 1 orderly2 1.99.54 TRUE
## 2 crayon 1.5.3 FALSE
## 3 vctrs 0.6.5 FALSE
## 4 cli 3.6.3 FALSE
## 5 knitr 1.48 FALSE
## 6 rlang 1.1.4 FALSE
## 7 xfun 0.48 FALSE
## 8 jsonlite 1.8.9 FALSE
## 9 glue 1.8.0 FALSE
## 10 openssl 2.2.2 FALSE
## 11 askpass 1.2.1 FALSE
## 12 htmltools 0.5.8.1 FALSE
## 13 sys 3.4.3 FALSE
## 14 readxl 1.4.3 FALSE
## 15 fansi 1.0.6 FALSE
## 16 rmarkdown 2.28 FALSE
## 17 cellranger 1.1.0 FALSE
## 18 evaluate 1.0.1 FALSE
## 19 tibble 3.2.1 FALSE
## 20 fastmap 1.2.0 FALSE
## 21 yaml 2.3.10 FALSE
## 22 lifecycle 1.0.4 FALSE
## 23 compiler 4.4.1 FALSE
## 24 fs 1.6.4 FALSE
## 25 pkgconfig 2.0.3 FALSE
## 26 digest 0.6.37 FALSE
## 27 gert 2.1.4 FALSE
## 28 R6 2.5.1 FALSE
## 29 utf8 1.2.4 FALSE
## 30 pillar 1.9.0 FALSE
## 31 credentials 2.0.2 FALSE
## 32 magrittr 2.0.3 FALSE
## 33 withr 3.0.1 FALSE
## 34 tools 4.4.1 FALSE
source()
knitr
or rmarkdown
.csv
, .xlsx
, etc)
README.md
, licence info, etc)
data.xlsx
is an input
orderly_resource("data.xlsx")
d <- readxl::read_excel("data.xlsx", sheet = 2, skip = 2)
names(d) <- gsub(" ", "_", tolower(names(d)))
d$date <- as.Date(d$date)
write.csv(d, "data.csv", row.names = FALSE)
data.xlsx
is a resource
id <- orderly_run("incoming")
## ℹ Starting packet 'incoming' `20241024-123851-b7997b23` at 2024-10-24 12:38:51.721692
## > orderly_resource("data.xlsx")
## > d <- readxl::read_excel("data.xlsx", sheet = 2, skip = 2)
## > names(d) <- gsub(" ", "_", tolower(names(d)))
## > d$date <- as.Date(d$date)
## > write.csv(d, "data.csv", row.names = FALSE)
## ✔ Finished running 'incoming.R'
## ℹ Finished 20241024-123851-b7997b23 at 2024-10-24 12:38:51.764704 (0.04301143 secs)
orderly_metadata(id)$custom$orderly$role
## path role
## 1 incoming.R orderly
## 2 data.xlsx resource
knitr
or rmarkdown
data.csv
is an artefact
orderly_resource("data.xlsx")
orderly_artefact(files = "data.csv", description = "Cleaned data")
d <- readxl::read_excel("data.xlsx", sheet = 2, skip = 2)
names(d) <- gsub(" ", "_", tolower(names(d)))
d$date <- as.Date(d$date)
write.csv(d, "data.csv", row.names = FALSE)
csv.xlsx
is an artefact
id <- orderly_run("incoming")
## ℹ Starting packet 'incoming' `20241024-123851-d1352967` at 2024-10-24 12:38:51.821893
## > orderly_resource("data.xlsx")
## > orderly_artefact(files = "data.csv", description = "Cleaned data")
## > d <- readxl::read_excel("data.xlsx", sheet = 2, skip = 2)
## > names(d) <- gsub(" ", "_", tolower(names(d)))
## > d$date <- as.Date(d$date)
## > write.csv(d, "data.csv", row.names = FALSE)
## ✔ Finished running 'incoming.R'
## ℹ Finished 20241024-123851-d1352967 at 2024-10-24 12:38:51.852441 (0.03054833 secs)
orderly_metadata(id)$custom$orderly$artefacts
## description paths
## 1 Cleaned data data.csv
orderly_description(
display = "Incoming data from Otherlandia",
long = "Data as given to us from the MoH in Otherlandia.",
custom = list(received = "2024-10-22"))
orderly_resource("data.xlsx")
orderly_artefact(files = "data.csv", description = "Cleaned data")
d <- readxl::read_excel("data.xlsx", sheet = 2, skip = 2)
names(d) <- gsub(" ", "_", tolower(names(d)))
d$date <- as.Date(d$date)
write.csv(d, "data.csv", row.names = FALSE)
Running this:
id <- orderly_run("incoming", echo = FALSE)
## ℹ Starting packet 'incoming' `20241024-123851-e603957f` at 2024-10-24 12:38:51.903057
## ✔ Finished running 'incoming.R'
## ℹ Finished 20241024-123851-e603957f at 2024-10-24 12:38:51.932955 (0.02989793 secs)
orderly_metadata(id)$custom$orderly$description
## $display
## [1] "Incoming data from Otherlandia"
##
## $long
## [1] "Data as given to us from the MoH in Otherlandia."
##
## $custom
## $custom$received
## [1] "2024-10-22"
Our aim: We want to use data.csv
in some analysis
orderly_dependency("incoming", "latest", "data.csv")
orderly_artefact(files = c("coverage-gf.png", "coverage-bf.png"),
description = "Plots of coverage")
d <- read.csv("data.csv")
d$date <- as.Date(d$date)
png("coverage-gf.png")
plot(gf_coverage ~ date, d, type = "l")
dev.off()
png("coverage-bf.png")
plot(bf_coverage ~ date, d, type = "l")
dev.off()
This is the only file within our analysis directory:
id <- orderly_run("analysis")
## ℹ Starting packet 'analysis' `20241024-123851-fffecc5b` at 2024-10-24 12:38:52.004566
## > orderly_dependency("incoming", "latest", "data.csv")
## ℹ Depending on incoming @ `20241024-123851-e603957f` (via latest(name == "incoming"))
## > orderly_artefact(files = c("coverage-gf.png", "coverage-bf.png"),
## + description = "Plots of coverage")
## > d <- read.csv("data.csv")
## > d$date <- as.Date(d$date)
## > png("coverage-gf.png")
## > plot(gf_coverage ~ date, d, type = "l")
## > dev.off()
## png
## 2
## > png("coverage-bf.png")
## > plot(bf_coverage ~ date, d, type = "l")
## > dev.off()
## png
## 2
## ✔ Finished running 'analysis.R'
## ℹ Finished 20241024-123851-fffecc5b at 2024-10-24 12:38:52.096394 (0.09182715 secs)
fs::dir_tree("workdir/part2")
## workdir/part2
## ├── archive
## │ ├── analysis
## │ │ └── 20241024-123851-fffecc5b
## │ │ ├── analysis.R
## │ │ ├── coverage-bf.png
## │ │ ├── coverage-gf.png
## │ │ └── data.csv
## │ └── incoming
## │ ├── 20241024-123851-9432dd2b
## │ │ ├── data.csv
## │ │ ├── data.xlsx
## │ │ └── incoming.R
## │ ├── 20241024-123851-b7997b23
## │ │ ├── data.csv
## │ │ ├── data.xlsx
## │ │ └── incoming.R
## │ ├── 20241024-123851-d1352967
## │ │ ├── data.csv
## │ │ ├── data.xlsx
## │ │ └── incoming.R
## │ └── 20241024-123851-e603957f
## │ ├── data.csv
## │ ├── data.xlsx
## │ └── incoming.R
## ├── draft
## │ ├── analysis
## │ └── incoming
## ├── orderly_config.yml
## └── src
## ├── analysis
## │ └── analysis.R
## └── incoming
## ├── data.xlsx
## └── incoming.R
data.csv
file has been copied from the final copy of incoming
into analysis
orderly_dependency("incoming", "20241024-123851-b7997b23", "data.csv")
orderly_artefact(files = c("coverage-gf.png", "coverage-bf.png"),
description = "Plots of coverage")
d <- read.csv("data.csv")
d$date <- as.Date(d$date)
png("coverage-gf.png")
plot(gf_coverage ~ date, d, type = "l")
dev.off()
png("coverage-bf.png")
plot(bf_coverage ~ date, d, type = "l")
dev.off()
id <- orderly_run("analysis")
## ℹ Starting packet 'analysis' `20241024-123852-287ba7b0` at 2024-10-24 12:38:52.162664
## > orderly_dependency("incoming", "20241024-123851-b7997b23", "data.csv")
## ℹ Depending on incoming @ `20241024-123851-b7997b23` (via single(id == "20241024-123851-b7997b23" && name == "incoming"))
## > orderly_artefact(files = c("coverage-gf.png", "coverage-bf.png"),
## + description = "Plots of coverage")
## > d <- read.csv("data.csv")
## > d$date <- as.Date(d$date)
## > png("coverage-gf.png")
## > plot(gf_coverage ~ date, d, type = "l")
## > dev.off()
## png
## 2
## > png("coverage-bf.png")
## > plot(bf_coverage ~ date, d, type = "l")
## > dev.off()
## png
## 2
## ✔ Finished running 'analysis.R'
## ℹ Finished 20241024-123852-287ba7b0 at 2024-10-24 12:38:52.236722 (0.07405734 secs)
with metadata
A real analysis courtesy of Katy Gaythorpe, using wuenic.xlsx
# declare wuenic.xlsx as a resource (an input file) of this report
orderly_resource("wuenic.xlsx")
# declare the two .rds files written by saveRDS() at the end of the script
# as artefacts (outputs), each with a human-readable description
orderly_artefact(
files = "wuenic.rds",
description = "Tidied WUENIC data")
orderly_artefact(
files = "corr_out.rds",
description = "Output correlations between WUENIC and OFFICIAL coverage")
# -------------------------------------------------------------------------
library(dplyr)
library(ggplot2)
library(readxl)
library(janitor)
# read the spreadsheet; no sheet is given, so read_xlsx() uses its default
df <- read_xlsx("wuenic.xlsx")
# Question: is there better agreement in coverage estimates for BCG or YF
# some cleaning
# standardise the column names; the code below relies on the resulting names
# country_region, antigen, category and a set of columns starting with "x"
df <- df %>% clean_names()
# in every column whose name starts with "x", strip "%" signs and convert to
# numeric (presumably these are the per-year coverage columns, e.g. x2023 --
# TODO confirm against the spreadsheet)
df <- df %>% mutate(across(starts_with("x"),
.fns = function(inp) as.numeric(gsub("%", "", inp))))
# flag rows in which every "x" column is NA
df <- df %>% mutate(all_na_coverage = if_all(starts_with("x"), is.na))
# quick visual for 2023
# dodged bars of x2023 by country for four countries, coloured by category,
# one facet per antigen
# NOTE(review): this plot is neither assigned nor saved in this script
df %>%
filter(!is.na(antigen)) %>%
filter(country_region %in% c("Nigeria", "Senegal", "Kenya", "Ghana")) %>%
ggplot() +
aes(x = country_region, y = x2023, fill = category) +
geom_col(position = "dodge") +
facet_wrap(antigen ~ ., ncol = 1) +
theme_minimal() +
labs(x = "Country", y = "Coverage in 2023", fill = "Coverage type")
# get correlations per vaccine and country
#
# get_ma_corr(df, country_reg_in, antigen_in)
#   df             data frame with columns country_region, antigen, category
#                  and a set of columns whose names start with "x"
#   country_reg_in value(s) of country_region to keep (default "Afghanistan")
#   antigen_in     value(s) of antigen to keep (default "Yellow fever vaccine")
#
# Returns the result of cor() between the "x" columns of the rows with
# category "WUENIC" and those of the rows with category "OFFICIAL", after
# transposing each so that the "x" columns become the observations.
# Presumably there is one WUENIC and one OFFICIAL row per country/antigen,
# giving a 1x1 result -- TODO confirm against the data.
get_ma_corr <- function(df, country_reg_in = "Afghanistan",
antigen_in = "Yellow fever vaccine") {
# keep only the requested country/countries and antigen(s)
df_subset <- df %>% filter(country_region %in% country_reg_in,
antigen %in% antigen_in)
# the "x" columns are located via names(df); filter() only drops rows, so
# the same positions apply to df_subset
# use = "na.or.complete" drops incomplete cases and gives NA (rather than an
# error) when no complete case remains
cor(t(df_subset[df_subset$category %in% "WUENIC", grep("^x", names(df))]),
t(df_subset[df_subset$category %in% "OFFICIAL", grep("^x", names(df))]),
use = "na.or.complete")
}
# one row per distinct country_region value in the cleaned data
df_out <- data.frame(country = unique(df$country_region))
# per-country correlation for yellow fever (the default antigen_in of
# get_ma_corr) and for BCG
# NOTE(review): sapply() only simplifies to a vector when every call returns a
# length-one result; otherwise these columns become lists
df_out$YF_cor <- sapply(df_out$country, function(x) get_ma_corr(df, x))
df_out$BCG_cor <- sapply(df_out$country, function(x) get_ma_corr(df, x, "BCG"))
# get a nice figure
# cor_diff > 0 means the BCG correlation is higher than the YF one; countries
# where either correlation is missing are dropped; bars are ordered by cor_diff
# the fill values are unnamed, so presumably yellow = FALSE and pink = TRUE --
# confirm on the rendered figure
# NOTE(review): the title states a conclusion as fixed text; it is not derived
# from df_out
p <- df_out %>%
mutate(cor_diff = as.numeric(BCG_cor) - as.numeric(YF_cor)) %>%
filter(!is.na(cor_diff)) %>%
mutate(pos_neg = cor_diff > 0) %>%
ggplot() +
aes(x = reorder(country, cor_diff), y = cor_diff, fill = pos_neg) +
geom_col() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "Country", y = "Difference in correlation",
fill = "BCG has better \nagreement between \nWUENIC and OFFICIAL \ncoverage than YF") +
scale_fill_manual(values = c("yellow", "pink")) +
ggtitle("Yellow fever vaccination coverage has better agreement between WUENIC and OFFICIAL estimates in more countries than BCG")
# save everything
# NOTE(review): BCG_YF_correlation_comparison.png is written here but is not
# declared with orderly_artefact() at the top of the script; only the two
# .rds files are
ggsave(plot = p, filename = "BCG_YF_correlation_comparison.png",
width = 14, height = 8)
saveRDS(df, "wuenic.rds")
saveRDS(df_out, "corr_out.rds")
id <- orderly_run("wuenic", echo = FALSE)
## ℹ Starting packet 'wuenic' `20241024-123852-4deb5cff` at 2024-10-24 12:38:52.309107
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
## Warning in cor(t(df_subset[df_subset$category %in% "WUENIC", grep("^x", : the
## standard deviation is zero
## Warning in cor(t(df_subset[df_subset$category %in% "WUENIC", grep("^x", : the
## standard deviation is zero
## Warning in cor(t(df_subset[df_subset$category %in% "WUENIC", grep("^x", : the
## standard deviation is zero
## Warning in cor(t(df_subset[df_subset$category %in% "WUENIC", grep("^x", : the
## standard deviation is zero
## Warning in cor(t(df_subset[df_subset$category %in% "WUENIC", grep("^x", : the
## standard deviation is zero
## Warning in cor(t(df_subset[df_subset$category %in% "WUENIC", grep("^x", : the
## standard deviation is zero
## ✔ Finished running 'wuenic.R'
## ! 6 warnings found:
## • the standard deviation is zero
## • the standard deviation is zero
## • the standard deviation is zero
## • the standard deviation is zero
## • the standard deviation is zero
## • the standard deviation is zero
## ℹ Finished 20241024-123852-4deb5cff at 2024-10-24 12:38:54.535468 (2.226362 secs)
wuenic.rds
?