例子
plan <- drake_plan(
raw_data = readxl::read_excel(file_in("raw_data.xlsx")),
data = raw_data %>%
mutate(Species = forcats::fct_inorder(Species)),
hist = create_plot(data),
fit = lm(Sepal.Width ~ Petal.Width + Species, data),
report = rmarkdown::render(
knitr_in("report.Rmd"),
output_file = file_out("report.html"),
quiet = TRUE
)
)
plan
#> # A tibble: 5 x 2
#> target command
#> <chr> <expr>
#> 1 raw_data readxl::read_excel(file_in("raw_data.xlsx")) …
#> 2 data raw_data %>% mutate(Species = forcats::fct_inorder(Species)) …
#> 3 hist create_plot(data) …
#> 4 fit lm(Sepal.Width ~ Petal.Width + Species, data) …
#> 5 report rmarkdown::render(knitr_in("report.Rmd"), output_file = file_ou…
drake_plan
其实是经过改造的R脚本,其实例是一个tibble数据框,一般包含target
和相对应的command
,target
一般是分析过程中的变量或者数据以及输出的文件,command
一般是建模、绘制等统计分析的指令。可以通过如下语句自由转换plan和一般脚本
plan_to_code()
code_to_plan()
plan_to_notebook() # 转换成Rnotebooks
drake
包的核心思想是减少数据分析中由于改变参数而导致所有管线组件需要重新计算的问题,利用drake_plan
包裹数据分析的脚本,随后执行如下语句来计算整个plan:
config <- drake_config(plan) # 建立plan配置
vis_drake_graph(config) # 生成plan流程图
make(plan) # 执行plan
使用drake_plan
做数据分析的好处在于,drake会自动分析plan的依赖关系,每次执行时只会计算整个管线中发生改变的组件,从而减少大量分析时间。 此外vis_drake_graph
形成依赖图使分析过程更加可视化。
target
:定制的command
利用target
来包裹plan
的子代码,能够更好地抽象整个分析过程中的组件,此外target
的几个参数能够方便地进行并行计算
drake_plan(
file = target(
ggsave(file_out("plot.png"), plot),
elapsed = 10
),
create_plot(datasets::iris)
)
#> # A tibble: 2 x 3
#> target command elapsed
#> <chr> <expr> <dbl>
The following columns have special meanings for make().
elapsed
and cpu
: number of seconds to wait for the target to build before timing out (elapsed for elapsed time and cpu for CPU time).
priority
: for parallel computing, optionally rank the targets according to priority in the scheduler.
resources
: target-specific lists of resources for a computing cluster. See the advanced options in the parallel computing chapter for details.
retries
: number of times to retry building a target in the event of an error.
trigger
: rule to decide whether a target needs to run. See the trigger chapter to learn more.
map
, cross
和combine
数据分析中常常需要重复执行一些填入不同参数的命令,最常见的就是逐步回归中的建模,回归系数多时往往需要很多模型。这时使用以上三个命令可以极大减少plan
的代码量,同时也可以加速运算过程。
map
:参数矩阵
例子1
drake_plan(
x = target(
simulate_data(center, scale),
transform = map(center = c(2, 1, 0), scale = c(3, 2, 1))
)
)
#> # A tibble: 3 x 2
#> target command
#> <chr> <expr>
#> 1 x_2_3 simulate_data(2, 3)
#> 2 x_1_2 simulate_data(1, 2)
#> 3 x_0_1 simulate_data(0, 1)
simulate_data
是元操作,map生成一个3行2列的参数矩阵(3组取值、2个参数),逐行套用在这个元操作上,注意参数之间长度应该相同
例子2
my_grid <- tibble(
sim_function = c("rnorm", "rt", "rcauchy"),
title = c("Normal", "Student t", "Cauchy")
)
my_grid$sim_function <- rlang::syms(my_grid$sim_function)
drake_plan(
x = target(
simulate_data(sim_function, title, center, scale),
transform = map(
center = c(2, 1, 0),
scale = c(3, 2, 1),
.data = !!my_grid,
.id = sim_function # for pretty target names
)
)
)
#> # A tibble: 3 x 2
#> target command
#> <chr> <expr>
#> 1 x_rnorm simulate_data(rnorm, "Normal", 2, 3)
#> 2 x_rt simulate_data(rt, "Student t", 1, 2)
#> 3 x_rcauchy simulate_data(rcauchy, "Cauchy", 0, 1)
可以自己定制参数矩阵,要注意!!
的使用。
例子3
drake_plan(
x = target(
simulate_data(center),
transform = map(center = c(1, 2))
),
y = target(
process_data(x, center),
transform = map(x)
),
trace = TRUE # Adds extra columns for the grouping variables.
)
#> # A tibble: 4 x 5
#> target command center x y
#> <chr> <expr> <chr> <chr> <chr>
#> 1 x_1 simulate_data(1) 1 x_1 <NA>
#> 2 x_2 simulate_data(2) 2 x_2 <NA>
#> 3 y_x_1 process_data(x_1, 1) 1 x_1 y_x_1
#> 4 y_x_2 process_data(x_2, 2) 2 x_2 y_x_2
map
可以嵌套使用,第一个x
使用center=c(1,2)
生成两条命令,第二个y
实际利用了第一个的结果即map(x=c(x_1, x_2))
,注意,这里center
没有加入map
,但却有取值,这可能是因为这个代码块中的center
变量并非参数变量,在第一个map执行完后不会销毁而继续作为drake_plan
中的局部变量而被第二个map
调用。
cross
:参数组合
利用cross
可以形成参数之间的组合,也是产生一个参数矩阵,这时参数的长度可以不同
例子
drake_plan(
x = target(
simulate_data(nrow, ncol),
transform = cross(nrow = c(1, 2, 3), ncol = c(4, 5))
)
)
#> # A tibble: 6 x 2
#> target command
#> <chr> <expr>
#> 1 x_1_4 simulate_data(1, 4)
#> 2 x_2_4 simulate_data(2, 4)
#> 3 x_3_4 simulate_data(3, 4)
#> 4 x_1_5 simulate_data(1, 5)
#> 5 x_2_5 simulate_data(2, 5)
#> 6 x_3_5 simulate_data(3, 5)
这里nrow
和ncol
进行组合,就有3*2种参数组合,所以生成的tibble就有6行命令
combine
:整合命令
利用combine
可以聚合多个命令成为一条命令
例子1
plan <- drake_plan(
data = target(
sim_data(mean = x, sd = y),
transform = map(x = c(1, 2), y = c(3, 4))
),
larger = target(
bind_rows(data, .id = "id") %>%
arrange(sd) %>%
head(n = 400),
transform = combine(data)
)
)
plan
#> # A tibble: 3 x 2
#> target command
#> <chr> <expr>
#> 1 data_1_3 sim_data(mean = 1, sd = 3) …
#> 2 data_2_4 sim_data(mean = 2, sd = 4) …
#> 3 larger bind_rows(data_1_3, data_2_4, .id = "id") %>% arrange(sd) %>% …
drake_plan_source(plan)
#> drake_plan(
#> data_1_3 = sim_data(mean = 1, sd = 3),
#> data_2_4 = sim_data(mean = 2, sd = 4),
#> larger = bind_rows(data_1_3, data_2_4, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400)
#> )
这段代码中,首先第一个target
为data
,生成了两个命令data_1_3
和data_2_4
,随后在larger
中,利用combine
将bind_rows
中的参数整合成一条命令
例子2
plan <- drake_plan(
data = target(
sim_data(mean = x, sd = y, skew = z),
transform = cross(x = c(1, 2), y = c(3, 4), z = c(5, 6))
),
combined = target(
bind_rows(data, .id = "id") %>%
arrange(sd) %>%
head(n = 400),
transform = combine(data, .by = c(x, y))
)
)
drake_plan_source(plan)
#> drake_plan(
#> data_1_3_5 = sim_data(mean = 1, sd = 3, skew = 5),
#> data_2_3_5 = sim_data(mean = 2, sd = 3, skew = 5),
#> data_1_4_5 = sim_data(mean = 1, sd = 4, skew = 5),
#> data_2_4_5 = sim_data(mean = 2, sd = 4, skew = 5),
#> data_1_3_6 = sim_data(mean = 1, sd = 3, skew = 6),
#> data_2_3_6 = sim_data(mean = 2, sd = 3, skew = 6),
#> data_1_4_6 = sim_data(mean = 1, sd = 4, skew = 6),
#> data_2_4_6 = sim_data(mean = 2, sd = 4, skew = 6),
#> combined_1_3 = bind_rows(data_1_3_5, data_1_3_6, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400),
#> combined_2_3 = bind_rows(data_2_3_5, data_2_3_6, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400),
#> combined_1_4 = bind_rows(data_1_4_5, data_1_4_6, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400),
#> combined_2_4 = bind_rows(data_2_4_5, data_2_4_6, .id = "id") %>%
#> arrange(sd) %>%
#> head(n = 400)
#> )
当需要整合指定的几个参数时,用combine
的.by
参数。上面代码中,第一个data
生成了八条指令,第二个combined
进行组合时,如果不用.by
,那么生成的命令是用bind_rows
聚合八个命令,这时用.by=c(x, y)
指定组合,表明以x
和y
两个参数为分组进行聚合。
用drake
来Debug
检查组件依赖关系
利用vis_drake_graph
能检查组件间依赖关系
例子
config <- drake_config(my_plan)
# Hover, click, drag, zoom, and pan. See args 'from' and 'to'.
vis_drake_graph(config, width = "100%", height = "500px")
检查指定组件
deps_code()
按顺序展现组件依赖
print(simulate)
#> function (n)
#> {
#> data <- random_rows(data = datasets::mtcars, n = n)
#> data.frame(x = data$wt, y = data$mpg)
#> }
deps_code(simulate)
#> # A tibble: 5 x 2
#> name type
#> <chr> <chr>
#> 1 data.frame globals
#> 2 mpg globals
#> 3 wt globals
#> 4 random_rows globals
#> 5 datasets::mtcars namespaced
deps_target() 检查指定组件内部依赖
deps_target("simulate", config)
#> # A tibble: 2 x 2
#> name type
#> <chr> <chr>
#> 1 random_rows globals
#> 2 datasets::mtcars namespaced
outdated()
和missed()
标出因更新而改变的组件或丢失的组件