drake学习笔记

Posted by rogerclarkgc on 周四 27 六月 2019

例子

plan <- drake_plan(
  raw_data = readxl::read_excel(file_in("raw_data.xlsx")),
  data = raw_data %>%
    mutate(Species = forcats::fct_inorder(Species)),
  hist = create_plot(data),
  fit = lm(Sepal.Width ~ Petal.Width + Species, data),
  report = rmarkdown::render(
    knitr_in("report.Rmd"),
    output_file = file_out("report.html"),
    quiet = TRUE
  )
)
plan
#> # A tibble: 5 x 2
#>   target   command                                                         
#>   <chr>    <expr>                                                          
#> 1 raw_data readxl::read_excel(file_in("raw_data.xlsx"))                   …
#> 2 data     raw_data %>% mutate(Species = forcats::fct_inorder(Species))   …
#> 3 hist     create_plot(data)                                              …
#> 4 fit      lm(Sepal.Width ~ Petal.Width + Species, data)                  …
#> 5 report   rmarkdown::render(knitr_in("report.Rmd"), output_file = file_ou…

drake_plan其实是经过改造的R脚本,其实例是一个tibble数据框,一般包含target和相对应的command。target一般是分析过程中的变量或者数据以及输出的文件,command一般是建模、绘制等统计分析的指令。可以通过如下语句在plan和一般脚本之间自由转换:

plan_to_code() 
code_to_plan()
plan_to_notebook() # 转换成Rnotebooks

drake包的核心思想是减少数据分析中由于改变参数而导致所有管线组件需要重新计算的问题,利用drake_plan包裹数据分析的脚本,随后执行如下语句来计算整个plan:

config <- drake_config(plan) # 建立plan配置
vis_drake_graph(config) # 生成plan流程图
make(plan) # 执行plan

使用drake_plan做数据分析的好处在于,drake会自动分析plan的依赖关系,每次执行时只会计算整个管线中发生改变的组件,从而减少大量分析时间。此外,vis_drake_graph生成的依赖图使分析过程更加可视化。

target:定制的command

利用target来包裹plan的子代码,能够更好地抽象整个分析过程中的组件,此外target的几个参数能够方便地进行并行计算。

drake_plan(
  file = target(
    ggsave(file_out("plot.png"), plot),
    elapsed = 10
  ),
  create_plot(datasets::iris)
)
#> # A tibble: 2 x 3
#>   target         command                            elapsed
#>   <chr>          <expr>                               <dbl>
#> 1 file           ggsave(file_out("plot.png"), plot)      10
#> 2 drake_target_1 create_plot(datasets::iris)             NA

The following columns have special meanings for make().

elapsed and cpu: number of seconds to wait for the target to build before timing out (elapsed for elapsed time and cpu for CPU time).

priority: for parallel computing, optionally rank the targets according to priority in the scheduler. resources: target-specific lists of resources for a computing cluster. See the advanced options in the parallel computing chapter for details.

retries: number of times to retry building a target in the event of an error.

trigger: rule to decide whether a target needs to run. See the trigger chapter to learn more.

map, cross, combine

数据分析中常常需要重复执行一些填入不同参数的命令,最常见的就是逐步回归中的建模,回归系数多时往往需要很多模型。这时使用以上三个命令可以极大减少plan的代码量,同时也可以加速运算过程。

map:参数矩阵

例子1

drake_plan(
  x = target(
    simulate_data(center, scale),
    transform = map(center = c(2, 1, 0), scale = c(3, 2, 1))
  )
)
#> # A tibble: 3 x 2
#>   target command            
#>   <chr>  <expr>             
#> 1 x_2_3  simulate_data(2, 3)
#> 2 x_1_2  simulate_data(1, 2)
#> 3 x_0_1  simulate_data(0, 1)

simulate_data是元操作,map生成一个3行2列的参数矩阵(3组取值、2个参数),逐行套用在这个元操作上。注意各参数的长度应该相同。

例子2

my_grid <- tibble(
  sim_function = c("rnorm", "rt", "rcauchy"),
  title = c("Normal", "Student t", "Cauchy")
)
my_grid$sim_function <- rlang::syms(my_grid$sim_function)

drake_plan(
  x = target(
    simulate_data(sim_function, title, center, scale),
    transform = map(
      center = c(2, 1, 0),
      scale = c(3, 2, 1),
      .data = !!my_grid,
      .id = sim_function # for pretty target names
    )
  )
)
#> # A tibble: 3 x 2
#>   target    command                               
#>   <chr>     <expr>                                
#> 1 x_rnorm   simulate_data(rnorm, "Normal", 2, 3)  
#> 2 x_rt      simulate_data(rt, "Student t", 1, 2)  
#> 3 x_rcauchy simulate_data(rcauchy, "Cauchy", 0, 1)

可以自己定制参数矩阵,要注意!!的使用。

例子3

drake_plan(
  x = target(
    simulate_data(center),
    transform = map(center = c(1, 2))
  ),
  y = target(
    process_data(x, center),
    transform = map(x)
  ),
  trace = TRUE # Adds extra columns for the grouping variables.
)
#> # A tibble: 4 x 5
#>   target command              center x     y    
#>   <chr>  <expr>               <chr>  <chr> <chr>
#> 1 x_1    simulate_data(1)     1      x_1   <NA> 
#> 2 x_2    simulate_data(2)     2      x_2   <NA> 
#> 3 y_x_1  process_data(x_1, 1) 1      x_1   y_x_1
#> 4 y_x_2  process_data(x_2, 2) 2      x_2   y_x_2

map可以嵌套使用:第一个x使用center=c(1,2)生成两条命令,第二个y实际利用了第一个的结果,即map(x=c(x_1, x_2))。注意,这里center没有加入第二个map,但却有取值(从trace列可以看到每个x目标都带有对应的center值)。这可能是因为这个代码块中的center变量并非普通的参数变量,在第一个map执行完后不会销毁,而是继续作为drake_plan中的局部变量被第二个map调用。

cross:参数组合

利用cross可以形成参数之间的组合,也是产生一个参数矩阵,这时参数的长度可以不同

例子

drake_plan(
  x = target(
    simulate_data(nrow, ncol),
    transform = cross(nrow = c(1, 2, 3), ncol = c(4, 5))
  )
)
#> # A tibble: 6 x 2
#>   target command            
#>   <chr>  <expr>             
#> 1 x_1_4  simulate_data(1, 4)
#> 2 x_2_4  simulate_data(2, 4)
#> 3 x_3_4  simulate_data(3, 4)
#> 4 x_1_5  simulate_data(1, 5)
#> 5 x_2_5  simulate_data(2, 5)
#> 6 x_3_5  simulate_data(3, 5)

这里nrow和ncol进行组合,就有3*2种参数组合,所以生成的tibble就有6行命令。

combine:整合命令

利用combine可以聚合多个命令成为一条命令

例子1

plan <- drake_plan(
  data = target(
    sim_data(mean = x, sd = y),
    transform = map(x = c(1, 2), y = c(3, 4))
  ),
  larger = target(
    bind_rows(data, .id = "id") %>%
      arrange(sd) %>%
      head(n = 400),
    transform = combine(data)
  )
)

plan
#> # A tibble: 3 x 2
#>   target   command                                                         
#>   <chr>    <expr>                                                          
#> 1 data_1_3 sim_data(mean = 1, sd = 3)                                     …
#> 2 data_2_4 sim_data(mean = 2, sd = 4)                                     …
#> 3 larger   bind_rows(data_1_3, data_2_4, .id = "id") %>% arrange(sd) %>%  …

drake_plan_source(plan)
#> drake_plan(
#>   data_1_3 = sim_data(mean = 1, sd = 3),
#>   data_2_4 = sim_data(mean = 2, sd = 4),
#>   larger = bind_rows(data_1_3, data_2_4, .id = "id") %>%
#>     arrange(sd) %>%
#>     head(n = 400)
#> )

这段代码中,首先第一个target(data)生成了两个命令data_1_3和data_2_4,随后在larger中,利用combine将它们作为bind_rows的参数整合成一条命令。

例子2

plan <- drake_plan(
  data = target(
    sim_data(mean = x, sd = y, skew = z),
    transform = cross(x = c(1, 2), y = c(3, 4), z = c(5, 6))
  ),
  combined = target(
    bind_rows(data, .id = "id") %>%
      arrange(sd) %>%
      head(n = 400),
    transform = combine(data, .by = c(x, y))
  )
)

drake_plan_source(plan)
#> drake_plan(
#>   data_1_3_5 = sim_data(mean = 1, sd = 3, skew = 5),
#>   data_2_3_5 = sim_data(mean = 2, sd = 3, skew = 5),
#>   data_1_4_5 = sim_data(mean = 1, sd = 4, skew = 5),
#>   data_2_4_5 = sim_data(mean = 2, sd = 4, skew = 5),
#>   data_1_3_6 = sim_data(mean = 1, sd = 3, skew = 6),
#>   data_2_3_6 = sim_data(mean = 2, sd = 3, skew = 6),
#>   data_1_4_6 = sim_data(mean = 1, sd = 4, skew = 6),
#>   data_2_4_6 = sim_data(mean = 2, sd = 4, skew = 6),
#>   combined_1_3 = bind_rows(data_1_3_5, data_1_3_6, .id = "id") %>%
#>     arrange(sd) %>%
#>     head(n = 400),
#>   combined_2_3 = bind_rows(data_2_3_5, data_2_3_6, .id = "id") %>%
#>     arrange(sd) %>%
#>     head(n = 400),
#>   combined_1_4 = bind_rows(data_1_4_5, data_1_4_6, .id = "id") %>%
#>     arrange(sd) %>%
#>     head(n = 400),
#>   combined_2_4 = bind_rows(data_2_4_5, data_2_4_6, .id = "id") %>%
#>     arrange(sd) %>%
#>     head(n = 400)
#> )

当需要按指定的几个参数分组整合时,用combine的.by参数。上面代码中,第一个data生成了八条指令(2*2*2种组合),第二个combined进行组合时,如果不用.by,那么生成的命令是用bind_rows聚合全部八个命令;这时用.by = c(x, y)指定分组,表明以x和y两个参数为分组进行聚合,因此得到四条combined命令。

用drake来Debug

检查组件依赖关系

利用vis_drake_graph能检查组件间依赖关系

例子

config <- drake_config(my_plan)
# Hover, click, drag, zoom, and pan. See args 'from' and 'to'.
vis_drake_graph(config, width = "100%", height = "500px")

检查指定组件

deps_code()按顺序展现组件依赖

print(simulate)
#> function (n) 
#> {
#>     data <- random_rows(data = datasets::mtcars, n = n)
#>     data.frame(x = data$wt, y = data$mpg)
#> }

deps_code(simulate)
#> # A tibble: 5 x 2
#>   name             type      
#>   <chr>            <chr>     
#> 1 data.frame       globals   
#> 2 mpg              globals   
#> 3 wt               globals   
#> 4 random_rows      globals   
#> 5 datasets::mtcars namespaced

deps_target() 检查指定组件内部依赖

deps_target("simulate", config)
#> # A tibble: 2 x 2
#>   name             type      
#>   <chr>            <chr>     
#> 1 random_rows      globals   
#> 2 datasets::mtcars namespaced

outdated()和missed()

分别标出因更新而过时(需要重新计算)的组件和丢失的组件。

tags: R, notes