奇怪的是,我认为从查看df开始更容易。
#reproducible data
quantiles<-c("50","90")
var=c("w","d")
df=data.frame(a=runif(20,0.01,.5),b=runif(20,0.02,.5),c=runif(20,0.03,.5),e=runif(20,0.04,.5),
q50=runif(20,1,5),q90=runif(20,10,50))
head(df)
我想自动化我创建的一个函数(下面(,使用df中不同的值组合来计算vars
。例如,w
的计算需要使用a
和b
,并且d
需要使用c
和e
,使得w = a *q ^ b
和d = c * q ^ e
。此外,q
是一个分位数,所以我实际上想要w50
、w90
等,它们将对应于df中的q50
、q90
等。
在我看来,棘手的部分是设置使用&b与c&d,而不使用嵌套循环。我有一个使用适当列计算vars
的函数,但我无法有效地将所有部分组合在一起。
#function to calculate the w, d
calc_wd <- function(df,col_name,col1,col2,col3){
#Calculate and create new column col_name for each combo of var and quantile, e.g. "w_50", "d_50", etc.
df[[col_name]] <- df[[col1]] * (df[[col2]] ^ (df[[col3]]))
df
}
我可以将其用于单个情况,但不能通过自动进行系数交换。。。你会看到我在下面指定了"a"one_answers"b"。
wd<-c("w_","d_")
make_wd_list<-apply(expand.grid(wd, quantiles), 1, paste,collapse="")
calc_wdv(df,make_wd_list[1],"a",paste0("q",sapply(strsplit(make_wd_list[1],"_"),tail,1)),"b")
或者,我试图使用嵌套的for循环来实现这一点,但似乎无法正确地附加数据。而且很难看。
var=c("w","d")
dataf<-data.frame()
for(j in unique(var)){
if(j=="w"){
coeff1="a"
coeff2="b"
}else if(j=="d"){
coeff1="c"
coeff1="e"
}
print(coeff1)
print(coeff2)
for(k in unique(quantiles)){
dataf<-calc_wd(df,paste0(j,k),coeff1,paste0("q",k),coeff2)
dataf[k,j]=rbind(df,dataf) #this aint right. tried to do.call outside, etc.
}
}
最后,我希望有w_50
、w_90
等新列,它们使用q50
、q90
和最初定义的相应系数。
我发现一种很容易键入的方法是使用purrr::pmap
。我喜欢这样,因为当您使用with(list(...),)
时,您可以按名称访问data.frame
的列名。此外,您还可以提供其他参数。
library(purrr)
pmap_df(df, quant = "q90", ~with(list(...),{
list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)
}))
## A tibble: 20 x 2
# w d
# <dbl> <dbl>
# 1 0.239 0.295
# 2 0.152 0.392
# 3 0.476 0.828
# 4 0.344 0.236
# 5 0.439 1.00
例如,您可以将其与第二个map
调用结合起来,对分位数进行迭代。
library(dplyr)
map(setNames(quantiles,quantiles),
~ pmap_df(df, quant = paste0("q",.x),
~ with(list(...),{list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)}))
) %>% do.call(cbind,.)
# 50.w 50.d 90.w 90.d
#1 0.63585897 0.11045837 1.7276019 0.1784987
#2 0.17286184 0.22033649 0.2333682 0.5200265
#3 0.32437528 0.72502654 0.5722203 1.4490065
#4 0.68020897 0.33797621 0.8749206 0.6179557
#5 0.73516886 0.38481785 1.2782923 0.4870877
那么,分配一个自定义函数是微不足道的。
calcwd <- function(df,quantiles){
map(setNames(quantiles,quantiles),
~ pmap_df(df, quant = paste0("q",.x),
~ with(list(...),{list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)}))
) %>% do.call(cbind,.)
}
我喜欢@Ian对with
和do.call
等经典作品的完整性和使用的回答。我的解决方案已经很晚了,但由于我一直在努力改进行操作(包括使用rowwise
(,我认为我只使用mutate
、formula.tools
和map_dfc
就可以提供一个不那么优雅但更简单、更快的解决方案
library(dplyr)
library(purrr)
require(formula.tools)
# same type example data plus a much larger version in df2 for
# performance testing
df <- data.frame(a = runif(20, 0.01, .5),
b = runif(20, 0.02, .5),
c = runif(20, 0.03, .5),
e = runif(20, 0.04, .5),
q50 = runif(20,1,5),
q90 = runif(20,10,50)
)
df2 <- data.frame(a = runif(20000, 0.01, .5),
b = runif(20000, 0.02, .5),
c = runif(20000, 0.03, .5),
e = runif(20000, 0.04, .5),
q50 = runif(20000,1,5),
q90 = runif(20000,10,50)
)
# from your original post
quantiles <- c("q50", "q90")
wd <- c("w_", "d_")
make_wd_list <- apply(expand.grid(wd, quantiles),
1,
paste, collapse = "")
make_wd_list
#> [1] "w_q50" "d_q50" "w_q90" "d_q90"
# an empty list to hold our formulas
eqn_list <- vector(mode = "list",
length = length(make_wd_list)
)
# populate the list makes it very extensible to more outcomes
# or to more quantile levels
for (i in seq_along(make_wd_list)) {
if (substr(make_wd_list[[i]], 1, 1) == "w") {
eqn_list[[i]] <- as.formula(paste(make_wd_list[[i]], "~ a * ", substr(make_wd_list[[i]], 3, 5), " ^ b"))
} else if (substr(make_wd_list[[i]], 1, 1) == "d") {
eqn_list[[i]] <- as.formula(paste(make_wd_list[[i]], "~ c * ", substr(make_wd_list[[i]], 3, 5), " ^ e"))
}
}
# formula.tools helps us grab both left and right sides
add_column <- function(df, equation){
df <- transmute_(df, rhs(equation))
colnames(df)[ncol(df)] <- as.character(lhs(equation))
return(df)
}
result <- map_dfc(eqn_list, ~ add_column(df = df, equation = .x))
#> w_q50 d_q50 w_q90 d_q90
#> 1 0.10580863 0.29136904 0.37839737 0.9014040
#> 2 0.34798729 0.35185585 0.64196417 0.4257495
#> 3 0.79714122 0.37242915 1.57594506 0.6198531
#> 4 0.56446922 0.43432160 1.07458217 1.1082825
#> 5 0.26896574 0.07374273 0.28557366 0.1678035
#> 6 0.36840408 0.72458466 0.72741030 1.2480547
#> 7 0.64484009 0.69464045 1.93290705 2.1663690
#> 8 0.43336109 0.21265672 0.46187366 0.4365486
#> 9 0.61340404 0.47528697 0.89286358 0.5383290
#> 10 0.36983212 0.53292900 0.53996112 0.8488402
#> 11 0.11278412 0.12532491 0.12486156 0.2413191
#> 12 0.03599639 0.25578020 0.04084221 0.3284659
#> 13 0.26308183 0.05322304 0.87057854 0.1817630
#> 14 0.06533586 0.22458880 0.09085436 0.3391683
#> 15 0.11625845 0.32995233 0.12749040 0.4730407
#> 16 0.81584442 0.07733376 2.15108243 0.1041342
#> 17 0.38198254 0.60263861 0.68082354 0.8502999
#> 18 0.51756058 0.43398089 1.06683204 1.3397900
#> 19 0.34490492 0.13790601 0.69168711 0.1580659
#> 20 0.39771037 0.33286225 1.32578056 0.4141457
microbenchmark::microbenchmark(result <- map_dfc(eqn_list, ~ add_column(df = df2, equation = .x)), times = 10)
#> Unit: milliseconds
#> expr min
#> result <- map_dfc(eqn_list, ~add_column(df = df2, equation = .x)) 10.58004
#> lq mean median uq max neval
#> 11.34603 12.56774 11.6257 13.24273 16.91417 10
mutate
和formula
解决方案的速度约为50倍,尽管两者都在不到一秒的内穿透20000行
由reprex包(v0.3.0(创建于2020-04-30