R 语言 - 以迭代方式将多个列逐一合并到列表中的各个数据框


# load example data
# NOTE(review): `varespec` and `varechem` are not created anywhere in this
# snippet; presumably they are the example datasets shipped with the vegan
# package (library(vegan); data(varespec); data(varechem)) -- confirm.
# generate predictor tables with overlapping rows and different amount of cols
varespec1 <- varespec[1:9, ]
varespec2 <- varespec[8:16, 1:43]
varespec3 <- varespec[14:24, 1:41]
# store predictor tables in list
subset_list <- list(
  varespec1 = varespec1,
  varespec2 = varespec2,
  varespec3 = varespec3
)
# generate a table that holds ALL possible response variables as
# presence/absence: values <= 1.0 become "Absent", values > 1.0 "Present".
# lapply() is used instead of apply(): apply() coerces the data frame to a
# matrix and turns the factors returned by cut() into plain character strings.
# Assigning into `[]` keeps the row names and column names of the input, so
# they do not have to be restored afterwards.
varechem_binary <- as.data.frame(varechem)
varechem_binary[] <- lapply(
  varechem_binary,
  cut,
  breaks = c(-Inf, 1.0, Inf),
  labels = c("Absent", "Present")
)


# merge response table with each predictor table
# Result: merged_list, a list with the same names as subset_list. Each element
# holds the predictor columns followed by all columns of varechem_binary, for
# the rows whose row names occur in both tables.
merged_list <- lapply(subset_list, function(predictors) {
  merged <- merge(predictors, varechem_binary, by = "row.names")
  # merge() returns the matching key as a leading `Row.names` column; move it
  # back into the row names and drop the helper column
  row.names(merged) <- merged$Row.names
  merged[, -1, drop = FALSE]
})



# storing in data frames just for illustration, I would like to do this within the list
# Each subset keeps all predictor columns plus exactly one response column;
# the negative indices drop every other response column.
# subsets for the 3 predictor tables with the first response variable
aa <- merged_list[[1]][, -c(46:58)] # column 1:44 are the predictor variables, then the different response variables start
bb <- merged_list[[2]][, -c(45:57)] # column 1:43 are the predictor variables, then the different response variables start
# the third table has 41 predictors, so its last column is 55 (not 58)
cc <- merged_list[[3]][, -c(43:55)] # column 1:41 are the predictor variables, then the different response variables start
# subsets for the 3 predictor tables with the second response variable
dd <- merged_list[[1]][, -c(45, 47:58)]
ee <- merged_list[[2]][, -c(44, 46:57)]
ff <- merged_list[[3]][, -c(42, 44:55)]
# subsets for the 3 predictor tables with the third response variable
gg <- merged_list[[1]][, -c(45, 46, 48:58)]
# this is just to illustrate how the list could look like, I would like to keep all files in a list all the time
# The remaining table/response combinations would be appended in the same
# way; they are left out here because a literal `...` is not valid in a
# top-level list() call.
list_for_classification_runs <- list(aa, bb, cc, dd, ee, ff, gg)


# Fit one random forest per table in list_for_classification_runs.
# Result: RF_list, holding one ranger() fit per table, in the same order as
# the input list.
RF_list <- lapply(list_for_classification_runs, function(current_table) {
  # response_variable is the one variable added to the predictor variables,
  # i.e. the last column of each table; its name differs from table to table,
  # so the formula is built per table instead of being hard-coded
  response_variable <- names(current_table)[ncol(current_table)]
  ranger(
    reformulate(".", response = as.name(response_variable)),
    data = current_table
  )
})

根据格雷戈尔(Gregor)的评论，我想出了一个类似的方法。我没有将完整的 varechem_binary 与 subset_list 的所有元素合并，而是添加了另一个 for 循环，迭代 varechem_binary 中的所有列。使用 drop = FALSE 可以保留 row.names 和数据框结构，因此合并能够正常进行：

# Merge every predictor table with every single response column.
# Result: merged_column_list, a flat list with one data frame per
# (table, response) pair, named "<table name>_<response name>". Each data
# frame holds the predictor columns followed by that one response column.
merged_column_list <- list()
for (table_name in names(subset_list)) {
  for (column in names(varechem_binary)) {
    current_name <- paste(table_name, column, sep = "_")
    # drop = FALSE keeps the single response column a data frame, so its row
    # names survive and merge() can match on them
    tmp <- merge(
      subset_list[[table_name]],
      varechem_binary[, column, drop = FALSE],
      by = "row.names"
    )
    # move the `Row.names` key column back into the row names
    row.names(tmp) <- tmp$Row.names
    merged_column_list[[current_name]] <- tmp[, -1, drop = FALSE]
  }
}



# Same merge written with nested lapply() calls. The outer list follows
# subset_list; each inner list has one merged data frame per column of
# varechem_binary, named after that column.
# Every response column is passed to merge() as a one-column data frame
# (drop = FALSE) rather than through apply(): apply() would hand merge() a
# bare vector, so the response column would lose its name, and the
# `Row.names` key would remain in the result as an ordinary column.
lapply(subset_list, function(x) {
  lapply(
    setNames(nm = names(varechem_binary)),
    function(var) {
      merged <- merge(
        x,
        varechem_binary[, var, drop = FALSE],
        by = "row.names"
      )
      row.names(merged) <- merged$Row.names
      merged[, -1, drop = FALSE]
    }
  )
})

使用 system.time() 在示例数据上对这两种方法进行基准测试，此方法的速度(用户时间 0.075 秒)约为使用 for 循环的解决方案(用户时间 0.143 秒)的两倍。
