这是上一个问题的更复杂的版本,我把实际问题抽象得太多,无法应用答案。 R 将整洁的分层数据框转换为分层列表
我使用 for 循环将具有两个分组级别的分层数据框转换为分层列表网格。
有没有更有效的基础R,tidyverse或其他方法来实现这一目标?
在真实数据集中:
- 分组变量和说明是多字字符串。
- 描述前言 - d# - 位于 MWE 中,以便于检查。
- 有 14 个不同类型的关联变量:字符、整数和双精度
规则
要在描述列中的组 1 和组 2 标题 第 1 组标题仅显示一次 组 2 标题是组 1 标题的子标题,仅当有新的组 2 标题时才更改 描述是第 2 组标题的子项
由此
g1 g2 desc var1 var2 var3
A a d1 KS3 0.0500 2 PLs
A a d2 CTI 0.0500 9 7O0
A b d3 b8x 0.580 5 he2
A b d4 XOf 0.180 12 XJE
A b d5 ygn 0.900 11 v48
A c d6 dGY 0.770 6 UcH
A d d7 jpG 0.600 4 P5M
B d d8 Z95 0.600 10 j6O
对此
desc var1 var2 var3
A
a
d1 KS3 0.0500 2 PLs
d2 CTI 0.0500 9 7O0
b
d3 b8x 0.580 5 he2
d4 XOf 0.180 12 XJE
d5 ygn 0.900 11 v48
c
d6 dGY 0.770 6 UcH
d
d7 jpG 0.600 4 P5M
B
d
法典
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
tib
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = ncol(tib)-1)) %>%
rename(id = V1, desc = V2, var1 = V3, var2 = V4, var3 = V5) %>%
mutate(id = NA_character_,
desc = NA_character_,
var1 = NA_real_,
var2 = NA_integer_,
var3 = NA_character_)
# Loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$id[[output_row]] <- "g1"
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$id[[output_row]] <- "g2"
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data grid
output$desc[[output_row]] <- tib$desc[[i]]
output$var1[[output_row]] <- tib$var1[[i]]
output$var2[[output_row]] <- tib$var2[[i]]
output$var3[[output_row]] <- tib$var3[[i]]
output_row <- output_row + 1
}
output
从 tyluRp R 调整答案 将整洁的分层数据帧转换为分层列表 我找到了解决方案。
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
# add unique identifier for description and variable rows
tib <-
tib %>%
rowid_to_column() %>%
mutate(rowid = paste0("z_", rowid))
# separate tibble for variables associated with descriptions
tib_var <-
tib %>%
select(rowid, var1, var2, var3)
# code adapted from tyluRp to reorder the data and add description variables
tib <-
tib %>%
select(g1, g2, desc, rowid) %>%
mutate(g2 = paste(g1, g2, sep = "_")) %>%
transpose() %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^z_"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values))%>%
drop_na(values) %>%
select(values, ind) %>%
mutate(values = str_remove(values, "\D_")) %>%
left_join(tib_var, by = c("ind" = "rowid")) %>%
select(-ind) %>%
replace_na(list(var1 = "", var2 = "", var3 = ""))