r-将分区文件夹列转换为变量



我想转换这样的数据帧:

# Minimal reproducible example: three partition-folder columns, each cell a
# "name=value" string. Built column by column; rows are the 8 combinations.
mre <- tibble::tibble(
  folder3 = c("V3=4", "V3=5", "V3=4", "V3=5", "V3=4", "V3=5", "V3=4", "V3=5"),
  folder2 = c("V2=1", "V2=1", "V2=2", "V2=2", "V2=1", "V2=1", "V2=2", "V2=2"),
  folder1 = c("V1=0", "V1=0", "V1=0", "V1=0", "V1=1", "V1=1", "V1=1", "V1=1")
)

到此:

folder3 folder2 folder1 V3  V2  V1
V3=4    V2=1    V1=0    4   1   0
V3=5    V2=1    V1=0    5   1   0
V3=4    V2=2    V1=0    4   2   0
V3=5    V2=2    V1=0    5   2   0
V3=4    V2=1    V1=1    4   1   1
V3=5    V2=1    V1=1    5   1   1
V3=4    V2=2    V1=1    4   2   1
V3=5    V2=2    V1=1    5   2   1

基本上是提取每个folder?列中唯一的变量名(此处为"V3"、"V2"、"V1",但可以是任何有效名称,如"a"、"b"、"c")作为新列名,并将数值保留在对应的行位置。

对于单个"folder"列,我已经有一个利用第一行的值来获取列名的办法:

# Pull the digits after "=" out of folder1 into a new column. The new column's
# name is the text before "=" in the first row of folder1 (e.g. "V1").
# remove = FALSE keeps the original folder1 column.
# Backslashes in the regex must be doubled inside an R string literal.
mre %>%
  tidyr::extract(
    folder1,
    into = stringr::word(.$folder1[1], 1, sep = "="),
    regex = "\\S+=(\\d+)",
    remove = FALSE
  )

但我不知道如何扩展到多个"folder"列(列的数量不是固定的)。我试着根据这里的答案使用map,但不知道如何从第一行获得变量名。

有什么建议吗?

与其使用extract,我们可以直接在across中创建新列:用mutate配合across遍历所有列(everything()),使用str_extract提取=后面的数字(\d+),同时在.names中用str_replace修改列名。

library(dplyr)
library(stringr)

# For every column, add a numeric column holding the digits that follow "=".
# .names builds the new column name by replacing "folder" with "V" in the
# source column's name (folder3 -> V3), so the original columns are kept.
# The look-behind (?<==) matches the position right after "="; backslashes
# must be doubled inside an R string literal.
mre %>%
  mutate(across(
    everything(),
    ~ as.numeric(str_extract(., "(?<==)\\d+")),
    .names = "{str_replace(.col, 'folder', 'V')}"
  ))

-输出

# A tibble: 8 × 6
folder3 folder2 folder1    V3    V2    V1
<chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
1 V3=4    V2=1    V1=0        4     1     0
2 V3=5    V2=1    V1=0        5     1     0
3 V3=4    V2=2    V1=0        4     2     0
4 V3=5    V2=2    V1=0        5     2     0
5 V3=4    V2=1    V1=1        4     1     1
6 V3=5    V2=1    V1=1        5     1     1
7 V3=4    V2=2    V1=1        4     2     1
8 V3=5    V2=2    V1=1        5     2     1

如果列名需要从"folder"列的值中提取,那么一个选项是:用pivot_longer将数据重塑为"长"格式,用separate按"="将值拆分为名称和数值两列,用pivot_wider重塑回"宽"格式,最后用bind_cols与原始数据按列合并。

library(tidyr)

# Derive the new column names from the cell values themselves.
mre %>%
  # Row id so pivot_wider can put each value back on its original row.
  mutate(rn = row_number()) %>%
  # Stack all folder columns into one `value` column; the folder names are
  # not needed, hence names_to = NULL.
  pivot_longer(cols = starts_with("folder"), names_to = NULL) %>%
  # Split "V3=4" into name = "V3" and value = 4. "=" needs no escaping, and
  # convert = TRUE turns the value into an integer.
  separate(value, into = c("name", "value"), sep = "=", convert = TRUE) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  select(-rn) %>%
  # Put the original folder columns back in front of the extracted ones.
  bind_cols(mre, .)

-输出

# A tibble: 8 × 6
folder3 folder2 folder1    V3    V2    V1
<chr>   <chr>   <chr>   <int> <int> <int>
1 V3=4    V2=1    V1=0        4     1     0
2 V3=5    V2=1    V1=0        5     1     0
3 V3=4    V2=2    V1=0        4     2     0
4 V3=5    V2=2    V1=0        5     2     0
5 V3=4    V2=1    V1=1        4     1     1
6 V3=5    V2=1    V1=1        5     1     1
7 V3=4    V2=2    V1=1        4     2     1
8 V3=5    V2=2    V1=1        5     2     1

基本R选项

# Base R: parse every "name=value" cell, then cross-tabulate back to wide form.

# 1. For each folder column, split the cells on "=" into V1 (name) and
#    V2 (value), and tag each row with its position.
parsed <- Map(
  function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)),
  mre
)

# 2. Stack the per-column pieces into one long data frame.
long <- do.call(rbind, parsed)

# 3. Cross-tabulate V2 by row id and name. The factor levels follow first
#    appearance, so the result columns keep the order of the folder columns.
wide <- xtabs(V2 ~ id + factor(V1, levels = unique(V1)), long)

# 4. Drop the xtabs class to get a plain matrix and bind it to the original data.
cbind(mre, unclass(wide))

给出

folder3 folder2 folder1 V3 V2 V1
1    V3=4    V2=1    V1=0  4  1  0
2    V3=5    V2=1    V1=0  5  1  0
3    V3=4    V2=2    V1=0  4  2  0
4    V3=5    V2=2    V1=0  5  2  0
5    V3=4    V2=1    V1=1  4  1  1
6    V3=5    V2=1    V1=1  5  1  1
7    V3=4    V2=2    V1=1  4  2  1
8    V3=5    V2=2    V1=1  5  2  1

代码分解

  • Map(..., mre)
> Map(function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)), mre)
$folder3
V1 V2 id
1 V3  4  1
2 V3  5  2
3 V3  4  3
4 V3  5  4
5 V3  4  5
6 V3  5  6
7 V3  4  7
8 V3  5  8
$folder2
V1 V2 id
1 V2  1  1
2 V2  1  2
3 V2  2  3
4 V2  2  4
5 V2  1  5
6 V2  1  6
7 V2  2  7
8 V2  2  8
$folder1
V1 V2 id
1 V1  0  1
2 V1  0  2
3 V1  0  3
4 V1  0  4
5 V1  1  5
6 V1  1  6
7 V1  1  7
8 V1  1  8
  • do.call(rbind, ...)
> do.call(
+   rbind,
+   Map(function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)), mre)
+ )
V1 V2 id
folder3.1 V3  4  1
folder3.2 V3  5  2
folder3.3 V3  4  3
folder3.4 V3  5  4
folder3.5 V3  4  5
folder3.6 V3  5  6
folder3.7 V3  4  7
folder3.8 V3  5  8
folder2.1 V2  1  1
folder2.2 V2  1  2
folder2.3 V2  2  3
folder2.4 V2  2  4
folder2.5 V2  1  5
folder2.6 V2  1  6
folder2.7 V2  2  7
folder2.8 V2  2  8
folder1.1 V1  0  1
folder1.2 V1  0  2
folder1.3 V1  0  3
folder1.4 V1  0  4
folder1.5 V1  1  5
folder1.6 V1  1  6
folder1.7 V1  1  7
folder1.8 V1  1  8
  • xtabs(..., ...)
> xtabs(
+   V2 ~ id + factor(V1, levels = unique(V1)),
+   do.call(
+     rbind,
+     Map(function(x) cbind(read.table(text = x, sep = "="), id = se .... [TRUNCATED]
factor(V1, levels = unique(V1))
id  V3 V2 V1
1  4  1  0
2  5  1  0
3  4  2  0
4  5  2  0
5  4  1  1
6  5  1  1
7  4  2  1
8  5  2  1

这里有另一个可以使用的解决方案:

library(dplyr)
library(stringr)
library(purrr)

# Take each new column's name from the text before "=" in that column's first
# value. Naming by position ("V1", "V2", ...) mislabels the data whenever the
# folder columns are not in that order (here they are folder3, folder2, folder1).
var_names <- map_chr(mre, ~ str_remove(.x[1], "=.*"))

mre %>%
  bind_cols(
    mre %>%
      # Strip the "name=" prefix and keep the value as an integer.
      # Backslashes must be doubled inside an R string literal.
      map(~ as.integer(str_remove(.x, "\\w+="))) %>%
      setNames(var_names) %>%
      as_tibble()
  )
#> # A tibble: 8 x 6
#>   folder3 folder2 folder1    V3    V2    V1
#>   <chr>   <chr>   <chr>   <int> <int> <int>
#> 1 V3=4    V2=1    V1=0        4     1     0
#> 2 V3=5    V2=1    V1=0        5     1     0
#> 3 V3=4    V2=2    V1=0        4     2     0
#> 4 V3=5    V2=2    V1=0        5     2     0
#> 5 V3=4    V2=1    V1=1        4     1     1
#> 6 V3=5    V2=1    V1=1        5     1     1
#> 7 V3=4    V2=2    V1=1        4     2     1
#> 8 V3=5    V2=2    V1=1        5     2     1

最新更新