我浏览了这里发布的大多数相关问题,但似乎没有一个问题与我面临的问题相同。据我所知,这里已经发布的问题与长格式数据中的重复值有关(缺乏唯一标识符(,这会导致具有列表列的宽格式数据,而这通常通过创建一个由唯一数字组成的伪变量列来解决。我尝试了我看到的所有不同的解决方案,但都没有解决我的问题,这就是我决定发布这个问题的原因。
数据
我有一个不同地块中各种植物物种(及其数量和层次(的长表:
> rep_example[1:15,]
Point Species Number Layer
1 P03 Lari_deci 21 C
2 P03 Quer_rope 17 C
3 P03 Pinu_sylv 5 C
4 P03 Sorb_aucu 3 U
5 P03 Betu_pend 1 C
6 P03 Acer_pseu 1 U
7 P06 Quer_rope 28 C
8 P06 Pinu_sylv 28 C
9 P06 Popu_trem 6 C
10 P06 Lari_deci 3 C
11 P07 Fagu_sylv 110 C
12 P07 Pinu_sylv 20 C
13 P07 Pice_abie 5 C
14 P07 Quer_rope 3 C
15 P07 Betu_pend 1 C
> dput(rep_example[1:15,])
structure(list(Point = c("P03", "P03", "P03", "P03", "P03", "P03",
"P06", "P06", "P06", "P06", "P07", "P07", "P07", "P07", "P07"
), Species = c("Lari_deci", "Quer_rope", "Pinu_sylv", "Sorb_aucu",
"Betu_pend", "Acer_pseu", "Quer_rope", "Pinu_sylv", "Popu_trem",
"Lari_deci", "Fagu_sylv", "Pinu_sylv", "Pice_abie", "Quer_rope",
"Betu_pend"), Number = c("21", "17", "5", "3", "1", "1", "28",
"28", "6", "3", "110", "20", "5", "3", "1"), Layer = c("C", "C",
"C", "U", "C", "U", "C", "C", "C", "C", "C", "C", "C", "C", "C"
)), row.names = c(NA, 15L), class = "data.frame")
理想的结果
我希望通过将不同的Species
名称作为列,并且每个Layer
每个Point
:只有一行来创建这个表的广泛形式
> rep_example_ideal
Point Layer Lari_deci Quer_rope Pinu_sylv Sorb_aucu Betu_pend Acer_pseu
1 P03 C 21 17 5 0 1 0
2 P03 U 0 0 0 3 0 1
3 P06 C 3 28 28 0 0 0
4 P06 U 0 0 0 0 0 0
5 P07 C 0 3 20 1 1 0
6 P07 U 0 0 0 0 0 0
> dput(rep_example_ideal)
structure(list(Point = c("P03", "P03", "P06", "P06", "P07", "P07"
), Layer = c("C", "U", "C", "U", "C", "U"), Lari_deci = c("21",
"0", "3", "0", "0", "0"), Quer_rope = c("17", "0", "28", "0",
"3", "0"), Pinu_sylv = c("5", "0", "28", "0", "20", "0"), Sorb_aucu = c("0",
"3", "0", "0", "1", "0"), Betu_pend = c("1", "0", "0", "0", "1",
"0"), Acer_pseu = c("0", "1", "0", "0", "0", "0")), class = "data.frame", row.names = c(NA,
-6L))
问题代码
这是我用来创建宽表的代码:
rep_example %>% group_by(Point, Layer) %>%
mutate(Number = as.numeric(Number)) %>%
distinct() %>%
mutate(rn = 1:n()) %>%
pivot_wider(id_cols = c(Point, Layer, rn), names_from = Species, values_from = Number)
# A tibble: 172 x 17
# Groups: Point, Layer [57]
Point Layer rn Lari_deci Quer_rope Pinu_sylv Sorb_aucu Betu_pend Acer_pseu Popu_trem
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 P03 C 1 21 NA NA NA NA NA NA
2 P03 C 2 NA 17 NA NA NA NA NA
3 P03 C 3 NA NA 5 NA NA NA NA
4 P03 U 1 NA NA NA 3 NA NA NA
5 P03 C 4 NA NA NA NA 1 NA NA
6 P03 U 2 NA NA NA NA NA 1 NA
7 P06 C 1 NA 28 NA NA NA NA NA
8 P06 C 2 NA NA 28 NA NA NA NA
9 P06 C 3 NA NA NA NA NA NA 6
10 P06 C 4 3 NA NA NA NA NA NA
# ... with 162 more rows, and 7 more variables: Fagu_sylv <dbl>, Pice_abie <dbl>,
# Abie_alba <dbl>, Fran_alnu <dbl>, Tili_cord <dbl>, Alnu_glut <dbl>, Quer_rubr <dbl>
我使用
mutate(rn = 1:n())
创建一个伪变量rn
,以确保标识符唯一。周和点值相同的行不会合并,而是显示为单独的行。我尝试使用不同形式的group_by()
,但这些都没有区别,而在pivot_wider()
中明确声明id_cols
会导致下面的问题#2。当我不使用
mutate(rn = 1:n())
时,生成的宽数据由列表列组成,即使列表长度为1(此处发布的所有其他问题都会导致更长的列表列,即重复(,并且Week
和Point
的组合提供了一个唯一的ID。然而,在这种方法中,上述行未合并的问题得到了解决。
rep_example %>% group_by(Point, Layer) %>%
mutate(Number = as.numeric(Number)) %>%
pivot_wider(id_cols = c(Point, Layer), names_from = Species, values_from = Number)
# A tibble: 57 x 16
# Groups: Point, Layer [57]
Point Layer Lari_deci Quer_rope Pinu_sylv Sorb_aucu Betu_pend Acer_pseu Popu_trem Fagu_sylv
<chr> <chr> <list> <list> <list> <list> <list> <list> <list> <list>
1 P03 C <dbl [1]> <dbl [1]> <dbl [1]> <NULL> <dbl [1]> <NULL> <NULL> <NULL>
2 P03 U <NULL> <NULL> <NULL> <dbl [1]> <NULL> <dbl [1]> <NULL> <NULL>
3 P06 C <dbl [1]> <dbl [1]> <dbl [1]> <NULL> <NULL> <NULL> <dbl [1]> <NULL>
4 P07 C <NULL> <dbl [1]> <dbl [1]> <dbl [1]> <dbl [1]> <NULL> <NULL> <dbl [1]>
5 P07 U <NULL> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL>
6 P08 C <NULL> <dbl [1]> <dbl [1]> <NULL> <NULL> <NULL> <dbl [1]> <NULL>
7 P08 U <NULL> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL>
8 P10 U <NULL> <dbl [1]> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL>
9 P10 C <NULL> <dbl [1]> <dbl [1]> <NULL> <dbl [1]> <NULL> <NULL> <NULL>
10 P11 C <NULL> <dbl [1]> <dbl [1]> <NULL> <NULL> <NULL> <NULL> <NULL>
# ... with 47 more rows, and 6 more variables: Pice_abie <list>, Abie_alba <list>,
# Fran_alnu <list>, Tili_cord <list>, Alnu_glut <list>, Quer_rubr <list>
Warning message:
Values are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = length` to identify where the duplicates arise
* Use `values_fn = {summary_fun}` to summarise duplicates
pivot_wider
正在使用我使用不同部分数据尝试的其他代码。这个特殊的问题仍未解决,我非常感谢任何形式的帮助!!!
谢谢!
我们可以用rowid
创建一个序列列
library(dplyr)
library(tidyr)
library(data.table)
df1 %>%
mutate(rn = rowid(Point, Species)) %>%
pivot_wider(names_from = Species, values_from = Number,
values_fill = list(Number = '0'))
如果我们想要所有的组合,请使用complete
df1 %>%
complete(Point, Layer, fill = list(Number = '0')) %>%
fill(Species) %>%
pivot_wider(names_from = Species, values_from = Number,
values_fill = list(Number = '0'))
# A tibble: 6 x 11
# Point Layer Lari_deci Quer_rope Pinu_sylv Betu_pend Sorb_aucu Acer_pseu Popu_trem Fagu_sylv Pice_abie
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 P03 C 21 17 5 1 0 0 0 0 0
#2 P03 U 0 0 0 0 3 1 0 0 0
#3 P06 C 3 28 28 0 0 0 6 0 0
#4 P06 U 0 0 0 0 0 0 0 0 0
#5 P07 C 0 3 20 1 0 0 0 110 5
#6 P07 U 0 0 0 0 0 0 0 0 0