r-如何将单列扩展为宽格式,其中0和1是有条件定义的值



如何转换我的列"支付";从长到宽格式,同时保持其他列不变?

对于每个级别的";字母";,当单元格在"0"的值之前时;支付";,则当在宽格式中对应的新变量"的这一行时;例如美元";将具有";0〃;;否则";1〃;。

我尝试了output_format_test<-input_format%>%tidyr::pivot_wider(names_from = age, values_from = payment),但它没有产生预期的结果。

##输入格式

input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")

input_format 
# A tibble: 11 x 3
letter   age payment
<chr>  <dbl> <chr>  
1 A          2 NA     
2 A          3 dollar 
3 A          4 NA     
4 D          2 euro   
5 D          3 dollar 
6 D          4 NA     
7 F          2 NA     
8 F          3 euro   
9 F          3 dollar 
10 F          4 NA     
11 F          4 NA 

##输出格式

output_format <- readr::read_table2(
"letter age payment dollar euro
A 2 NA 0 0
A 3 dollar 1 0
A 4 NA 1 0
D 2 euro 0 1
D 3 dollar 1 1
D 4 NA 1 1
F 2 NA 0 0
F 3 euro 0 1
F 3 dollar 1 1
F 4 NA 1 1
F 4 NA 1 1
")

output_format
# A tibble: 11 x 5
letter   age payment dollar  euro
<chr>  <dbl> <chr>    <dbl> <dbl>
1 A          2 NA           0     0
2 A          3 dollar       1     0
3 A          4 NA           1     0
4 D          2 euro         0     1
5 D          3 dollar       1     1
6 D          4 NA           1     1
7 F          2 NA           0     0
8 F          3 euro         0     1
9 F          3 dollar       1     1
10 F          4 NA           1     1
11 F          4 NA           1     1

谢谢。已编辑。

您还可以使用以下tidyverse解决方案:

library(dplyr)
library(tidyr)
library(stringr)
input_format %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = payment, values_from = payment, 
values_fn = length) %>%
select(- c(id, `NA`)) %>%
bind_cols(input_format$payment) %>%
rename_with(~ str_replace(., "\.\.\.\d+", "payment"), contains(fixed("..."))) %>%
relocate(letter, age, payment) %>%
group_by(letter) %>%
replace_na(list(dollar = 0, euro = 0)) %>%
mutate(across(dollar:euro, ~ cummax(.x))) -> input2

# A tibble: 11 x 5
# Groups:   letter [3]
letter   age payment dollar  euro
<chr>  <dbl> <chr>    <dbl> <dbl>
1 A          2 NA           0     0
2 A          3 dollar       1     0
3 A          4 NA           1     0
4 D          2 euro         0     1
5 D          3 dollar       1     1
6 D          4 NA           1     1
7 F          2 NA           0     0
8 F          3 euro         0     1
9 F          3 dollar       1     1
10 F          4 NA           1     1
11 F          4 NA           1     1

在评论中讨论后,您可以使用以下解决方案来获得您想要的输出:

input2 %>%
group_by(letter, age) %>%
add_count() %>%
group_by(letter, age) %>%
filter((n == 2 & if_all(dollar:euro, ~ .x == 1)) | n == 1) %>%
select(-n) %>%
group_by(letter, age) %>%
add_count() %>%
group_split(letter, age) %>%
map_dfr(~ if(.x$n[1] == 2) {
.x %>% slice_tail(n = 1)
} else {
.x
})
# A tibble: 9 x 6
letter   age payment dollar  euro     n
<chr>  <dbl> <chr>    <dbl> <dbl> <int>
1 A          2 NA           0     0     1
2 A          3 dollar       1     0     1
3 A          4 NA           1     0     1
4 D          2 euro         0     1     1
5 D          3 dollar       1     1     1
6 D          4 NA           1     1     1
7 F          2 NA           0     0     1
8 F          3 dollar       1     1     1
9 F          4 NA           1     1     2

Tidyverse方法

input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
library(tidyverse)
input_format %>% mutate(rowid = row_number(),
payment1 = payment,
dummy = 1) %>%
pivot_wider(id_cols = -c(payment1, dummy), names_from = payment1, values_from = dummy, values_fill = 0, values_fn = length) %>%
select(-`NA`) %>%
group_by(letter) %>%
mutate(across(c('dollar', 'euro'), cumsum))
#> # A tibble: 11 x 6
#> # Groups:   letter [3]
#>    letter   age payment rowid dollar  euro
#>    <chr>  <dbl> <chr>   <int>  <int> <int>
#>  1 A          2 <NA>        1      0     0
#>  2 A          3 dollar      2      1     0
#>  3 A          4 <NA>        3      1     0
#>  4 D          2 euro        4      0     1
#>  5 D          3 dollar      5      1     1
#>  6 D          4 <NA>        6      1     1
#>  7 F          2 <NA>        7      0     0
#>  8 F          3 euro        8      0     1
#>  9 F          3 dollar      9      1     1
#> 10 F          4 <NA>       10      1     1
#> 11 F          4 <NA>       11      1     1

创建于2021-06-04由reprex包(v2.0.0(

使用zoo(和data.table,但不是必需的(:

input_format <- fread("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
output_format <- copy(input_format)[payment == "dollar", dollar := 1][, dollar := na.locf0(dollar), by=.(letter)]
output_format[payment == "euro", euro := 1][, euro := na.locf0(euro), by=.(letter)]
output_format[, c("dollar", "euro")][is.na(output_format[, c("dollar", "euro")])] <- 0

其产生:

> output_format
letter age payment dollar euro
1:      A   2    <NA>      0    0
2:      A   3  dollar      1    0
3:      A   4    <NA>      1    0
4:      D   2    euro      0    1
5:      D   3  dollar      1    1
6:      D   4    <NA>      1    1
7:      F   2    <NA>      0    0
8:      F   3    euro      0    1
9:      F   3  dollar      1    1
10:      F   4    <NA>      1    1
11:      F   4    <NA>      1    1

添加另一种方法:

我们可以使用map_dfcset_namespayment的命名向量上循环。

library(dplyr)
library(purrr)
input_format %>% 
group_by(letter) %>% 
mutate(map_dfc(unique(.$payment) %>% set_names(., .),
~ cumsum(!(payment != .x | is.na(payment)))
)) %>% 
select(- `...1`)
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> # A tibble: 11 x 5
#> # Groups:   letter [3]
#>    letter   age payment dollar  euro
#>    <chr>  <dbl> <chr>    <int> <int>
#>  1 A          2 <NA>         0     0
#>  2 A          3 dollar       1     0
#>  3 A          4 <NA>         1     0
#>  4 D          2 euro         0     1
#>  5 D          3 dollar       1     1
#>  6 D          4 <NA>         1     1
#>  7 F          2 <NA>         0     0
#>  8 F          3 euro         0     1
#>  9 F          3 dollar       1     1
#> 10 F          4 <NA>         1     1
#> 11 F          4 <NA>         1     1

我们可以用我在github上的包简化上面的代码,并使用over代替map_dfc,使用dist_values代替unique

library(dplyover) # https://github.com/TimTeaFan/dplyover
input_format %>% 
group_by(letter) %>% 
mutate(over(dist_values(.$payment),
~ cumsum(!(payment != .x | is.na(payment)))
))
#> # A tibble: 11 x 5
#> # Groups:   letter [3]
#>    letter   age payment dollar  euro
#>    <chr>  <dbl> <chr>    <dbl> <dbl>
#>  1 A          2 <NA>         0     0
#>  2 A          3 dollar       1     0
#>  3 A          4 <NA>         1     0
#>  4 D          2 euro         0     1
#>  5 D          3 dollar       1     1
#>  6 D          4 <NA>         1     1
#>  7 F          2 <NA>         0     0
#>  8 F          3 euro         0     1
#>  9 F          3 dollar       1     1
#> 10 F          4 <NA>         1     1
#> 11 F          4 <NA>         1     1

reprex包于2021-06-04创建(v0.3.0(

然而,无论是我的答案还是@AnilGoyal接受的答案,当每个小组多次提到一种支付方式时,都无法处理数据。我不知道想要的答案是否应该解释这个案例。目前,只有@Wietse de Vries和@Anousiravan R的答案适用于这种数据:

input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
A 5 dollar # this line is new
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")

我们可以很容易地调整上面的方法,如下所示来解释这种情况:

input_format %>% 
group_by(letter) %>% 
mutate(over(dist_values(.$payment),
~ ifelse(
cumsum(!(payment != .x | is.na(payment))) >= 1,
1, 0)
))
#> # A tibble: 12 x 5
#> # Groups:   letter [3]
#>    letter   age payment dollar  euro
#>    <chr>  <dbl> <chr>    <dbl> <dbl>
#>  1 A          2 <NA>         0     0
#>  2 A          3 dollar       1     0
#>  3 A          4 <NA>         1     0
#>  4 A          5 dollar       1     0
#>  5 D          2 euro         0     1
#>  6 D          3 dollar       1     1
#>  7 D          4 <NA>         1     1
#>  8 F          2 <NA>         0     0
#>  9 F          3 euro         0     1
#> 10 F          3 dollar       1     1
#> 11 F          4 <NA>         1     1
#> 12 F          4 <NA>         1     1

reprex包于2021-06-04创建(v0.3.0(

最新更新