如何转换我的列"支付";从长到宽格式,同时保持其他列不变?
对于每个级别的";字母";,当单元格在"0"的值之前时;支付";,则当在宽格式中对应的新变量"的这一行时;例如美元";将具有";0〃;;否则";1〃;。
我尝试了output_format_test<-input_format%>%tidyr::pivot_wider(names_from = age, values_from = payment)
,但它没有产生预期的结果。
##输入格式
input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
input_format
# A tibble: 11 x 3
letter age payment
<chr> <dbl> <chr>
1 A 2 NA
2 A 3 dollar
3 A 4 NA
4 D 2 euro
5 D 3 dollar
6 D 4 NA
7 F 2 NA
8 F 3 euro
9 F 3 dollar
10 F 4 NA
11 F 4 NA
##输出格式
output_format <- readr::read_table2(
"letter age payment dollar euro
A 2 NA 0 0
A 3 dollar 1 0
A 4 NA 1 0
D 2 euro 0 1
D 3 dollar 1 1
D 4 NA 1 1
F 2 NA 0 0
F 3 euro 0 1
F 3 dollar 1 1
F 4 NA 1 1
F 4 NA 1 1
")
output_format
# A tibble: 11 x 5
letter age payment dollar euro
<chr> <dbl> <chr> <dbl> <dbl>
1 A 2 NA 0 0
2 A 3 dollar 1 0
3 A 4 NA 1 0
4 D 2 euro 0 1
5 D 3 dollar 1 1
6 D 4 NA 1 1
7 F 2 NA 0 0
8 F 3 euro 0 1
9 F 3 dollar 1 1
10 F 4 NA 1 1
11 F 4 NA 1 1
谢谢。已编辑。
您还可以使用以下tidyverse
解决方案:
library(dplyr)
library(tidyr)
library(stringr)
input_format %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = payment, values_from = payment,
values_fn = length) %>%
select(- c(id, `NA`)) %>%
bind_cols(input_format$payment) %>%
rename_with(~ str_replace(., "\.\.\.\d+", "payment"), contains(fixed("..."))) %>%
relocate(letter, age, payment) %>%
group_by(letter) %>%
replace_na(list(dollar = 0, euro = 0)) %>%
mutate(across(dollar:euro, ~ cummax(.x))) -> input2
# A tibble: 11 x 5
# Groups: letter [3]
letter age payment dollar euro
<chr> <dbl> <chr> <dbl> <dbl>
1 A 2 NA 0 0
2 A 3 dollar 1 0
3 A 4 NA 1 0
4 D 2 euro 0 1
5 D 3 dollar 1 1
6 D 4 NA 1 1
7 F 2 NA 0 0
8 F 3 euro 0 1
9 F 3 dollar 1 1
10 F 4 NA 1 1
11 F 4 NA 1 1
在评论中讨论后,您可以使用以下解决方案来获得您想要的输出:
input2 %>%
group_by(letter, age) %>%
add_count() %>%
group_by(letter, age) %>%
filter((n == 2 & if_all(dollar:euro, ~ .x == 1)) | n == 1) %>%
select(-n) %>%
group_by(letter, age) %>%
add_count() %>%
group_split(letter, age) %>%
map_dfr(~ if(.x$n[1] == 2) {
.x %>% slice_tail(n = 1)
} else {
.x
})
# A tibble: 9 x 6
letter age payment dollar euro n
<chr> <dbl> <chr> <dbl> <dbl> <int>
1 A 2 NA 0 0 1
2 A 3 dollar 1 0 1
3 A 4 NA 1 0 1
4 D 2 euro 0 1 1
5 D 3 dollar 1 1 1
6 D 4 NA 1 1 1
7 F 2 NA 0 0 1
8 F 3 dollar 1 1 1
9 F 4 NA 1 1 2
Tidyverse方法
input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
library(tidyverse)
input_format %>% mutate(rowid = row_number(),
payment1 = payment,
dummy = 1) %>%
pivot_wider(id_cols = -c(payment1, dummy), names_from = payment1, values_from = dummy, values_fill = 0, values_fn = length) %>%
select(-`NA`) %>%
group_by(letter) %>%
mutate(across(c('dollar', 'euro'), cumsum))
#> # A tibble: 11 x 6
#> # Groups: letter [3]
#> letter age payment rowid dollar euro
#> <chr> <dbl> <chr> <int> <int> <int>
#> 1 A 2 <NA> 1 0 0
#> 2 A 3 dollar 2 1 0
#> 3 A 4 <NA> 3 1 0
#> 4 D 2 euro 4 0 1
#> 5 D 3 dollar 5 1 1
#> 6 D 4 <NA> 6 1 1
#> 7 F 2 <NA> 7 0 0
#> 8 F 3 euro 8 0 1
#> 9 F 3 dollar 9 1 1
#> 10 F 4 <NA> 10 1 1
#> 11 F 4 <NA> 11 1 1
创建于2021-06-04由reprex包(v2.0.0(
使用zoo(和data.table,但不是必需的(:
input_format <- fread("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
output_format <- copy(input_format)[payment == "dollar", dollar := 1][, dollar := na.locf0(dollar), by=.(letter)]
output_format[payment == "euro", euro := 1][, euro := na.locf0(euro), by=.(letter)]
output_format[, c("dollar", "euro")][is.na(output_format[, c("dollar", "euro")])] <- 0
其产生:
> output_format
letter age payment dollar euro
1: A 2 <NA> 0 0
2: A 3 dollar 1 0
3: A 4 <NA> 1 0
4: D 2 euro 0 1
5: D 3 dollar 1 1
6: D 4 <NA> 1 1
7: F 2 <NA> 0 0
8: F 3 euro 0 1
9: F 3 dollar 1 1
10: F 4 <NA> 1 1
11: F 4 <NA> 1 1
添加另一种方法:
我们可以使用map_dfc
和set_names
在payment
的命名向量上循环。
library(dplyr)
library(purrr)
input_format %>%
group_by(letter) %>%
mutate(map_dfc(unique(.$payment) %>% set_names(., .),
~ cumsum(!(payment != .x | is.na(payment)))
)) %>%
select(- `...1`)
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> # A tibble: 11 x 5
#> # Groups: letter [3]
#> letter age payment dollar euro
#> <chr> <dbl> <chr> <int> <int>
#> 1 A 2 <NA> 0 0
#> 2 A 3 dollar 1 0
#> 3 A 4 <NA> 1 0
#> 4 D 2 euro 0 1
#> 5 D 3 dollar 1 1
#> 6 D 4 <NA> 1 1
#> 7 F 2 <NA> 0 0
#> 8 F 3 euro 0 1
#> 9 F 3 dollar 1 1
#> 10 F 4 <NA> 1 1
#> 11 F 4 <NA> 1 1
我们可以用我在github上的包简化上面的代码,并使用over
代替map_dfc
,使用dist_values
代替unique
library(dplyover) # https://github.com/TimTeaFan/dplyover
input_format %>%
group_by(letter) %>%
mutate(over(dist_values(.$payment),
~ cumsum(!(payment != .x | is.na(payment)))
))
#> # A tibble: 11 x 5
#> # Groups: letter [3]
#> letter age payment dollar euro
#> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 A 2 <NA> 0 0
#> 2 A 3 dollar 1 0
#> 3 A 4 <NA> 1 0
#> 4 D 2 euro 0 1
#> 5 D 3 dollar 1 1
#> 6 D 4 <NA> 1 1
#> 7 F 2 <NA> 0 0
#> 8 F 3 euro 0 1
#> 9 F 3 dollar 1 1
#> 10 F 4 <NA> 1 1
#> 11 F 4 <NA> 1 1
reprex包于2021-06-04创建(v0.3.0(
然而,无论是我的答案还是@AnilGoyal接受的答案,当每个小组多次提到一种支付方式时,都无法处理数据。我不知道想要的答案是否应该解释这个案例。目前,只有@Wietse de Vries和@Anousiravan R的答案适用于这种数据:
input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
A 5 dollar # this line is new
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
我们可以很容易地调整上面的方法,如下所示来解释这种情况:
input_format %>%
group_by(letter) %>%
mutate(over(dist_values(.$payment),
~ ifelse(
cumsum(!(payment != .x | is.na(payment))) >= 1,
1, 0)
))
#> # A tibble: 12 x 5
#> # Groups: letter [3]
#> letter age payment dollar euro
#> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 A 2 <NA> 0 0
#> 2 A 3 dollar 1 0
#> 3 A 4 <NA> 1 0
#> 4 A 5 dollar 1 0
#> 5 D 2 euro 0 1
#> 6 D 3 dollar 1 1
#> 7 D 4 <NA> 1 1
#> 8 F 2 <NA> 0 0
#> 9 F 3 euro 0 1
#> 10 F 3 dollar 1 1
#> 11 F 4 <NA> 1 1
#> 12 F 4 <NA> 1 1
reprex包于2021-06-04创建(v0.3.0(