我有一个包含字符值的非常大的数据帧。我想比较这些行,并基于比较创建id。问题是df中有NA我想让它们匹配任意值。另一个问题是,id也需要在同一步骤中创建(或者我以一种过于复杂的方式考虑这个问题)。
这是我创建的玩具df:
library(tidyverse)
library(purrr)
# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "G", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets
Var1 Var2 Var3 Var4
Set1 "A" "B" "C" "A"
Set2 "A" "D" "B" "A"
Set3 NA "B" "C" "A"
Set4 "A" "D" "B" "A"
Set5 "F" "G" NA "F"
Set6 "A" "B" "C" "C"
下面是期望的输出,无论是作为单独的df还是作为新列,任何一种都一样好:
# as new column
Var1 Var2 Var3 Var4 COMP
Set1 "A" "B" "C" "A" "Group1"
Set2 "A" "D" "B" "A" "Group2
Set3 NA "B" "C" "A" "Group1"
Set4 "A" "D" "B" "A" "Group3"
Set5 "F" "G" NA "F" "Group4"
Set6 "A" "B" "C" "C" "Group5"
# as new df
COMP
Set1 "Group1"
Set2 "Group2
Set3 "Group1"
Set4 "Group3"
Set5 "Group4"
Set6 "Group5"
我认为这可以用rowwise()
和map
来实现,但即使在阅读了类似的问题之后,我也无法确切地弄清楚如何实现这一点,特别是如何连续一致地命名新组。如有任何意见,不胜感激。
您可以将NA
替换为.
,粘贴到字符串中并使用grepl()
进行模式匹配。
library(magrittr)
sets <- as.data.frame(sets)
sets %>%
replace(is.na(sets), ".") %>%
do.call(paste0, .) %>%
outer(., ., function(x, y) mapply(grepl, x, y)) %>%
t() %>%
max.col(ties.method = "last") %>%
match(unique(.))
[1] 1 2 1 2 3 4
但是有可能将NA
s视为野生将匹配多行,因此这样做可能更安全:
# Change Row 6 so both Row 6 and Row 1 match Row 3
Set6 <- c("B", "B", "C", "A")
sets %>%
replace(is.na(sets), ".") %>%
do.call(paste0, .) %>%
outer(., ., function(x, y) mapply(grepl, x, y)) %>%
apply(2, which)
[[1]]
[1] 1 3
[[2]]
[1] 2 4
[[3]]
[1] 3
[[4]]
[1] 2 4
[[5]]
[1] 5
[[6]]
[1] 3 6
说明哪一行与另一行(包括其本身)匹配。
一个非常丑陋的while
循环解决方案,但我认为它是有效的。
#Change sets to dataframe
sets <- data.frame(sets)
result <- integer(nrow(sets))
group_count <- 1
x <- 1
while(any(result == 0)) {
a <- sets[-x, !is.na(sets[x, ])]
b <- na.omit(unlist(sets[x, ]))
inds <- which(rowSums(sweep(a, 2, as.matrix(b), `==`), na.rm = TRUE) == length(b))
#If a complete match is found
if(length(inds)) {
#Need to adjust the match since we are dropping 1 row from original df
if(all(inds > x)) {
result[c(x, inds + 1)] <- group_count
} else {
result[c(x, inds)] <- group_count
}
} else {
result[x] <- group_count
}
group_count <- group_count + 1
#Get the next row number to check.
x <- which(result == 0)[1]
}
#Reset so you get counts in order 1, 2, 3...
result <- match(result, unique(result))
result
[1] 1 2 1 2 3 4
这里的逻辑是将数据帧中的每一行值与其他行值进行比较,去掉它们的NA
值,如果有匹配,我们用group_count
值更新行。
您可以在创建组id后执行一些模糊连接:
library(tidyverse)
library(fuzzyjoin)
library(stringdist)
#>
#> Attaching package: 'stringdist'
#> The following object is masked from 'package:tidyr':
#>
#> extract
# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "D", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets
#> Var1 Var2 Var3 Var4
#> Set1 "A" "B" "C" "A"
#> Set2 "A" "D" "B" "A"
#> Set3 NA "B" "C" "A"
#> Set4 "A" "D" "B" "A"
#> Set5 "F" "G" NA "F"
#> Set6 "A" "B" "C" "C"
elements <-
sets %>%
as_tibble() %>%
pivot_longer(everything()) %>%
pull(value) %>%
unique() %>%
discard(is.na)
elements
#> [1] "A" "B" "C" "D" "F" "G"
groups <-
expand_grid(
Var1 = elements,
Var2 = elements,
Var3 = elements,
Var4 = elements
) %>%
mutate(group = row_number() %>% paste0("Group", .)) %>%
unite(group_str, starts_with("Var"))
groups
#> # A tibble: 1,296 × 2
#> group_str group
#> <chr> <chr>
#> 1 A_A_A_A Group1
#> 2 A_A_A_B Group2
#> 3 A_A_A_C Group3
#> 4 A_A_A_D Group4
#> 5 A_A_A_F Group5
#> 6 A_A_A_G Group6
#> 7 A_A_B_A Group7
#> 8 A_A_B_B Group8
#> 9 A_A_B_C Group9
#> 10 A_A_B_D Group10
#> # … with 1,286 more rows
匹配字符串x和y是精确的但也允许一个字符关闭,如果有一个#
compare <- function(x, y) {
(
stringdist(x, y) <= 1 & paste0(x, y) %>% str_count("#") == 1
) |
(
x == y
)
}
sets %>%
as_tibble(rownames = "set") %>%
mutate_all(~ .x %>% replace_na("#")) %>%
unite(group_str, starts_with("Var")) %>%
fuzzy_left_join(groups, match_fun = compare)
#> Joining by: "group_str"
#> # A tibble: 16 × 4
#> set group_str.x group_str.y group
#> <chr> <chr> <chr> <chr>
#> 1 Set1 A_B_C_A A_B_C_A Group49
#> 2 Set2 A_D_B_A A_D_B_A Group115
#> 3 Set3 #_B_C_A A_B_C_A Group49
#> 4 Set3 #_B_C_A B_B_C_A Group265
#> 5 Set3 #_B_C_A C_B_C_A Group481
#> 6 Set3 #_B_C_A D_B_C_A Group697
#> 7 Set3 #_B_C_A F_B_C_A Group913
#> 8 Set3 #_B_C_A G_B_C_A Group1129
#> 9 Set4 A_D_B_A A_D_B_A Group115
#> 10 Set5 F_G_#_F F_G_A_F Group1049
#> 11 Set5 F_G_#_F F_G_B_F Group1055
#> 12 Set5 F_G_#_F F_G_C_F Group1061
#> 13 Set5 F_G_#_F F_G_D_F Group1067
#> 14 Set5 F_G_#_F F_G_F_F Group1073
#> 15 Set5 F_G_#_F F_G_G_F Group1079
#> 16 Set6 A_B_C_C A_B_C_C Group51
由reprex包(v2.0.1)于2021-09-25创建