r语言 - 将匹配NA的df的字符行与所有内容进行比较,并基于比较创建新的列或df



我有一个包含字符值的非常大的数据帧。我想比较这些行,并基于比较创建id。问题是df中有NA我想让它们匹配任意值。另一个问题是,id也需要在同一步骤中创建(或者我以一种过于复杂的方式考虑这个问题)。

这是我创建的玩具df:

library(tidyverse)
library(purrr)
# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "G", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets
Var1 Var2 Var3 Var4
Set1 "A"  "B"  "C"  "A" 
Set2 "A"  "D"  "B"  "A" 
Set3 NA   "B"  "C"  "A" 
Set4 "A"  "D"  "B"  "A" 
Set5 "F"  "G"  NA   "F" 
Set6 "A"  "B"  "C"  "C" 

下面是期望的输出,无论是作为单独的df还是作为新列,任何一种都一样好:

# as new column
Var1 Var2 Var3 Var4 COMP
Set1 "A"  "B"  "C"  "A" "Group1"
Set2 "A"  "D"  "B"  "A" "Group2
Set3 NA   "B"  "C"  "A" "Group1"
Set4 "A"  "D"  "B"  "A" "Group3"
Set5 "F"  "G"  NA   "F" "Group4"
Set6 "A"  "B"  "C"  "C" "Group5"
# as new df
COMP
Set1 "Group1"
Set2 "Group2
Set3 "Group1"
Set4 "Group3"
Set5 "Group4"
Set6 "Group5"

我认为这可以用rowwise()map来实现,但即使在阅读了类似的问题之后,我也无法确切地弄清楚如何实现这一点,特别是如何连续一致地命名新组。如有任何意见,不胜感激。

您可以将NA替换为.,粘贴到字符串中并使用grepl()进行模式匹配。

library(magrittr)
sets <- as.data.frame(sets)
sets %>%
replace(is.na(sets), ".") %>%
do.call(paste0, .) %>%
outer(., ., function(x, y) mapply(grepl, x, y)) %>%
t() %>%
max.col(ties.method = "last") %>%
match(unique(.))
[1] 1 2 1 2 3 4

但是有可能将NAs视为野生将匹配多行,因此这样做可能更安全:

# Change Row 6 so both Row 6 and Row 1 match Row 3
Set6 <- c("B", "B", "C", "A")
sets %>%
replace(is.na(sets), ".") %>%
do.call(paste0, .) %>%
outer(., ., function(x, y) mapply(grepl, x, y)) %>%
apply(2, which)
[[1]]
[1] 1 3
[[2]]
[1] 2 4
[[3]]
[1] 3
[[4]]
[1] 2 4
[[5]]
[1] 5
[[6]]
[1] 3 6

说明哪一行与另一行(包括其本身)匹配。

一个非常丑陋的while循环解决方案,但我认为它是有效的。

#Change sets to dataframe
sets <- data.frame(sets)
result <- integer(nrow(sets))
group_count <- 1
x <- 1
while(any(result == 0)) {
a <- sets[-x, !is.na(sets[x, ])]
b <- na.omit(unlist(sets[x, ]))
inds <- which(rowSums(sweep(a, 2, as.matrix(b), `==`), na.rm = TRUE) == length(b))
#If a complete match is found
if(length(inds)) {
#Need to adjust the match since we are dropping 1 row from original df
if(all(inds > x)) {
result[c(x, inds + 1)] <- group_count  
} else {
result[c(x, inds)] <- group_count  
}
} else {
result[x] <- group_count
}
group_count <- group_count + 1
#Get the next row number to check. 
x <- which(result == 0)[1]
}
#Reset so you get counts in order 1, 2, 3...
result <- match(result, unique(result))
result
[1] 1 2 1 2 3 4

这里的逻辑是将数据帧中的每一行值与其他行值进行比较,去掉它们的NA值,如果有匹配,我们用group_count值更新行。

您可以在创建组id后执行一些模糊连接:

library(tidyverse)
library(fuzzyjoin)
library(stringdist)
#> 
#> Attaching package: 'stringdist'
#> The following object is masked from 'package:tidyr':
#> 
#>     extract
# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "D", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets
#>      Var1 Var2 Var3 Var4
#> Set1 "A"  "B"  "C"  "A" 
#> Set2 "A"  "D"  "B"  "A" 
#> Set3 NA   "B"  "C"  "A" 
#> Set4 "A"  "D"  "B"  "A" 
#> Set5 "F"  "G"  NA   "F" 
#> Set6 "A"  "B"  "C"  "C"
elements <-
sets %>%
as_tibble() %>%
pivot_longer(everything()) %>%
pull(value) %>%
unique() %>%
discard(is.na)
elements
#> [1] "A" "B" "C" "D" "F" "G"
groups <-
expand_grid(
Var1 = elements,
Var2 = elements,
Var3 = elements,
Var4 = elements
) %>%
mutate(group = row_number() %>% paste0("Group", .)) %>%
unite(group_str, starts_with("Var"))
groups
#> # A tibble: 1,296 × 2
#>    group_str group  
#>    <chr>     <chr>  
#>  1 A_A_A_A   Group1 
#>  2 A_A_A_B   Group2 
#>  3 A_A_A_C   Group3 
#>  4 A_A_A_D   Group4 
#>  5 A_A_A_F   Group5 
#>  6 A_A_A_G   Group6 
#>  7 A_A_B_A   Group7 
#>  8 A_A_B_B   Group8 
#>  9 A_A_B_C   Group9 
#> 10 A_A_B_D   Group10
#> # … with 1,286 more rows

匹配字符串x和y是精确的但也允许一个字符关闭,如果有一个#

compare <- function(x, y) {
(
stringdist(x, y) <= 1 & paste0(x, y) %>% str_count("#") == 1
) |
(
x == y
)
}
sets %>%
as_tibble(rownames = "set") %>%
mutate_all(~ .x %>% replace_na("#")) %>%
unite(group_str, starts_with("Var")) %>%
fuzzy_left_join(groups, match_fun = compare)
#> Joining by: "group_str"
#> # A tibble: 16 × 4
#>    set   group_str.x group_str.y group    
#>    <chr> <chr>       <chr>       <chr>    
#>  1 Set1  A_B_C_A     A_B_C_A     Group49  
#>  2 Set2  A_D_B_A     A_D_B_A     Group115 
#>  3 Set3  #_B_C_A     A_B_C_A     Group49  
#>  4 Set3  #_B_C_A     B_B_C_A     Group265 
#>  5 Set3  #_B_C_A     C_B_C_A     Group481 
#>  6 Set3  #_B_C_A     D_B_C_A     Group697 
#>  7 Set3  #_B_C_A     F_B_C_A     Group913 
#>  8 Set3  #_B_C_A     G_B_C_A     Group1129
#>  9 Set4  A_D_B_A     A_D_B_A     Group115 
#> 10 Set5  F_G_#_F     F_G_A_F     Group1049
#> 11 Set5  F_G_#_F     F_G_B_F     Group1055
#> 12 Set5  F_G_#_F     F_G_C_F     Group1061
#> 13 Set5  F_G_#_F     F_G_D_F     Group1067
#> 14 Set5  F_G_#_F     F_G_F_F     Group1073
#> 15 Set5  F_G_#_F     F_G_G_F     Group1079
#> 16 Set6  A_B_C_C     A_B_C_C     Group51

由reprex包(v2.0.1)于2021-09-25创建

最新更新