我正在使用R
来处理一些数据帧。我的问题是:如何检查第一个数据框中某个变量的值是否与另一个数据框中的值匹配。这种匹配与merge
或join
等连接操作非常不同。我将介绍我的数据帧(dput()
在末尾):
我的第一个数据帧是df1
。它包含我想与第二个数据帧中的其他变量进行对比的变量name
。它看起来像这样:
df1
name
1 JUAN GIRON
2 GINA OLEAS
3 JUAN FERNANDO ELIZAGA
4 MARCO TORRES
5 JUAN PABLO GONZALEZ
6 IRMA GOMEZ
第二个数据帧是df2
。它还包含一个可变name
,它将用于与df1
name
形成对比。它看起来像这样(在真实情况下df2
可能非常大,超过 1000 行):
df2
name val
1 JUANA MARQUEZ 1
2 FERNANDO ELIZAGA 2
3 IRMA GOMEZ 3
4 PABLO GONZALEZ 4
5 GINA LUCIO 5
6 MARK TORRES 6
7 LETICIA BLACIO 7
8 JUAN PABLO GIRON BELTRAN 8
我正在寻找一种方法,检查 df1 中 name 变量的每一行是否包含于 df2 中的某个 name 值,或与之匹配。例如,df1 中的 JUAN GIRON 在与 df2 的 name 比对之后应返回 yes,因为它包含在 df2 的字符串 JUAN PABLO GIRON BELTRAN 中。其他值同理。最后,我希望得到如下结果:
df3
name val
1 JUAN GIRON yes
2 GINA OLEAS no
3 JUAN FERNANDO ELIZAGA yes
4 MARCO TORRES no
5 JUAN PABLO GONZALEZ yes
6 IRMA GOMEZ yes
我怎样才能得到这个结果?我已经尝试用 | 连接字符串并传给 grepl(),但它不起作用,因为某些实际上并不匹配的值也会返回 yes。
此外,由于数据可能很大,我希望有一个dplyr
的解决方案,因为比较是按行进行的,因此可能会很慢。或者欢迎任何快速解决方案。非常感谢!
接下来是数据:
# Example data: the names to look up.
df1 <- data.frame(
  name = c(
    "JUAN GIRON", "GINA OLEAS", "JUAN FERNANDO ELIZAGA",
    "MARCO TORRES", "JUAN PABLO GONZALEZ", "IRMA GOMEZ"
  ),
  stringsAsFactors = FALSE
)

# Example data: the reference names, each with an integer id in `val`.
df2 <- data.frame(
  name = c(
    "JUANA MARQUEZ", "FERNANDO ELIZAGA", "IRMA GOMEZ", "PABLO GONZALEZ",
    "GINA LUCIO", "MARK TORRES", "LETICIA BLACIO", "JUAN PABLO GIRON BELTRAN"
  ),
  val = 1:8,
  stringsAsFactors = FALSE
)
也许我们可以这样做
# Requires dplyr (for %>% and mutate).
# Split every name into words and compare each df1 name with each df2 name:
# a pair counts as a match when the words of one name are all contained in the
# other. A df1 row gets "yes" when at least one df2 name matches.
df1 %>%
  mutate(val = c("no", "yes")[1 + (rowSums(
    outer(
      # "\\s+" is the regex \s+ (one or more whitespace characters); the
      # backslash has to be doubled inside an R string literal.
      strsplit(name, "\\s+"),
      strsplit(df2$name, "\\s+"),
      # outer() hands over whole lists, so the pairwise test must be vectorized.
      Vectorize(function(x, y) all(x %in% y) || all(y %in% x))
    )
  ) > 0)])
这给了
name val
1 JUAN GIRON yes
2 GINA OLEAS no
3 JUAN FERNANDO ELIZAGA yes
4 MARCO TORRES no
5 JUAN PABLO GONZALEZ yes
6 IRMA GOMEZ yes
这是一种使用正则表达式模式并处理长度为 2 或 3 的名称的方法。还有改进的余地,我很想阅读这个问题的其他答案。
# Input: every df2 name split into its words.
a <- strsplit(df2$name, " ")

# Output: one regex per df2 name, made of every in-order pair of its words
# joined by "|" (for "A B C": "A B|A C|B C"). This works for any name with two
# or more words, so the four-word name in the example data contributes
# "JUAN GIRON" among its pairs.
b <- vapply(a, function(words) {
  if (length(words) < 2L) {
    stop(
      "Split name must have at least 2 words: ",
      paste(words, collapse = " ")
    )
  }
  # combn() enumerates the pairs in order (1,2), (1,3), ..., (n-1,n).
  paste(combn(words, 2L, FUN = paste, collapse = " "), collapse = "|")
}, character(1))

# A df1 name matches when it contains any of the word pairs.
# With no patterns at all the combined regex would be "" (which matches
# everything), so that case is answered with FALSE explicitly.
df1$val <- if (length(b) > 0L) {
  grepl(paste(b, collapse = "|"), df1$name)
} else {
  rep(FALSE, nrow(df1))
}
或者,使用上面的循环定义b
后:
library(dplyr)
# Join the per-name alternatives into one pattern and flag the df1 rows whose
# name contains any of them.
patt <- paste0(b, collapse = "|")
df1 %>%
  mutate(val = grepl(pattern = patt, x = name))
结果:
> df1
name val
1 JUAN GIRON TRUE
2 GINA OLEAS FALSE
3 JUAN FERNANDO ELIZAGA TRUE
4 MARCO TORRES FALSE
5 JUAN PABLO GONZALEZ TRUE
6 IRMA GOMEZ TRUE
ThomasIsCoding 的答案很棒。但是使用outer()
会非常消耗内存且不可并行化。以下解决方案使用嵌套map()
。此外,furrr
包用于并行化外部map()
。
具有更大df2
的基准测试表明,嵌套和并行化都产生了显着的加速,总速度约为两倍。
数据和包
library(tidyverse)
library(furrr)
# Example data: the names to look up.
df1 <- data.frame(
  name = c(
    "JUAN GIRON", "GINA OLEAS", "JUAN FERNANDO ELIZAGA",
    "MARCO TORRES", "JUAN PABLO GONZALEZ", "IRMA GOMEZ"
  ),
  stringsAsFactors = FALSE
)

# Example data: the reference names, each with an integer id in `val`.
df2 <- data.frame(
  name = c(
    "JUANA MARQUEZ", "FERNANDO ELIZAGA", "IRMA GOMEZ", "PABLO GONZALEZ",
    "GINA LUCIO", "MARK TORRES", "LETICIA BLACIO", "JUAN PABLO GIRON BELTRAN"
  ),
  val = 1:8,
  stringsAsFactors = FALSE
)
小型数据集演示
plan(multisession, workers = 8) # 8 for my quad-core with hyperthreading

# df2 names split into words once, up front.
n2 <- df2$name |>
  str_split("\\s+")

# For each df1 name (outer map, run in parallel) scan all df2 names (inner map)
# and report whether the words of either name are all contained in the other.
df1 |>
  mutate(
    val = name |>
      str_split("\\s+") |>
      future_map_lgl(\(n1e) {
        map_lgl(n2, \(n2e) all(n1e %in% n2e) || all(n2e %in% n1e)) |>
          any()
      }) |>
      # Fix both levels explicitly: with labels alone, factor() fails when the
      # result happens to contain only TRUE or only FALSE.
      factor(levels = c(FALSE, TRUE), labels = c("no", "yes"))
  )
#> name val
#> 1 JUAN GIRON yes
#> 2 GINA OLEAS no
#> 3 JUAN FERNANDO ELIZAGA yes
#> 4 MARCO TORRES no
#> 5 JUAN PABLO GONZALEZ yes
#> 6 IRMA GOMEZ yes
基准代码
# Make df2 much larger: 10000 stacked copies of the example data.
df2xl <- df2 |>
  list() |>
  rep(10000) |>
  bind_rows()

# Compare the outer()-based answer with the nested-map version, sequential and
# parallel. Results are not cross-checked (check = FALSE) because the
# approaches return different column types (character vs factor).
bench::mark(
  Thomas = df1 %>%
    mutate(val = c("no", "yes")[1 + (rowSums(
      outer(
        strsplit(name, "\\s+"),
        strsplit(df2xl$name, "\\s+"),
        Vectorize(function(x, y) all(x %in% y) | all(y %in% x))
      )
    ) > 0)]),
  nested_map = {
    n2 <- df2xl$name |>
      str_split("\\s+")
    df1 |>
      mutate(
        val = name |>
          str_split("\\s+") |>
          map_lgl(\(n1e) {
            map_lgl(n2, \(n2e) all(n1e %in% n2e) | all(n2e %in% n1e)) |>
              any()
          }) |>
          factor(levels = c(FALSE, TRUE), labels = c("no", "yes"))
      )
  },
  parallel_nested_map = {
    n2 <- df2xl$name |>
      str_split("\\s+")
    df1 |>
      mutate(
        val = name |>
          str_split("\\s+") |>
          future_map_lgl(\(n1e) {
            map_lgl(n2, \(n2e) all(n1e %in% n2e) | all(n2e %in% n1e)) |>
              any()
          }) |>
          factor(levels = c(FALSE, TRUE), labels = c("no", "yes"))
      )
  },
  check = FALSE,
  min_iterations = 10,
  filter_gc = FALSE
)
基准测试结果
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 Thomas 2.48s 2.53s 0.396 21.51MB 8.31
#> 2 nested_map 1.72s 1.73s 0.563 2.46MB 8.28
#> 3 parallel_nested_map 1.07s 1.22s 0.827 2.86MB 2.56
创建于 2022-04-12 由 reprex 软件包 (v2.0.1)
如果名称各部分的顺序不变,可以在各部分之间插入 .*,然后在两个方向上使用 grep(在 df2$name 中查找 df1$name,以及在 df1$name 中查找 df2$name),并用"或"运算符 | 将两个结果组合在一起。
# Order-preserving, partial-word match in both directions.
# Each name becomes a regex with ".*" between its words ("JUAN.*GIRON").
transform(df1, val = c("no", "yes")[1 + (
  # Direction 1: does the df1 pattern occur in any df2 name?
  vapply(
    gsub(" +", ".*", df1$name),
    \(x) any(grepl(x, df2$name)),
    logical(1),
    USE.NAMES = FALSE
  ) |
    # Direction 2: does any df2 pattern occur in the df1 name? Reduce() ORs
    # the per-pattern hits together, starting from FALSE.
    Reduce(
      \(y, x) y | grepl(x, df1$name),
      gsub(" +", ".*", df2$name),
      FALSE
    )
)])
# name val
#1 JUAN GIRON yes
#2 GINA OLEAS no
#3 JUAN FERNANDO ELIZAGA yes
#4 MARCO TORRES no
#5 JUAN PABLO GONZALEZ yes
#6 IRMA GOMEZ yes
如果不允许名称的部分匹配,则用 \b(单词边界)将每个名称括起来。
# Order-preserving match on whole words, in both directions.
# gsub() rewrites "JUAN GIRON" into the regex ".*\bJUAN\b.*\bGIRON\b":
#   pattern     " *(\\b[^ ]+\\b)"   -> optional spaces, then one whole word
#   replacement ".*\\\\b\\1\\\\b"   -> .*\b<word>\b  (a literal backslash in a
#                                      replacement is "\\\\" in R source)
transform(df1, val = c("no", "yes")[1 + (
  # Direction 1: does the df1 pattern occur in any df2 name?
  vapply(
    gsub(" *(\\b[^ ]+\\b)", ".*\\\\b\\1\\\\b", df1$name),
    \(x) any(grepl(x, df2$name)),
    logical(1),
    USE.NAMES = FALSE
  ) |
    # Direction 2: does any df2 pattern occur in the df1 name?
    Reduce(
      \(y, x) y | grepl(x, df1$name),
      gsub(" *(\\b[^ ]+\\b)", ".*\\\\b\\1\\\\b", df2$name),
      FALSE
    )
)])
如果各部分的顺序可以改变,则使用正向先行断言(positive lookahead):把每个名称放在 (?=.*NAME) 中,或者再用 \b 包围名称,即 (?=.*\bNAME\b)。
# Order-independent match on whole words, in both directions.
# gsub() rewrites "JUAN GIRON" into "(?=.*\bJUAN\b)(?=.*\bGIRON\b)": one
# positive lookahead per word, so the words may appear in any order.
# Lookaheads need the PCRE engine, hence perl = TRUE on the matching calls.
transform(df1, val = c("no", "yes")[1 + (
  # Direction 1: does any df2 name satisfy the df1 lookaheads?
  vapply(
    gsub(" *(\\b[^ ]+\\b)", "(?=.*\\\\b\\1\\\\b)", df1$name),
    \(x) any(grepl(x, df2$name, perl = TRUE)),
    logical(1),
    USE.NAMES = FALSE
  ) |
    # Direction 2: does the df1 name satisfy the lookaheads of any df2 name?
    Reduce(
      \(y, x) y | grepl(x, df1$name, perl = TRUE),
      gsub(" *(\\b[^ ]+\\b)", "(?=.*\\\\b\\1\\\\b)", df2$name),
      FALSE
    )
)])
也可以使用agrepl
并允许删除,这将类似于版本,假设名称的顺序不会改变并且允许名称的部分匹配。
# Approximate matching that allows only deletions (up to 99) and no insertions
# or substitutions, tried in both directions.
transform(df1, val = c("no", "yes")[1 + (
  # Direction 1: each df1 name as the pattern, searched in all df2 names.
  vapply(
    df1$name,
    \(x) any(agrepl(
      x, df2$name,
      max.distance = list(
        cost = 99, insertions = 0, deletions = 99, substitutions = 0
      )
    )),
    logical(1),
    USE.NAMES = FALSE
  ) |
    # Direction 2: each df2 name as the pattern, searched in all df1 names;
    # Reduce() ORs the per-pattern hits together, starting from FALSE.
    Reduce(
      \(y, x) y | agrepl(
        x, df1$name,
        max.distance = list(
          cost = 99, insertions = 0, deletions = 99, substitutions = 0
        )
      ),
      df2$name,
      FALSE
    )
)])
另一种选择是使用查找表:
# Lookup-table approach: for each data frame build an environment that maps a
# word to the row indices in which that word occurs.
s1 <- strsplit(df1$name, " ", fixed = TRUE)
lup1 <- list2env(split(rep(seq_along(s1), lengths(s1)), unlist(s1)))
s2 <- strsplit(df2$name, " ", fixed = TRUE)
lup2 <- list2env(split(rep(seq_along(s2), lengths(s2)), unlist(s2)))

# Direction 1: a df1 name matches when some df2 row contains all of its words,
# i.e. the intersection of the per-word row sets is non-empty. Words that do
# not occur in df2 yield NULL and therefore an empty intersection.
hit <- vapply(
  s1,
  \(x) any(Reduce(intersect, mget(x, lup2, ifnotfound = list(NULL)))),
  logical(1)
)

# Direction 2: for every df2 name collect the df1 rows containing all of its
# words and mark those rows as matched too.
hit[unlist(lapply(
  s2,
  \(x) Reduce(intersect, mget(x, lup1, ifnotfound = list(NULL)))
))] <- TRUE
hit
#[1] TRUE FALSE TRUE FALSE TRUE TRUE
基准:
也可以只对尚未匹配的索引进行比较(GKi1b);其中,用 which 得到的索引代替两次使用逻辑向量,并在全部命中后提前退出循环,或许还能进一步改进。如果名称不唯一,请先对名称使用 unique。
library(dplyr)
# Benchmark of all variants on the small example data.
# Requires dplyr (loaded above) for the `Thomas` entry.
bench::mark(
  # outer()/Vectorize over the split names.
  Thomas = df1 %>%
    mutate(val = c("no", "yes")[1 + (rowSums(
      outer(
        strsplit(name, "\\s+"),
        strsplit(df2$name, "\\s+"),
        Vectorize(function(x, y) all(x %in% y) | all(y %in% x))
      )
    ) > 0)]),
  # ".*" between words, both directions.
  GKi1 = transform(df1, val = c("no", "yes")[1 + (
    sapply(gsub(" +", ".*", df1$name),
      \(x) any(grep(x, df2$name)), USE.NAMES = FALSE) |
      Reduce(\(y, x) y | grepl(x, df1$name), gsub(" +", ".*", df2$name), FALSE)
  )]),
  # As GKi1, but the second direction only re-tests rows not yet matched.
  GKi1b = transform(df1, val = c("no", "yes")[1 +
    Reduce(
      \(i, x) `[<-`(i, !i, grepl(x, df1$name[!i])),
      gsub(" +", ".*", df2$name),
      sapply(gsub(" +", ".*", df1$name),
        \(x) any(grep(x, df2$name)), USE.NAMES = FALSE)
    )]),
  # As GKi1, but the second direction uses one combined "|" pattern.
  GKi1c = transform(df1, val = c("no", "yes")[1 + (
    sapply(gsub(" +", ".*", df1$name),
      \(x) any(grep(x, df2$name)), USE.NAMES = FALSE) |
      grepl(paste(gsub(" +", ".*", df2$name), collapse = "|"), df1$name)
  )]),
  # Whole words only (\b around each word), order preserved.
  GKi2 = transform(df1, val = c("no", "yes")[1 + (
    sapply(
      gsub(" *(\\b[^ ]+\\b)", ".*\\\\b\\1\\\\b", df1$name),
      \(x) any(grep(x, df2$name)), USE.NAMES = FALSE) |
      Reduce(\(y, x) y | grepl(x, df1$name),
        gsub(" *(\\b[^ ]+\\b)", ".*\\\\b\\1\\\\b", df2$name), FALSE)
  )]),
  # Whole words in any order (one lookahead per word, PCRE).
  GKi3 = transform(df1, val = c("no", "yes")[1 + (
    sapply(
      gsub(" *(\\b[^ ]+\\b)", "(?=.*\\\\b\\1\\\\b)", df1$name),
      \(x) any(grep(x, df2$name, perl = TRUE)), USE.NAMES = FALSE) |
      Reduce(\(y, x) y | grepl(x, df1$name, perl = TRUE),
        gsub(" *(\\b[^ ]+\\b)", "(?=.*\\\\b\\1\\\\b)", df2$name), FALSE)
  )]),
  # Approximate matching allowing deletions only.
  GKi4 = transform(df1, val = c("no", "yes")[1 + (
    sapply(df1$name, \(x) any(agrepl(x, df2$name,
      list(cost = 99, insertions = 0, deletions = 99, substitutions = 0)))) |
      Reduce(\(y, x) y | agrepl(x, df1$name, list(cost = 99, insertions = 0,
        deletions = 99, substitutions = 0)), df2$name, FALSE)
  )]),
  # Word -> row-index lookup tables. base::intersect is spelled out because
  # dplyr, loaded above, also exports an intersect().
  GKi5 = {
    s1 <- strsplit(df1$name, " ", TRUE)
    lup1 <- list2env(split(rep(seq_along(s1), lengths(s1)), unlist(s1)))
    s2 <- strsplit(df2$name, " ", TRUE)
    lup2 <- list2env(split(rep(seq_along(s2), lengths(s2)), unlist(s2)))
    transform(df1, val = c("no", "yes")[1 + `[<-`(
      sapply(s1, \(x) any(Reduce(base::intersect,
        mget(x, lup2, ifnotfound = list(NULL))))),
      unlist(lapply(s2, \(x) Reduce(base::intersect,
        mget(x, lup1, ifnotfound = list(NULL))))),
      TRUE
    )])
  }
)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:tm> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 Thomas 863µs 894µs 919. 4.08KB 21.4 429 10 467ms
2 GKi1 211µs 218µs 3770. 0B 14.6 1803 7 478ms
3 GKi1b 211µs 226µs 3020. 0B 14.6 1448 7 479ms
4 GKi1c 183µs 200µs 3424. 0B 10.3 1667 5 487ms
5 GKi2 262µs 275µs 2755. 0B 12.4 1336 6 485ms
6 GKi3 391µs 409µs 2010. 0B 9.19 875 4 435ms
7 GKi4 374µs 386µs 2295. 0B 16.5 1110 8 484ms
8 GKi5 272µs 285µs 2570. 2.82KB 21.1 1220 10 475ms
在只使用一个 CPU 内核的情况下,所有变体都比 ThomasIsCoding 的方案快 2 倍以上。
library(data.table)
library(stringi)
library(purrr)
# Convert df1 to a data.table so that `:=` can be used to add the column.
setDT(df1)
# Turn each df1 name into an alternation of its words ("JUAN|GIRON"), count how
# often that pattern matches inside every df2 name, and answer "yes" when some
# df2 name reaches at least 2 hits, "no" otherwise. The trailing [] prints the
# updated table.
# NOTE(review): the `..` prefix presumably makes data.table look up `df2` in
# the calling scope instead of among df1's columns -- confirm.
# NOTE(review): the regex matches substrings, so "JUAN" is also counted inside
# "JUANA".
df1[,val := fifelse(map_lgl(stri_replace_all_fixed(name, " ", "|"), ~any(stri_count_regex(..df2$name, .x) >= 2)), "yes", "no")][]
#> name val
#> 1: JUAN GIRON yes
#> 2: GINA OLEAS no
#> 3: JUAN FERNANDO ELIZAGA yes
#> 4: MARCO TORRES no
#> 5: JUAN PABLO GONZALEZ yes
#> 6: IRMA GOMEZ yes
对于 df1 中的每个名称(针),这会生成一个向量,给出该名称的各部分在 df2(干草堆)每一行中的部分匹配次数。
[[1]]
[1] 1 0 0 0 0 0 0 2
[[2]]
[1] 0 0 0 0 1 0 0 0
[[3]]
[1] 1 2 0 0 0 0 0 1
[[4]]
[1] 0 0 0 0 0 1 0 0
[[5]]
[1] 1 0 0 2 0 0 0 2
[[6]]
[1] 0 0 2 0 0 0 0 0
我将导致"是"的最小匹配数设置为 2,但这个数字可以调整(假设我们不知道哪个部分是姓氏或名字)。
我在这里追求的是健壮,而不是速度或优雅。我确信它可以被清理和加速,但这解决了所有讨论的问题,包括在评论中关于分别处理家族和名字匹配的添加规范。
首先,以下是把姓氏(Family)和名字(Given)分开后的更新数据:
# Names to check, with given names and family names in separate columns.
df1_split <- tibble(
  Given = c(
    "JUAN", "GINA", "JUAN FERNANDO",
    "MARCO", "JUAN PABLO", "IRMA",
    "JUAN", "JUAN CARLOS"
  ),
  Family = c(
    "GIRON", "OLEAS", "ELIZAGA",
    "TORRES", "GONZALEZ", "GOMEZ",
    "GOMEZ", "MARTINEZ"
  )
)

# Reference names, split the same way.
df2_split <- tibble(
  Given = c(
    "JUANA", "FERNANDO",
    "IRMA", "PABLO", "GINA", "MARK",
    "LETICIA", "JUAN PABLO",
    "FERNANDO CARLOS",
    "JUAN FERNANDO"
  ),
  Family = c(
    "MARQUEZ", "ELIZAGA",
    "GOMEZ", "GONZALEZ", "LUCIO", "TORRES",
    "BLACIO", "GIRON BELTRAN",
    "MARTINEZ",
    "ELIZAGA"
  )
)
请注意,我添加了几个名称以突出显示双倍名称的一些问题。
然后,此函数可以检查任意一组名称。它根据名称是单名(例如"胡安")还是双名(例如"胡安卡洛斯")以不同的方式处理。对于待检查的单名,只需查看它是否出现在被比对的名称中。对于待检查的双名:如果被比对的名称也是双名,则检查两者是否按相同的顺序完全一致;如果被比对的名称只有一个词,则检查它是否等于双名中的任意一个词。
# Find, for every name in `to_check`, the positions in `against` it is
# compatible with.
#
# Names are split on single spaces and may consist of one or two words:
#   * one-word name to check: matches every `against` entry that contains it
#     as one of its words;
#   * two-word name to check: matches a two-word `against` entry when both
#     words agree in the same order, and a one-word `against` entry when that
#     word equals either of the two.
#
# @param to_check Character vector of names to look up.
# @param against  Character vector of names to compare with.
# @return A list with one integer vector per element of `to_check`, holding
#   the matching indices into `against` (integer(0) when there are none).
# Stops with an error when a name to check does not have one or two words, or
# when a two-word name meets an `against` entry without one or two words.
check_names <- function(to_check, against) {
  split_against <- str_split(against, " ")

  match_one <- function(this_name) {
    n_words <- length(this_name)
    if (n_words != 1L && n_words != 2L) {
      # Collapse with a space so the offending name stays readable.
      stop(
        "Names (to_check) cannot have three words: ",
        paste(this_name, collapse = " ")
      )
    }

    # vapply() keeps the result logical even when `against` is empty, so
    # which() below always receives a logical vector.
    hits <- vapply(split_against, function(this_against) {
      if (n_words == 1L) {
        return(any(this_name == this_against))
      }
      if (length(this_against) == 2L) {
        return(all(this_against == this_name))
      }
      if (length(this_against) == 1L) {
        return(any(this_against == this_name))
      }
      stop(
        "Names (against) cannot have three words: ",
        paste(this_against, collapse = " ")
      )
    }, logical(1))

    which(hits)
  }

  lapply(str_split(to_check, " "), match_one)
}
然后,我们把该函数包装起来,分别传入名字(given)和姓氏(family),再检查结果,看是否存在同时匹配姓氏和名字的索引。
# Report, per name to check, whether some reference entry matches on both the
# given name and the family name.
#
# @param to_check_given,to_check_family Given and family names to check
#   (parallel character vectors).
# @param against_given,against_family Given and family names of the reference
#   entries (parallel character vectors).
# @return Character vector of "yes"/"no", one element per name to check.
check_both_simple <- function(to_check_given, to_check_family, against_given, against_family) {
  checked_given <- check_names(to_check_given, against_given)
  checked_family <- check_names(to_check_family, against_family)

  # Keep only the reference indices found by both the given-name and the
  # family-name check. seq_along() keeps this safe for zero-length input.
  valid_matches <- lapply(seq_along(checked_given), function(idx) {
    checked_given[[idx]][checked_given[[idx]] %in% checked_family[[idx]]]
  })

  # Index into the two labels so the result is character even when empty.
  c("no", "yes")[(lengths(valid_matches) > 0) + 1L]
}
然后,我们可以在 mutate 的调用中使用它来添加列:
# Add a yes/no column saying whether each df1_split row has a counterpart in
# df2_split.
df1_split %>%
  mutate(
    Match = check_both_simple(
      to_check_given = Given,
      to_check_family = Family,
      against_given = df2_split$Given,
      against_family = df2_split$Family
    )
  )
返回:
# A tibble: 8 × 3
Given Family Match
<chr> <chr> <chr>
1 JUAN GIRON yes
2 GINA OLEAS no
3 JUAN FERNANDO ELIZAGA yes
4 MARCO TORRES no
5 JUAN PABLO GONZALEZ yes
6 IRMA GOMEZ yes
7 JUAN GOMEZ no
8 JUAN CARLOS MARTINEZ no
并且还应该处理评论中讨论的所有奇怪的边缘情况。
这种方法的好处(也是我首先如此健壮地构建它的部分原因)是,您还可以设置函数以返回匹配的索引。
# Like a yes/no check, but return the matching reference indices themselves.
#
# @param to_check_given,to_check_family Given and family names to check
#   (parallel character vectors).
# @param against_given,against_family Given and family names of the reference
#   entries (parallel character vectors).
# @return A list with one integer vector per name to check: the indices of
#   the reference entries that match on both given and family name.
check_both_idx <- function(to_check_given, to_check_family, against_given, against_family) {
  checked_given <- check_names(to_check_given, against_given)
  checked_family <- check_names(to_check_family, against_family)

  # seq_along() instead of 1:length() so zero-length input yields list().
  lapply(seq_along(checked_given), function(idx) {
    checked_given[[idx]][checked_given[[idx]] %in% checked_family[[idx]]]
  })
}
使用它,您实际上可以提取找到的匹配项并手动检查它们。这将允许您识别任何其他边缘情况,在这些情况下,您不同意找到的匹配项,或者一个匹配项可能明显优于另一个匹配项。
# Show, for every df1_split row, which df2_split rows matched.
df1_split %>%
  mutate(
    # List column: matching row indices into df2_split.
    Match_idx = check_both_idx(
      Given, Family,
      df2_split$Given, df2_split$Family
    ),
    # The matched reference names, "Given Family", separated by "; ".
    Matches = vapply(Match_idx, function(this_idx_set) {
      paste(
        paste(df2_split$Given[this_idx_set], df2_split$Family[this_idx_set]),
        collapse = "; "
      )
    }, character(1)),
    Match = ifelse(lengths(Match_idx) > 0, "yes", "no"),
    # Finally flatten the index list itself into a printable string.
    Match_idx = vapply(Match_idx, paste, character(1), collapse = "; ")
  )
返回:
# A tibble: 8 × 5
Given Family Match_idx Matches Match
<chr> <chr> <chr> <chr> <chr>
1 JUAN GIRON "8" "JUAN PABLO GIRON BELTRAN" yes
2 GINA OLEAS "" "" no
3 JUAN FERNANDO ELIZAGA "2; 10" "FERNANDO ELIZAGA; JUAN FERNANDO ELIZAGA" yes
4 MARCO TORRES "" "" no
5 JUAN PABLO GONZALEZ "4" "PABLO GONZALEZ" yes
6 IRMA GOMEZ "3" "IRMA GOMEZ" yes
7 JUAN GOMEZ "" "" no
8 JUAN CARLOS MARTINEZ "" "" no
编辑以添加:以下两组提供了一些棘手的示例,这些示例目前会被其他答案错误地决定。这些例子来自评论中的讨论,以澄清应该匹配的内容。
# Tricky names to check.
tricky_1 <- tibble(
  Given = c(
    "JUAN", "JUANITA GINA",
    "JUAN CARLO", "GOMEZ"
  ),
  Family = c(
    "GIRON BELTRAN", "OLEAS",
    "MARTINEZ", "IRMA"
  )
)

# Tricky reference names, row-aligned with tricky_1.
tricky_2 <- tibble(
  Given = c(
    "JUAN PABLO", "GINA",
    "CARLO JUAN", "IRMA"
  ),
  Family = c(
    "GIRON", "OLEAS GIRON",
    "MARTINEZ", "GOMEZ"
  )
)
我们可以像这样并排查看它们:
# Put both tricky sets next to each other, prefixing the column names with
# their role, and add the expected outcome per row.
mutate(
  bind_cols(
    setNames(tricky_1, paste0("toCheck_", names(tricky_1))),
    setNames(tricky_2, paste0("against_", names(tricky_2)))
  ),
  shouldMatch = c("yes", "yes", "no", "no")
)
返回:
# A tibble: 4 × 5
toCheck_Given toCheck_Family against_Given against_Family shouldMatch
<chr> <chr> <chr> <chr> <chr>
1 JUAN GIRON BELTRAN JUAN PABLO GIRON yes
2 JUANITA GINA OLEAS GINA OLEAS GIRON yes
3 JUAN CARLO MARTINEZ CARLO JUAN MARTINEZ no
4 GOMEZ IRMA IRMA GOMEZ no
前两个应该匹配,因为姓氏和名字在每个方向上都有 1-2 个词匹配。但这也意味着,待检查的名称和被比对的名称都不完全包含在对方之中。第三个包含完全相同的词,但我认为"JUAN CARLO"不应该与"CARLO JUAN"匹配。第四个的姓氏和名字颠倒了,所以不应该返回匹配项。
我的答案中的代码处理这些情况:
# Run the index-returning check on the tricky cases.
tricky_1 %>%
  mutate(
    # List column: matching row indices into tricky_2.
    Match_idx = check_both_idx(
      Given, Family,
      tricky_2$Given, tricky_2$Family
    ),
    # The matched reference names, "Given Family", separated by "; ".
    Matches = vapply(Match_idx, function(this_idx_set) {
      paste(
        paste(tricky_2$Given[this_idx_set], tricky_2$Family[this_idx_set]),
        collapse = "; "
      )
    }, character(1)),
    Match = ifelse(lengths(Match_idx) > 0, "yes", "no"),
    # Finally flatten the index list itself into a printable string.
    Match_idx = vapply(Match_idx, paste, character(1), collapse = "; ")
  )
返回:
# A tibble: 4 × 5
Given Family Match_idx Matches Match
<chr> <chr> <chr> <chr> <chr>
1 JUAN GIRON BELTRAN "1" "JUAN PABLO GIRON" yes
2 JUANITA GINA OLEAS "2" "GINA OLEAS GIRON" yes
3 JUAN CARLO MARTINEZ "" "" no
4 GOMEZ IRMA "" "" no
# Every individual word that occurs anywhere in df2$name, as a character vector.
df2_flat <- df2$name |> stringr::str_split(" ") |> unlist()

# A df1 name is TRUE when each of its words occurs somewhere in df2.
# NOTE(review): the words need not come from the same df2 row.
df1 |>
  mutate(
    val = stringr::str_split(name, " ") |>
      purrr::map_lgl(\(parts) all(parts %in% df2_flat))
  )
生成指定的列表
我会怎么做
library(tidyverse)
# A df1 name gets "yes" when every one of its words is found in at least one
# df2 name, and "no" as soon as one word is found nowhere.
df1 %>%
  mutate(val = vapply(name, \(n) {
    # One flag per word of this name: does it occur in any df2 name?
    # fixed = TRUE treats the word as literal text rather than as a regex.
    found <- vapply(
      strsplit(n, " ")[[1]],
      \(sn) any(grepl(sn, df2$name, fixed = TRUE)),
      logical(1)
    )
    if (all(found)) "yes" else "no"
    # USE.NAMES = FALSE keeps the new column free of a names attribute.
  }, character(1), USE.NAMES = FALSE))
添加我在上面没有看到的另一个解决方案:
基本上使用left_join()
、anti_join
和full_join
来匹配数据帧中的值
# Attach df2's columns to the df1 rows whose name equals a df2 name exactly.
df1 |> left_join(df2, by = "name")
# List the df1 rows whose name has no exact counterpart in df2.
df1 |> anti_join(df2, by = "name")