根据r中另一个数据框中的部分文本替换一列中的值



这是我在stackoverflow上的第一篇文章,英语不是我的母语,所以我要提前为语法和编程上的任何错误道歉。

我需要根据另一个数据帧中的部分值替换数据帧中的一列中的值。我的问题与这篇文章类似,但在他们的例子中,他们列出了所有可能的错误。在我的例子中,我只需要字符串的一部分来知道我是否需要替换一个值。

我已经尝试使用"if_else"one_answers";grepl"dplyr。"Grepl"只要我在第二个数据帧上只有一行,就可以工作,当我插入另一个示例时,我得到一个错误。

现在我真正的DF大约有30k行和33个变量,第二个DF的正确值可能每个月都在增长,所以我试图尽可能地摆脱循环。

我用随机数据制作了一个模拟表来模拟我的需求:

library(dplyr)

df1 <- data.frame(Supplier = c("AAA","CCC","CCE","DDD","EEE","EED","GGG","HHH","III","JJJ"), 
Value = c(100,200,300,400,200, 100,200,40,150,70))


df2 <- data.frame(Supplier =c("CC","EE","GG"), 
New_Supplier = c("Red","Blue","Green"))

#Example 1: Unfortunately this Won't work unless I have an exact match:
df1$Supplier <- if_else(df1$Supplier %in% df2$Supplier, df2$New_Supplier, df1$Supplier)
# Example 2: Only works if I have one example:
df1$Supplier <- if_else(grepl(df2$Supplier, df1$Supplier), df2$New_Supplier, df1$Supplier)

第一个数据帧是这样的

Supplier Value
1       AAA   100
2       CCC   200
3       CCE   300
4       DDD   400
5       EEE   200
6       EED   100
7       GGG   200
8       HHH    40
9       III   150
10      JJJ    70

第二个数据帧:

Supplier New_Supplier
1       CC          Red
2       EE         Blue
3       GG        Green

我的最终目标是有这样的东西:

Supplier Value
1       AAA   100
2       Red   200
3       Red   300
4       DDD   400
5      Blue   200
6      Blue   100
7     Green   200
8       HHH    40
9       III   150
10      JJJ    70

提前感谢!

这似乎是fuzzy_joinregex_left_join的情况。在regex_left_join之后,coalecse将列放在一起,以便它将返回每一行的第一个非na元素

library(fuzzyjoin)
library(dplyr)
regex_left_join(df1, df2, by = 'Supplier') %>% 
transmute(Supplier = coalesce(New_Supplier, Supplier.x), Value)

与产出

Supplier Value
1       AAA   100
2       Red   200
3       Red   300
4       DDD   400
5      Blue   200
6      Blue   100
7     Green   200
8       HHH    40
9       III   150
10      JJJ    70

Base R方法:

# Coerce 0 length vectors to na values of the appropriate type: 
# zero_to_nas => function()
zero_to_nas <- function(x){
if(identical(x, character(0))){
res <- NA_character_ 
}else if(identical(x, integer(0))){
res < -NA_integer_
}else if(identical(x, numeric(0))){
res <- NA_real_
}else if(identical(x, complex(0))){
res <- NA_complex_
}else if(identical(x, logical(0))){
res <- NA
}else{
res <- x
}

# If the result is Null return the vector:
if(is.null(res)){
res <- x
}else{
invisible() 
}

# Explicitly define returned object: vector => Global Env
return(res)

}
# Unlist handling 0 length vectors: list_2_vec => function()
list_2_vec <- function(lst){
# Unlist cleaned list: res => vector
res <- unlist(lapply(lst, zero_to_nas))
# Explictly define return object: vector => GlobalEnv()
return(res)
}
# Function to perform a fuzzy match: 
# fuzzy_match => function()
fuzzy_match <- function(vec_to_match_to, vec_to_match_on){
# Perform a fuzzy match: res => character vector:
res <- list_2_vec(
regmatches(
vec_to_match_to, 
gregexpr(
paste0(
vec_to_match_on, 
collapse = "|"
),
vec_to_match_to
)
)
)
# Explicitly define returned object: 
# character vector => Global Env
return(res)
}
# Function to coalesce vectors: br_coalesce => function()
br_coalesce <- function(vec, ..., to_vec = TRUE){

# Coalesce the vectors: res_ir => list
res_ir <- apply(
cbind(
as.list(...), 
as.list(vec)
),
1,
function(x){
head(zero_to_nas(x[!(is.na(x))]), 1)
}
)

# If the result is null return the original vector:
if(is.null(unlist(res_ir))){
res_ir <- vec
}else{
invisible() 
}
# If the we want the result to be a vector not a list then:
if(isTRUE(to_vec)){
# Unlist the resultant list: res => vector
res <- unlist(res_ir)
# Otherwise
}else{
# Deep copy the list: res => list
res <- res_ir
}

# Explicitly define returned object: 
# list or vector => Global Env
return(res)

}
# Apply the fuzzy match and coalesce functions: 
# clean_df => data.frame
clean_df <- transform(
df1, 
Supplier = br_coalesce(
df1$Supplier, 
df2$New_Supplier[
match(
fuzzy_match(
df1$Supplier, 
df2$Supplier
),
df2$Supplier
)
]
)
)

数据:

df1 <- data.frame(Supplier = c("AAA","CCC","CCE","DDD","EEE","EED","GGG","HHH","III","JJJ"), 
Value = c(100,200,300,400,200, 100,200,40,150,70))

df2 <- data.frame(Supplier =c("CC","EE","GG"), 
New_Supplier = c("Red","Blue","Green"))

最新更新