我有一个数据集,其中有一些NA,但我可以手动计算出值应该是什么,因为df是名称的列,其余的列只是数字,后面是最后一列和总计。每行最多只出现一个NA,所以我可以通过使用合计列和所有其他列的总和来计算值。只是想知道什么是填充这些NA的最佳方法,而不必逐个硬编码,因为我使用的df是相当大的
示例df:
df = structure(list(city = c("sydney", "new york", "london", "beijing", "paris", "madrid"),
year = c(2005:2010),
A = c(1, 4, 5 , NA, 2, 1),
B = c(3, NA, 4 , 9, 0, 6),
C = c(3, 4 , 6, 1, 8, NA),
total = c(NA, 10, 15, 14, NA, 15)),
class = "data.frame", row.names = c(NA, -6L))
df
此解决方案也可以帮助您。
library(purrr)
library(dplyr)
df %>%
rowwise() %>%
mutate(total = ifelse(is.na(total), sum(c_across(A:C)), total),
pmap_df(select(cur_data(), A:total), ~ {x <- c(...)[1:3]
replace(x, is.na(x), c(...)[4] - sum(x, na.rm = TRUE))}))
# A tibble: 6 x 6
# Rowwise:
city year A B C total
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 sydney 2005 1 3 3 7
2 new york 2006 4 2 4 10
3 london 2007 5 4 6 15
4 beijing 2008 4 9 1 14
5 paris 2009 2 0 8 10
6 madrid 2010 1 6 8 15
有点硬编码,但可以在这个意义上进行修改。
您需要首先替换total列中的NA
,然后您可以简单地计算其余部分。您还可以为a、B、C列创建一个函数,这样您就不会重复代码,但只有3列应该不会有问题。
df = structure(list(city = c("sydney", "new york", "london", "beijing", "paris", "madrid"),
year = c(2005:2010),
A = c(1, 4, 5 , NA, 2, 1),
B = c(3, NA, 4 , 9, 0, 6),
C = c(3, 4 , 6, 1, 8, NA),
total = c(NA, 10, 15, 14, NA, 15)),
class = "data.frame", row.names = c(NA, -6L))
df
#> city year A B C total
#> 1 sydney 2005 1 3 3 NA
#> 2 new york 2006 4 NA 4 10
#> 3 london 2007 5 4 6 15
#> 4 beijing 2008 NA 9 1 14
#> 5 paris 2009 2 0 8 NA
#> 6 madrid 2010 1 6 NA 15
df$total <- ifelse(is.na(df$total), rowSums(df[, c("A", "B", "C")]), df$total)
df$A <- ifelse(is.na(df$A), df$total - rowSums(df[, c("A", "B", "C")], na.rm = TRUE), df$A)
df$B <- ifelse(is.na(df$B), df$total - rowSums(df[, c("A", "B", "C")], na.rm = TRUE), df$B)
df$C <- ifelse(is.na(df$C), df$total - rowSums(df[, c("A", "B", "C")], na.rm = TRUE), df$C)
df
#> city year A B C total
#> 1 sydney 2005 1 3 3 7
#> 2 new york 2006 4 2 4 10
#> 3 london 2007 5 4 6 15
#> 4 beijing 2008 4 9 1 14
#> 5 paris 2009 2 0 8 10
#> 6 madrid 2010 1 6 8 15
创建于2022-02-09由reprex包(v2.0.1(
UPDATE:在total列中替换NA
后,可以使用zoo
包中的na.aprox
函数来插值其余值。
library(zoo)
df$total <- ifelse(is.na(df$total), rowSums(df[, c("A", "B", "C")]), df$total) # first totals
df[, c("A", "B", "C")] <- na.approx(df[, c("A", "B", "C", "total")], rule = 2) # then rest
df
city year A B C total
1 sydney 2005 1.0 3.0 3 7
2 new york 2006 4.0 3.5 4 10
3 london 2007 5.0 4.0 6 15
4 beijing 2008 3.5 9.0 1 14
5 paris 2009 2.0 0.0 8 10
6 madrid 2010 1.0 6.0 8 15
数据表解决方案
library(data.table)
setDT(df)
cols <- c("A", "B", "C")
df[, (cols) := lapply(.SD, function(x) {
ifelse(is.na(x), total - rowSums(.SD, na.rm = T), x)
}), .SDcols = cols][is.na(total), total := rowSums(.SD), .SDcols = cols]
df
# city year A B C total
# 1: sydney 2005 1 3 3 7
# 2: new york 2006 4 2 4 10
# 3: london 2007 5 4 6 15
# 4: beijing 2008 4 9 1 14
# 5: paris 2009 2 0 8 10
# 6: madrid 2010 1 6 8 15
数据
df = structure(list(
city = c("sydney", "new york", "london", "beijing", "paris", "madrid"),
year = c(2005:2010),
A = c(1, 4, 5 , NA, 2, 1),
B = c(3, NA, 4 , 9, 0, 6),
C = c(3, 4 , 6, 1, 8, NA),
total = c(NA, 10, 15, 14, NA, 15)),
class = "data.frame", row.names = c(NA, -6L)
)
这是一个带有apply
的基本R解决方案。
df = structure(list(city = c("sydney", "new york", "london", "beijing", "paris", "madrid"),
year = c(2005:2010),
A = c(1, 4, 5 , NA, 2, 1),
B = c(3, NA, 4 , 9, 0, 6),
C = c(3, 4 , 6, 1, 8, NA),
total = c(NA, 10, 15, 14, NA, 15)),
class = "data.frame", row.names = c(NA, -6L))
df[-(1:2)] <- t(apply(df[-(1:2)], 1, (x) {
if(is.na(x[4])) {
x[4] <- sum(x[-4])
} else if(anyNA(x[-4])) {
x[-4][is.na(x[-4])] <- x[4] - sum(x[-4][!is.na(x[-4])])
}
x
}))
df
#> city year A B C total
#> 1 sydney 2005 1 3 3 7
#> 2 new york 2006 4 2 4 10
#> 3 london 2007 5 4 6 15
#> 4 beijing 2008 4 9 1 14
#> 5 paris 2009 2 0 8 10
#> 6 madrid 2010 1 6 8 15
创建于2022-02-09由reprex包(v2.0.1(