r语言 - 模拟数据并将缺失值随机添加到数据帧



如何在模拟数据框中为某些列或每一列随机添加缺失值(例如每列随机缺失约 5%)?另外,是否有更高效的方法来模拟同时包含连续列和因子列的数据框?

# Simulate some data ----

#' Simulate a data frame that mixes continuous and categorical columns.
#'
#' @param n Number of rows to generate (a single non-negative whole number).
#' @return A data.frame with `n` rows: `id`, six numeric measurements drawn
#'   with `rnorm()` (sd = 1), two numeric indicator columns (`sex`, `smoke`)
#'   and a letter-coded `educ` column.
simulate_data <- function(n) {
  data.frame(
    id = seq_len(n),
    # NOTE: the second argument of rnorm() is the *mean*. A vector such as
    # 18:90 is recycled along the rows; it is not a sampling range.
    age = rnorm(n, 18:90),
    bmi = rnorm(n, 15:40),
    chol = rnorm(n, 50:350),
    insulin = rnorm(n, 2:40),
    sbp = rnorm(n, 50:200),
    dbp = rnorm(n, 30:150),
    # First half coded 1, second half coded 2 (the extra row goes to 1 when
    # n is odd).
    sex = rep(c(1, 2), times = c(ceiling(n / 2), floor(n / 2))),
    # Alternating 1, 2, 1, 2, ...
    smoke = rep(c(1, 2), length.out = n),
    # One random ordering of A-D, repeated down the rows. Spelling out the
    # repetition keeps this valid when n is not a multiple of 4.
    educ = rep(sample(LETTERS[1:4]), length.out = n)
  )
}

N <- 2000
data <- simulate_data(N)

# Manually add some missing values ----
# Every column is set to NA wherever its own condition is TRUE. mutate()
# evaluates these assignments in order, so the `educ` rule is checked against
# the `bmi` column as already modified by the line above it.
data <- data %>%
  mutate(
    age = replace(age, age < 19 | age > 88, NA),
    bmi = replace(bmi, bmi > 38 | bmi < 16, NA),
    insulin = replace(insulin, insulin > 38, NA),
    educ = replace(educ, bmi > 35, NA)
  )

在我看来,最好的解决方案是使用 mice 包。这是一个专门用于插补(imputation)的 R 包。它还提供了一个名为 ampute 的函数,用于向 data.frame 中引入缺失数据。

ampute - 生成用于模拟目的的缺失数据:此函数以 MCAR、MAR 或 MNAR 方式生成多变量缺失数据。

此解决方案的优点是您可以设置多个参数来模拟缺失的数据。

# Call signature of ampute(), one argument per line, with the values shown
# in the usage listing.
ampute(
  data,
  prop = 0.5,        # how much of the data is made missing
  patterns = NULL,
  freq = NULL,
  mech = "MAR",      # missingness mechanism: "MCAR", "MAR" or "MNAR"
  weights = NULL,
  cont = TRUE,
  type = NULL,
  odds = NULL,
  bycases = TRUE,
  run = TRUE
)

如您所见,您可以设置缺失值的百分比、缺失数据机制(若要完全随机缺失,应选择 MCAR)以及其他几个参数。这个解决方案也很简洁,因为只需要 1 行代码。

这是一种tidyverse方法,它将删除您指定的每列大约 20% 的数据:

set.seed(1)

# Example data ----
N <- 20
data <- data.frame(
  id = 1:N,
  age = rnorm(N, 18:90),
  bmi = rnorm(N, 15:40),
  chol = rnorm(N, 50:350)
)

library(tidyverse)

# Columns that should receive missing values, and the probability with which
# each individual cell of those columns is set to NA.
c_names <- c("age", "bmi")
prc_missing <- 0.20

# Work column by column instead of reshaping to long format and back: each
# selected column draws one uniform number per row and is set to NA where the
# draw falls at or below the threshold. Because every cell is decided
# independently, the share of NAs per column is only approximately
# `prc_missing`. Columns keep their own type, so this also works when numeric
# and factor/character columns are mixed.
data_missing <- data %>%
  mutate(across(
    all_of(c_names),
    function(x) replace(x, runif(length(x)) <= prc_missing, NA)
  ))
data_missing
#    id      age      bmi     chol
# 1   1 17.37355 15.91898 49.83548
# 2   2 19.18364 16.78214 50.74664
# 3   3 19.16437 17.07456 52.69696
# 4   4       NA 16.01065 53.55666
# 5   5 22.32951 19.61983 53.31124
# 6   6 22.17953 19.94387 54.29250
# 7   7 24.48743       NA 56.36458
# 8   8 25.73832 20.52925 57.76853
# 9   9 26.57578       NA 57.88765
# 10 10 26.69461 24.41794 59.88111
# 11 11 29.51178 26.35868 60.39811
# 12 12       NA 25.89721 60.38797
# 13 13       NA 27.38767 62.34112
# 14 14 28.78530 27.94619 61.87064
# 15 15 33.12493 27.62294 65.43302
# 16 16 32.95507       NA 66.98040
# 17 17 33.98381 30.60571 65.63278
# 18 18 35.94384       NA 65.95587
# 19 19 36.82122 34.10003 68.56972
# 20 20 37.59390 34.76318 68.86495

这是一种替代方法,它将删除您指定的列的 20% 的数据:

set.seed(1)

# Example data ----
N <- 20
data <- data.frame(
  id = seq_len(N),
  age = rnorm(n = N, mean = 18:90),
  bmi = rnorm(n = N, mean = 15:40),
  chol = rnorm(n = N, mean = 50:350)
)

library(tidyverse)

# Columns to blank out, the share of rows to blank, and the matching row count
c_names <- c("age", "bmi")
prc_missing <- 0.20
n_remove <- nrow(data) * prc_missing

# gather()/sample_frac()/spread() are kept as they are (instead of newer
# verbs) so that the random draws, and therefore the seeded result printed
# below, stay the same.
data %>%
  # long format: one row per (id, variable) pair
  gather(var, value, -id) %>%
  # put all rows in random order
  sample_frac(1) %>%
  # count rows separately within each variable
  group_by(var) %>%
  # for the chosen variables, blank out the first `n_remove` shuffled rows
  mutate(
    value = replace(
      value,
      var %in% c_names & row_number() <= n_remove,
      NA
    )
  ) %>%
  # back to one column per variable
  spread(var, value)
# # A tibble: 20 x 4
#      id   age   bmi  chol
#   <int> <dbl> <dbl> <dbl>
# 1     1  17.4  15.9  49.8
# 2     2  19.2  16.8  50.7
# 3     3  19.2  17.1  52.7
# 4     4  NA    16.0  53.6
# 5     5  22.3  NA    53.3
# 6     6  22.2  19.9  54.3
# 7     7  24.5  20.8  56.4
# 8     8  25.7  NA    57.8
# 9     9  26.6  NA    57.9
# 10    10  NA    NA    59.9
# 11    11  NA    26.4  60.4
# 12    12  NA    25.9  60.4
# 13    13  29.4  27.4  62.3
# 14    14  28.8  27.9  61.9
# 15    15  33.1  27.6  65.4
# 16    16  33.0  29.6  67.0
# 17    17  34.0  30.6  65.6
# 18    18  35.9  31.9  66.0
# 19    19  36.8  34.1  68.6
# 20    20  37.6  34.8  68.9

这行得通吗?

n_rows <- nrow(data)
perc_missing <- 5 # percentage of rows to set to NA

# Number of rows to blank out. sample.int() expects a single count as its
# second argument; passing a vector of row indices there would make the count
# itself random instead of perc_missing percent.
n_missing <- round(perc_missing / 100 * n_rows)

# Draw that many distinct row positions, without replacement.
row_missing <- sample.int(n_rows, n_missing)

col_missing <- 1 # column (index or name) that receives the missing values
data[row_missing, col_missing] <- NA # assign missing values

最新更新