r-根据另一列中的值,用其他行中的日期替换缺失的日期

  • 本文关键字:日期 其他 替换 一列 r date na
  • 更新时间 :
  • 英文 :


我有一个包含出生日期的数据集。有时出生日期不见了。如果MG_FID相同,这意味着这些人是兄弟姐妹,并且父亲_dob和母亲_dob相同。因此,对于那些IID,对于MG_FID相同的人,我想替换mother_dob和father_dob中的NA。

mg_fid<-structure(list(IID = c(1101480L, 1101481L, 1101509L, 1101554L, 
1101600L, 1101619L, 1101623L, 1101625L, 1101637L, 1101639L, 1101644L, 
1101647L, 1101650L, 1101652L, 1101654L, 1101655L, 1101656L, 1101659L, 
1101660L, 1101661L, 1101662L, 1101674L, 1101675L, 1101678L, 1101679L, 
1101681L, 1101685L, 1101692L, 1101693L, 1101694L, 1101696L, 1101701L, 
1101705L, 1101709L, 1101712L, 1101715L, 1101717L, 1101718L, 1101720L, 
1131480L, 1131481L, 1131554L, 1131600L, 1131619L, 1131623L, 1131625L, 
1131639L, 1131644L, 1131652L, 1131659L, 1131660L, 1131661L, 1131678L, 
1131679L, 1131685L, 1131692L, 1131696L, 1131701L, 1131705L, 1131709L, 
1131715L, 1131717L, 1131718L, 1193598L, 1193602L, 1193607L, 1193609L, 
1193612L, 1193617L, 1193618L, 1193620L, 1193621L, 1193622L, 1193634L, 
1193638L, 1193639L, 1193640L, 1193642L, 1193653L, 1193656L, 1193659L, 
1193660L, 1193661L, 1193664L, 1193667L, 1193668L, 1193689L, 2202638L, 
2202647L, 2202652L, 2202655L, 2202660L, 2202661L, 2202665L, 2202673L, 
2202686L, 2202693L, 2232655L, 2232661L, 2232665L, 2232686L, 2296211L, 
2296214L, 2296217L, 3305120L, 3305129L, 3305135L, 3305139L, 3305141L, 
3305144L, 3305151L, 3305153L, 3305154L, 3305161L, 3305164L, 3305168L, 
3335154L, 3345153L, 3392379L, 3392380L, 3392385L, 3392389L, 3392390L, 
3392391L, 3392392L, 3392395L, 3392400L, 4406324L, 4406326L, 4406327L, 
4406339L, 4406341L, 4406343L, 4406349L, 4406352L, 4406356L, 4406366L, 
4406370L, 4406382L, 4406391L, 4406395L, 4406396L, 4406398L, 4406403L, 
4406409L, 4406416L, 4406421L, 4406422L, 4406425L, 4406436L, 4406438L, 
4406460L, 4406467L, 4406469L, 4406480L, 4406483L, 4406484L, 4406485L, 
4436326L, 4436366L, 4436382L, 4436396L, 4436416L, 4436422L, 4436425L, 
4436436L, 4436467L, 4436469L, 4436480L, 4436483L, 4494908L, 4494910L, 
4494916L, 4494919L), child_dob = structure(c(15454, 15405, 15112, 
14371, 14789, 15154, 12628, 15548, 12871, 16199, 14206, 13984, 
13718, 14602, 15889, 16216, 13309, 15490, 13870, 12242, 12740, 
14956, 11581, 14433, 16346, 14208, 16288, 12957, 15932, 13314, 
11487, 11768, 14650, 16247, 12741, 15075, 14823, 14160, 13284, 
14798, 16300, 14803, 14049, NA, 13496, 16183, NA, 14206, 11675, 
11226, 14630, 13035, 13767, 15667, 15201, NA, 12014, 12646, 15524, 
16247, 16884, 15722, 14792, 16310, 16959, 16173, 16765, 15535, 
NA, 16659, 14909, 16004, 15088, 15293, 15844, 14653, 16304, 15694, 
15958, 14993, 15274, 15332, 16363, 14846, 16304, 16597, 15226, 
11404, 15126, 12299, 14852, 15606, 15884, 16515, 15475, 15982, 
13819, 15996, 16512, 14823, 16631, 14442, 16672, 14120, 16499, 
15269, 13863, 16755, 15690, 15380, 13232, 13482, 13997, 16924, 
13431, 14692, 14789, 12156, 15839, 16585, 15884, 13266, 16027, 
13214, 14770, 13228, 15578, 13892, 15133, 12759, 13151, 14447, 
14922, 14841, 14572, 15804, 15260, 14869, 11111, 15579, 13473, 
15099, 15588, 11801, 13915, 14692, 12526, 14145, 13782, 15131, 
14759, 13354, 12747, 13389, 13530, 15554, 15211, 11365, NA, NA, 
12577, 15479, NA, 14759, NA, 16528, NA, 11559, NA, NA, NA, NA, 
NA, NA), class = "Date"), mother_dob = structure(c(3751, 4152, 
2012, 4033, 3220, 3547, 226, 4627, 936, 3971, 488, -751, NA, 
-152, 7709, 6247, 628, 1515, 2679, 1220, 982, 3470, -480, 1034, 
7168, 1783, 3356, 2352, 4166, 892, 1601, 1002, 3436, 3581, 2842, 
NA, 7439, 1380, -476, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7939, 
7973, NA, 7396, 8002, NA, 8499, 7879, 7512, 7512, 7495, 7919, 
7503, 7412, 8069, 7823, 7321, 7399, 7686, 7682, 7504, 7738, 9053, 
7579, 1088, 3346, 2276, 1738, 3566, 5392, 1106, 214, 2348, 1863, 
NA, NA, NA, NA, 999, NA, 4484, 8885, 1958, 3197, 5721, 2815, 
5304, 5451, 2145, 5354, 4758, 2517, 1557, 5354, 2145, 6377, 6011, 
4084, -406, 332, 390, -716, 2218, 3982, 1251, 2842, 3158, 2008, 
5332, 5872, 1643, 7342, 3350, 5199, 1065, -2651, 6334, NA, 6508, 
2554, -835, 1502, 132, 1409, 4033, 3289, 1888, 2864, 1766, 629, 
739, 3158, 1915, 778, 3781, NA, 5199, NA, 6508, 132, 4033, 3289, 
1888, NA, NA, 3158, NA, 7075, NA, 2985, 431), class = "Date"), 
father_dob = structure(c(3048, 4252, -485, 4363, 536, 565, 
56, 1953, -534, 3535, -4109, -2086, NA, -321, 4858, 4287, 
2169, -895, 2459, 873, 35, 3127, -1333, -73, 3866, 1573, 
3192, 1424, -3378, -182, -139, 443, 3353, 5299, 97, NA, 5978, 
1369, -3093, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7606, 
NA, 6055, 7062, NA, 7893, 8325, 7297, 7297, 6510, 7849, 6883, 
7503, 7536, 7356, 6338, NA, 7600, 7908, 7339, 5150, 7532, 
7128, 511, 454, 2154, 2348, 2703, 4071, 1491, -1679, 3894, 
-1475, NA, NA, NA, NA, -179, NA, 2941, 6677, 1067, -1206, 
4403, 981, 3626, 437, 829, 2087, 2980, 2364, 653, 2087, 829, 
6281, 6839, 4344, -3220, 3175, 335, 410, 1961, 2730, -951, 
2670, 2938, -3676, 1977, 5758, 1154, 7582, 3564, 3297, 4148, 
-2770, 5088, NA, 7605, 4096, -1145, 948, 1022, 560, 3016, 
-1902, 2022, -703, 40, 446, 91, 1807, 1433, 4119, NA, NA, 
3297, NA, 7605, 1022, 3016, -1902, 2022, NA, NA, 1807, NA, 
5903, NA, 2521, 849), class = "Date"), MG_FID = c(1L, 2L, 
0L, 3L, 4L, 5L, 7L, 8L, 0L, 9L, 10L, 0L, 0L, 11L, 0L, 0L, 
0L, 12L, 13L, 14L, 0L, 0L, 0L, 15L, 16L, 17L, 18L, 19L, 20L, 
0L, 21L, 22L, 23L, 24L, 0L, 25L, 26L, 27L, 0L, 1L, 2L, 3L, 
4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 18L, 
19L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 28L, 0L, 29L, 30L, 0L, 31L, 0L, 
28L, 29L, 30L, 31L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 32L, 
35L, 33L, 0L, 0L, 34L, 33L, 35L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 36L, 37L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 38L, 0L, 
39L, 0L, 0L, 40L, 41L, 0L, 0L, 42L, 0L, 43L, 44L, 45L, 0L, 
46L, 47L, 48L, 49L, 50L, 0L, 51L, 37L, 38L, 39L, 40L, 42L, 
43L, 44L, 45L, 47L, 48L, 49L, 50L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, 
-174L))
mg_fid

我试过下面的代码,但它把我所有的日期都变成了随机数字。

parent_age %>% group_by(MG_FID) %>% 
mutate(father_dob = ifelse(is.na(father_dob),max(father_dob,na.rm=TRUE),father_dob), 
mother_dob = ifelse(is.na(mother_dob),max(mother_dob,na.rm=TRUE),mother_dob))

有人有什么想法吗?如果我写的东西没有意义,请告诉我。

# ensure mg_fid is tibble
mg_fid <- mg_fid %>% as_tibble(mg_fid)
# mutate columns ending in "r_dob", exclude the rows where MG_FID==0
mg_fid_new <- bind_rows(
mg_fid %>% filter(MG_FID==0), 
mg_fid %>% filter(MG_FID!=0) %>% 
group_by(MG_FID) %>%
mutate(across(.cols = ends_with("r_dob"),~mean(.x, na.rm=T)))
)

输入:如上所述,由OP 更新

输出:

# A tibble: 174 x 5
IID child_dob  mother_dob father_dob MG_FID
<int> <date>     <date>     <date>      <int>
1 1101509 2011-05-18 1975-07-06 1968-09-03      0
2 1101637 2005-03-29 1972-07-25 1968-07-16      0
3 1101647 2008-04-15 1967-12-12 1964-04-16      0
4 1101650 2007-07-24 NA         NA              0
5 1101654 2013-07-03 1991-02-09 1983-04-21      0
6 1101655 2014-05-26 1987-02-08 1981-09-27      0
7 1101656 2006-06-10 1971-09-21 1975-12-10      0
8 1101662 2004-11-18 1972-09-09 1970-02-05      0
9 1101674 2010-12-13 1979-07-03 1978-07-25      0
10 1101675 2001-09-16 1968-09-08 1966-05-09      0
# ... with 164 more rows

在的母列和父列中NA的数量已经减少

> sapply(mg_fid, function(x) sum(is.na(x)))
IID  child_dob mother_dob father_dob     MG_FID 
0         15         40         43          0 
> sapply(mg_fid_new, function(x) sum(is.na(x)))
IID  child_dob mother_dob father_dob     MG_FID 
0         15          8         11          0 

NA值可以通过使用group_byfill的组合来替换,如下所示:

library(tidyverse)
library(janitor)
#> 
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#> 
#>     chisq.test, fisher.test
data<-structure(list(IID = c(1101480L, 1101481L, 1101509L, 1101554L, 
1101600L, 1101619L, 1101623L, 1101625L, 1101637L, 1101639L, 1101644L, 
1101647L, 1101650L, 1101652L, 1101654L, 1101655L, 1101656L, 1101659L, 
1101660L, 1101661L, 1101662L, 1101674L, 1101675L, 1101678L, 1101679L, 
1101681L, 1101685L, 1101692L, 1101693L, 1101694L, 1101696L, 1101701L, 
1101705L, 1101709L, 1101712L, 1101715L, 1101717L, 1101718L, 1101720L, 
1131480L, 1131481L, 1131554L, 1131600L, 1131619L, 1131623L, 1131625L, 
1131639L, 1131644L, 1131652L, 1131659L, 1131660L, 1131661L, 1131678L, 
1131679L, 1131685L, 1131692L, 1131696L, 1131701L, 1131705L, 1131709L, 
1131715L, 1131717L, 1131718L, 1193598L, 1193602L, 1193607L, 1193609L, 
1193612L, 1193617L, 1193618L, 1193620L, 1193621L, 1193622L, 1193634L, 
1193638L, 1193639L, 1193640L, 1193642L, 1193653L, 1193656L, 1193659L, 
1193660L, 1193661L, 1193664L, 1193667L, 1193668L, 1193689L, 2202638L, 
2202647L, 2202652L, 2202655L, 2202660L, 2202661L, 2202665L, 2202673L, 
2202686L, 2202693L, 2232655L, 2232661L, 2232665L, 2232686L, 2296211L, 
2296214L, 2296217L, 3305120L, 3305129L, 3305135L, 3305139L, 3305141L, 
3305144L, 3305151L, 3305153L, 3305154L, 3305161L, 3305164L, 3305168L, 
3335154L, 3345153L, 3392379L, 3392380L, 3392385L, 3392389L, 3392390L, 
3392391L, 3392392L, 3392395L, 3392400L, 4406324L, 4406326L, 4406327L, 
4406339L, 4406341L, 4406343L, 4406349L, 4406352L, 4406356L, 4406366L, 
4406370L, 4406382L, 4406391L, 4406395L, 4406396L, 4406398L, 4406403L, 
4406409L, 4406416L, 4406421L, 4406422L, 4406425L, 4406436L, 4406438L, 
4406460L, 4406467L, 4406469L, 4406480L, 4406483L, 4406484L, 4406485L, 
4436326L, 4436366L, 4436382L, 4436396L, 4436416L, 4436422L, 4436425L, 
4436436L, 4436467L, 4436469L, 4436480L, 4436483L, 4494908L, 4494910L, 
4494916L, 4494919L), child_dob = structure(c(15454, 15405, 15112, 
      14371, 14789, 15154, 12628, 15548, 12871, 16199, 14206, 13984, 
      13718, 14602, 15889, 16216, 13309, 15490, 13870, 12242, 12740, 
      14956, 11581, 14433, 16346, 14208, 16288, 12957, 15932, 13314, 
      11487, 11768, 14650, 16247, 12741, 15075, 14823, 14160, 13284, 
      14798, 16300, 14803, 14049, NA, 13496, 16183, NA, 14206, 11675, 
      11226, 14630, 13035, 13767, 15667, 15201, NA, 12014, 12646, 15524, 
      16247, 16884, 15722, 14792, 16310, 16959, 16173, 16765, 15535, 
      NA, 16659, 14909, 16004, 15088, 15293, 15844, 14653, 16304, 15694, 
      15958, 14993, 15274, 15332, 16363, 14846, 16304, 16597, 15226, 
      11404, 15126, 12299, 14852, 15606, 15884, 16515, 15475, 15982, 
      13819, 15996, 16512, 14823, 16631, 14442, 16672, 14120, 16499, 
      15269, 13863, 16755, 15690, 15380, 13232, 13482, 13997, 16924, 
      13431, 14692, 14789, 12156, 15839, 16585, 15884, 13266, 16027, 
      13214, 14770, 13228, 15578, 13892, 15133, 12759, 13151, 14447, 
      14922, 14841, 14572, 15804, 15260, 14869, 11111, 15579, 13473, 
      15099, 15588, 11801, 13915, 14692, 12526, 14145, 13782, 15131, 
      14759, 13354, 12747, 13389, 13530, 15554, 15211, 11365, NA, NA, 
      12577, 15479, NA, 14759, NA, 16528, NA, 11559, NA, NA, NA, NA, 
      NA, NA), class = "Date"), mother_dob = structure(c(3751, 4152, 
                                                         2012, 4033, 3220, 3547, 226, 4627, 936, 3971, 488, -751, NA, 
                                                         -152, 7709, 6247, 628, 1515, 2679, 1220, 982, 3470, -480, 1034, 
                                                         7168, 1783, 3356, 2352, 4166, 892, 1601, 1002, 3436, 3581, 2842, 
                                                         NA, 7439, 1380, -476, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                                         NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7939, 
                                                         7973, NA, 7396, 8002, NA, 8499, 7879, 7512, 7512, 7495, 7919, 
                                                         7503, 7412, 8069, 7823, 7321, 7399, 7686, 7682, 7504, 7738, 9053, 
                                                         7579, 1088, 3346, 2276, 1738, 3566, 5392, 1106, 214, 2348, 1863, 
                                                         NA, NA, NA, NA, 999, NA, 4484, 8885, 1958, 3197, 5721, 2815, 
                                                         5304, 5451, 2145, 5354, 4758, 2517, 1557, 5354, 2145, 6377, 6011, 
                                                         4084, -406, 332, 390, -716, 2218, 3982, 1251, 2842, 3158, 2008, 
                                                         5332, 5872, 1643, 7342, 3350, 5199, 1065, -2651, 6334, NA, 6508, 
                                                         2554, -835, 1502, 132, 1409, 4033, 3289, 1888, 2864, 1766, 629, 
                                                         739, 3158, 1915, 778, 3781, NA, 5199, NA, 6508, 132, 4033, 3289, 
                                                         1888, NA, NA, 3158, NA, 7075, NA, 2985, 431), class = "Date"), 
father_dob = structure(c(3048, 4252, -485, 4363, 536, 565, 
56, 1953, -534, 3535, -4109, -2086, NA, -321, 4858, 4287, 
2169, -895, 2459, 873, 35, 3127, -1333, -73, 3866, 1573, 
3192, 1424, -3378, -182, -139, 443, 3353, 5299, 97, NA, 5978, 
1369, -3093, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7606, 
NA, 6055, 7062, NA, 7893, 8325, 7297, 7297, 6510, 7849, 6883, 
7503, 7536, 7356, 6338, NA, 7600, 7908, 7339, 5150, 7532, 
7128, 511, 454, 2154, 2348, 2703, 4071, 1491, -1679, 3894, 
-1475, NA, NA, NA, NA, -179, NA, 2941, 6677, 1067, -1206, 
4403, 981, 3626, 437, 829, 2087, 2980, 2364, 653, 2087, 829, 
6281, 6839, 4344, -3220, 3175, 335, 410, 1961, 2730, -951, 
2670, 2938, -3676, 1977, 5758, 1154, 7582, 3564, 3297, 4148, 
-2770, 5088, NA, 7605, 4096, -1145, 948, 1022, 560, 3016, 
-1902, 2022, -703, 40, 446, 91, 1807, 1433, 4119, NA, NA, 
3297, NA, 7605, 1022, 3016, -1902, 2022, NA, NA, 1807, NA, 
5903, NA, 2521, 849), class = "Date"), MG_FID = c(1L, 2L, 
                            0L, 3L, 4L, 5L, 7L, 8L, 0L, 9L, 10L, 0L, 0L, 11L, 0L, 0L, 
                            0L, 12L, 13L, 14L, 0L, 0L, 0L, 15L, 16L, 17L, 18L, 19L, 20L, 
                            0L, 21L, 22L, 23L, 24L, 0L, 25L, 26L, 27L, 0L, 1L, 2L, 3L, 
                            4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 18L, 
                            19L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 0L, 0L, 0L, 0L, 0L, 
                            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
                            0L, 0L, 0L, 0L, 0L, 0L, 0L, 28L, 0L, 29L, 30L, 0L, 31L, 0L, 
                            28L, 29L, 30L, 31L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 32L, 
                            35L, 33L, 0L, 0L, 34L, 33L, 35L, 0L, 0L, 0L, 0L, 0L, 0L, 
                            0L, 0L, 0L, 36L, 37L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 38L, 0L, 
                            39L, 0L, 0L, 40L, 41L, 0L, 0L, 42L, 0L, 43L, 44L, 45L, 0L, 
                            46L, 47L, 48L, 49L, 50L, 0L, 51L, 37L, 38L, 39L, 40L, 42L, 
                            43L, 44L, 45L, 47L, 48L, 49L, 50L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, 
                                                                                                                     -174L))
data <- data %>% 
as_tibble() %>% 
clean_names()
data %>% 
group_by(mg_fid) %>% 
fill(mother_dob:father_dob, .direction = "downup") %>% 
ungroup()
#> # A tibble: 174 x 5
#>        iid child_dob  mother_dob father_dob mg_fid
#>      <int> <date>     <date>     <date>      <int>
#>  1 1101480 2012-04-24 1980-04-09 1978-05-07      1
#>  2 1101481 2012-03-06 1981-05-15 1981-08-23      2
#>  3 1101509 2011-05-18 1975-07-06 1968-09-03      0
#>  4 1101554 2009-05-07 1981-01-16 1981-12-12      3
#>  5 1101600 2010-06-29 1978-10-26 1971-06-21      4
#>  6 1101619 2011-06-29 1979-09-18 1971-07-20      5
#>  7 1101623 2004-07-29 1970-08-15 1970-02-26      7
#>  8 1101625 2012-07-27 1982-09-02 1975-05-08      8
#>  9 1101637 2005-03-29 1972-07-25 1968-07-16      0
#> 10 1101639 2014-05-09 1980-11-15 1979-09-06      9
#> # ... with 164 more rows

创建于2022-02-24由reprex包(v2.0.1(

但有一个问题——mg_fid == 0组有82个条目,其中缺少6个mother_dob和8个father_dob。您打算如何更换这些NA?

data %>% 
count(mg_fid, sort = TRUE)
# A tibble: 51 x 2
mg_fid     n
<int> <int>
1      0    82
2      1     2
3      2     2
4      3     2
5      4     2
6      5     2
7      7     2
8      8     2
9      9     2
10     10     2
# ... with 41 more rows
data %>% 
filter(mg_fid == 0) %>% 
map_df(~ sum(is.na(.)))
# A tibble: 1 x 5
iid child_dob mother_dob father_dob mg_fid
<int>     <int>      <int>      <int>  <int>
1     0         5          6          8      0

相关内容

  • 没有找到相关文章

最新更新