我无法想象我们如何删除列日期的重复值的行,并选择删除具有inst的缺失值(NA)的行,按id分组。
我的数据的一个最小的工作示例是:
id <- c("N101", "N102", "N103", "N103", "N103", "N103", "N104", "N105", "N107", "N107", "N108", "N109", "N110", "N111", "N112", "N113", "N114", "N115", "N116", "N116")
inst <- c("angers", "strasbourg", NA, "angers", "montpellier", NA, "rouen", "limoges", NA, "brest", "stanne", "aphp_psl", "stanne", "strasbourg", "clairval", "stanne", "stanne", "caen", NA, "brest")
dates <- c("2008-07-13", "2008-02-13", "2008-05-13", "2008-05-13", "2010-12-14", "2011-12-19", "2013-11-12", "2014-01-31", "2008-06-13", "2009-06-09", "2009-03-10", "2008-12-10", "2010-04-15", "2008-01-13", "2017-03-13", "2014-05-14", "2012-05-15", "2009-10-22", "2010-10-18", "2011-05-03")
df1 <- data.frame (id, inst, dates)
> df1
id inst dates
1 N101 angers 2008-07-13
2 N102 strasbourg 2008-02-13
3 N103 <NA> 2008-05-13
4 N103 angers 2008-05-13
5 N103 montpellier 2010-12-14
6 N103 <NA> 2011-12-19
7 N104 rouen 2013-11-12
8 N105 limoges 2014-01-31
9 N107 <NA> 2008-06-13
10 N107 brest 2009-06-09
11 N108 stanne 2009-03-10
12 N109 aphp_psl 2008-12-10
13 N110 stanne 2010-04-15
14 N111 strasbourg 2008-01-13
15 N112 clairval 2017-03-13
16 N113 stanne 2014-05-14
17 N114 stanne 2012-05-15
18 N115 caen 2009-10-22
19 N116 <NA> 2010-10-18
20 N116 brest 2011-05-03
在上面的MWE中,第3行3 N103 <NA> 2008-05-13
应删除并产生df:
id <- c("N101", "N102", "N103", "N103", "N103", "N104", "N105", "N107", "N107", "N108", "N109", "N110", "N111", "N112", "N113", "N114", "N115", "N116", "N116")
inst <- c("angers", "strasbourg", "angers", "montpellier", NA, "rouen", "limoges", NA, "brest", "stanne", "aphp_psl", "stanne", "strasbourg", "clairval", "stanne", "stanne", "caen", NA, "brest")
dates <- c("2008-07-13", "2008-02-13", "2008-05-13", "2010-12-14", "2011-12-19", "2013-11-12", "2014-01-31", "2008-06-13", "2009-06-09", "2009-03-10", "2008-12-10", "2010-04-15", "2008-01-13", "2017-03-13", "2014-05-14", "2012-05-15", "2009-10-22", "2010-10-18", "2011-05-03")
df2 <- data.frame (id, inst, dates)
> df2
id inst dates
1 N101 angers 2008-07-13
2 N102 strasbourg 2008-02-13
3 N103 angers 2008-05-13
4 N103 montpellier 2010-12-14
5 N103 <NA> 2011-12-19
6 N104 rouen 2013-11-12
7 N105 limoges 2014-01-31
8 N107 <NA> 2008-06-13
9 N107 brest 2009-06-09
10 N108 stanne 2009-03-10
11 N109 aphp_psl 2008-12-10
12 N110 stanne 2010-04-15
13 N111 strasbourg 2008-01-13
14 N112 clairval 2017-03-13
15 N113 stanne 2014-05-14
16 N114 stanne 2012-05-15
17 N115 caen 2009-10-22
18 N116 <NA> 2010-10-18
19 N116 brest 2011-05-03
你知道吗?
谢谢你的帮助。
有一些排序,使NAs排在最后,然后检查是否重复,以避免组操作:
o <- order(df1$id, df1$dates, is.na(df1$inst))
df1[o,][!duplicated(df1[o, c("id","dates")]),]
# id inst dates
#1 N101 angers 2008-07-13
#2 N102 strasbourg 2008-02-13
#4 N103 angers 2008-05-13
#5 N103 montpellier 2010-12-14
#6 N103 <NA> 2011-12-19
#7 N104 rouen 2013-11-12
#8 N105 limoges 2014-01-31
#9 N107 <NA> 2008-06-13
#10 N107 brest 2009-06-09
#11 N108 stanne 2009-03-10
#12 N109 aphp_psl 2008-12-10
#13 N110 stanne 2010-04-15
#14 N111 strasbourg 2008-01-13
#15 N112 clairval 2017-03-13
#16 N113 stanne 2014-05-14
#17 N114 stanne 2012-05-15
#18 N115 caen 2009-10-22
#19 N116 <NA> 2010-10-18
#20 N116 brest 2011-05-03
如果dplyr是先决条件,那么您可以调整它:
df1 %>%
arrange(id, dates, is.na(inst)) %>%
filter(!duplicated(select(., id, dates)))
与dplyr
:
library(dplyr)
df1 %>%
group_by(id, dates) %>%
filter(!(is.na(inst) & n() > 1L))
输出:
# A tibble: 19 x 3
# Groups: id, dates [19]
id inst dates
<fct> <fct> <fct>
1 N101 angers 2008-07-13
2 N102 strasbourg 2008-02-13
3 N103 angers 2008-05-13
4 N103 montpellier 2010-12-14
5 N103 NA 2011-12-19
6 N104 rouen 2013-11-12
7 N105 limoges 2014-01-31
8 N107 NA 2008-06-13
9 N107 brest 2009-06-09
10 N108 stanne 2009-03-10
11 N109 aphp_psl 2008-12-10
12 N110 stanne 2010-04-15
13 N111 strasbourg 2008-01-13
14 N112 clairval 2017-03-13
15 N113 stanne 2014-05-14
16 N114 stanne 2012-05-15
17 N115 caen 2009-10-22
18 N116 NA 2010-10-18
19 N116 brest 2011-05-03