我拥有多年的属性数据。然而,从2007年起,我只有该房产的地址。我想将该房产的地址(根据房产编号(复制/拖动到2007年之前的所有条目中。
样本数据低于
structure(list(year = c(1990L, 1992L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 2012L, 2013L, 2014L, 2015L, 1990L, 1992L, 1998L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L), property = c("1182 047",
"1182 047", "1182 047", "1182 047", "1182 047", "1182 047", "1182 047",
"1182 047", "1182 047", "1182 047", "1182 047", "1182 047", "1182 047",
"1182 047", "1182 047", "1182 047", "1182 047", "1182 047", "1182 047",
"1182 047", "5421 047", "5421 047", "5421 047", "5421 047", "5421 047",
"5421 047", "5421 047", "5421 047", "5421 047", "5421 047"),
street = c("", "", "", "", "", "", "", "", "", "", "", "",
"400 MINER RD", "400 MINER RD", "400 MINER RD", "400 MINER RD",
"400 MINER RD", "400 MINER RD", "400 MINER RD", "400 MINER RD",
"", "", "", "", "", "", "", "", "", ""), city = c("", "",
"", "", "", "", "", "", "", "", "", "", "ORINDA CA", "ORINDA CA",
"ORINDA CA", "ORINDA CA", "ORINDA CA", "ORINDA CA", "ORINDA CA",
"ORINDA CA", "", "", "", "", "", "", "", "", "", ""), city_overflow = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", ""),
zip = c("", "", "", "", "", "", "", "", "", "", "", "", "94563",
"94563", "94563", "94563", "94563", "94563", "94563", "94563",
"", "", "", "", "", "", "", "", "", ""), name = c("BARSAMIAN BETSY A",
"BARSAMIAN BETSY A", "BARSAMIAN BETSY A", "BARSAMIAN BETSY A",
"BARSAMIAN BETSY A", "BARSAMIAN BETSY A", "BARSAMIAN BETSY A",
"BARSAMIAN BETSY A", "BARSAMIAN BETSY A", "BARSAMIAN BETSY A",
"BARSAMIAN BETSY A", "BARSAMIAN BETSY A", "BARSAMIAN BETSY A",
"BARSAMIAN BETSY A", "BARSAMIAN BETSY A", "BARSAMIAN BETSY A",
"BARSAMIAN BETSY A", "BARSAMIAN BETSY A", "BARSAMIAN BETSY A",
"BARSAMIAN BETSY A", "ZHANG ZHI HAO & JIANG JIE YING", "ZHANG ZHI HAO & JIANG JIE YING",
"ZHANG ZHI HAO & JIANG JIE YING", "ZHANG ZHI HAO & JIANG JIE YING",
"ZHANG ZHI HAO & JIANG JIE YING", "ZHANG ZHI HAO & JIANG JIE YING",
"ZHANG ZHI HAO & JIANG JIE YING", "ZHANG ZHI HAO & JIANG JIE YING",
"ZHANG ZHI HAO & JIANG JIE YING", "ZHANG ZHI HAO & JIANG JIE YING"
), period = c("0000", "0000", "0000", "0000", "0000", "0106",
"0106", "0106", "0106", "0106", "0106", "0709", "0709", "0709",
"1014", "1014", "1014", "1014", "1014", "1500", "0000", "0000",
"0000", "0000", "0000", "0106", "0106", "0106", "0106", "0106"
), type = c("individual", "individual", "individual", "individual",
"individual", "individual", "individual", "individual", "individual",
"individual", "individual", "individual", "individual", "individual",
"individual", "individual", "individual", "individual", "individual",
"individual", "individual", "individual", "individual", "individual",
"individual", "individual", "individual", "individual", "individual",
"individual")), row.names = c(NA, -30L), class = c("data.table",
"data.frame"))
我希望每一行都包含与其属性号相关的地址,该属性号可以在2007年之后的所有数据条目中找到。
使用tidyr::fill()
这将是解决这类问题的一个很好的办法。我们首先需要确保,数据按property
和year
列dplyr::arrange()
排序为之工作。为了确保地址数据永远不会粘贴到另一个性质的细胞,我们使用dplyr::group_by()
。
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
prop_df %>%
arrange(property, -year) %>%
group_by(property) %>%
mutate_at(vars(street, city, zip), ~ifelse(. == "", NA, .)) %>%
fill(street, city, zip)
#> # A tibble: 30 x 9
#> year property street city city_overflow zip name period type
#> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 2015 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 1500 indivi…
#> 2 2014 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 1014 indivi…
#> 3 2013 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 1014 indivi…
#> 4 2012 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 1014 indivi…
#> 5 2011 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 1014 indivi…
#> 6 2010 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 1014 indivi…
#> 7 2009 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 0709 indivi…
#> 8 2008 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 0709 indivi…
#> 9 2007 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 0709 indivi…
#> 10 2006 1182 047 400 MINE… ORINDA… "" 94563 BARSAMIA… 0106 indivi…
#> # … with 20 more rows
另一种方法可能是首先为每个用dplyr
的select()
、filter()
和distinct()
和然后用CCD_ 10将其加入到原始数据集中。
library(dplyr, warn.conflicts = FALSE)
address_dict <-
prop_df %>%
select(property, street, city, zip) %>%
filter_at(vars(street, city, zip), ~. != "") %>%
distinct()
prop_df %>%
select(-c(street, city, zip)) %>%
full_join(address_dict)
#> Joining, by = "property"
#> # A tibble: 30 x 9
#> year property city_overflow name period type street city zip
#> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1990 1182 047 "" BARSAMIAN… 0000 individ… 400 MIN… ORIND… 94563
#> 2 1992 1182 047 "" BARSAMIAN… 0000 individ… 400 MIN… ORIND… 94563
#> 3 1998 1182 047 "" BARSAMIAN… 0000 individ… 400 MIN… ORIND… 94563
#> 4 1999 1182 047 "" BARSAMIAN… 0000 individ… 400 MIN… ORIND… 94563
#> 5 2000 1182 047 "" BARSAMIAN… 0000 individ… 400 MIN… ORIND… 94563
#> 6 2001 1182 047 "" BARSAMIAN… 0106 individ… 400 MIN… ORIND… 94563
#> 7 2002 1182 047 "" BARSAMIAN… 0106 individ… 400 MIN… ORIND… 94563
#> 8 2003 1182 047 "" BARSAMIAN… 0106 individ… 400 MIN… ORIND… 94563
#> 9 2004 1182 047 "" BARSAMIAN… 0106 individ… 400 MIN… ORIND… 94563
#> 10 2005 1182 047 "" BARSAMIAN… 0106 individ… 400 MIN… ORIND… 94563
#> # … with 20 more rows
使用data.table
和zoo::na.locf
library(data.table)
#Columns to replace values
cols <- c('street', 'city', 'city_overflow', 'zip')
#Replace empty values with NA
df[, (cols) := replace(.SD, .SD == '', NA), .SDcols = cols]
#Replace NA's values with next values.
df[,(cols) := lapply(.SD, zoo::na.locf, fromLast = TRUE),.SDcols = cols, property]