我需要清理一系列大型数据集,其中一列是日期向量。根据年份的不同,这个变量需要采用不同的清洗方式。
我写了一个函数,但它非常慢,尤其是因为我必须配合 rowwise() 逐行调用它。
我该如何做得更好?
下面就是这个复杂的清洗函数:
# Left-pad each string with "0" up to `width` characters.
# Longer strings are returned unchanged and NA stays NA.
.pad_zero_left <- function(x, width) {
  x <- as.character(x)
  n_missing <- width - nchar(x)
  n_missing[is.na(n_missing) | n_missing < 0L] <- 0L
  padded <- paste0(strrep("0", n_missing), x)
  padded[is.na(x)] <- NA_character_
  padded
}

# Replace every backslash or forward slash with "0".
# The original pattern "\\|/" only matched the literal sequence "|/",
# so slashes were never replaced.
.slashes_to_zero <- function(x) {
  gsub("\\\\|/", "0", x)
}

# Where characters `from`..`to` of an element equal `equals`, rebuild that
# element as: characters 1..keep_to, then `insert`, then characters
# tail_from..8. NA elements are left untouched.
.splice_where <- function(x, from, to, equals, keep_to, insert, tail_from) {
  hit <- which(substr(x, from, to) == equals)
  x[hit] <- paste0(
    substr(x[hit], 1L, keep_to),
    insert,
    substr(x[hit], tail_from, 8L)
  )
  x
}

# Rule set for year <= 2010: zero-pad to 8 characters, parse as ddmmYYYY.
.clean_date_upto_2010 <- function(x) {
  as.Date(.pad_zero_left(x, 8L), format = "%d%m%Y")
}

# Rule set for year == 2011: values are parsed as YYYYmmdd after repairs.
.clean_date_2011 <- function(x) {
  x <- .pad_zero_left(x, 6L)
  # A leading "02" becomes "20".
  x <- .splice_where(x, 1L, 2L, "02", 0L, "20", 3L)
  # Month "00" becomes "01"; repeated after the slash replacement because
  # that step can create new "00" months.
  x <- .splice_where(x, 5L, 6L, "00", 4L, "01", 7L)
  x <- .slashes_to_zero(x)
  x <- .splice_where(x, 5L, 6L, "00", 4L, "01", 7L)
  # Month-day "0229", "0230" and "0231" are replaced by "0131".
  hit <- which(substr(x, 5L, 8L) %in% c("0229", "0230", "0231"))
  x[hit] <- paste0(substr(x[hit], 1L, 4L), "0131")
  as.Date(x, format = "%Y%m%d")
}

# Rule set for every other year: values are parsed as ddmmYYYY after repairs.
.clean_date_other <- function(x) {
  x <- as.character(x)
  # Expand abbreviated years that follow one or two slashes.
  x <- .splice_where(x, 4L, 6L, "//1", 3L, "/19", 7L)
  x <- .splice_where(x, 4L, 6L, "//2", 3L, "/20", 7L)
  x <- .splice_where(x, 5L, 6L, "/9", 4L, "19", 7L)
  x <- .splice_where(x, 5L, 6L, "/0", 4L, "20", 7L)
  x <- .slashes_to_zero(x)
  # Day "00" becomes "01"; month "00" becomes "01".
  x <- .splice_where(x, 1L, 2L, "00", 0L, "01", 3L)
  x <- .splice_where(x, 3L, 4L, "00", 2L, "01", 5L)
  # NOTE(review): the month is overwritten with "05" for every element,
  # which makes the month fix above redundant. Kept exactly as in the
  # original; confirm this is intended for the data.
  x <- paste0(substr(x, 1L, 2L), "05", substr(x, 5L, 8L))
  as.Date(x, format = "%d%m%Y")
}

#' Clean raw date values using year-specific rules
#'
#' Vectorised over both arguments, so it can be called on whole columns
#' without `rowwise()` or grouping. Uses base R only.
#'
#' @param date Vector of raw date values (character, or coercible to it).
#' @param year Year selecting the rule set for each element of `date`:
#'   `<= 2010`, `== 2011`, or anything else. Must have length 1 (recycled)
#'   or the same length as `date`.
#' @return A `Date` vector the same length as `date`. Elements that cannot
#'   be parsed, or whose `year` is `NA`, are `NA`.
cleandate_fn <- function(date, year) {
  n <- length(date)
  if (length(year) != 1L && length(year) != n) {
    stop(
      "`year` must have length 1 or the same length as `date`.",
      call. = FALSE
    )
  }
  year <- rep_len(year, n)

  # One logical mask per rule set; `is_other` mirrors the original `else`.
  is_early <- !is.na(year) & year <= 2010
  is_2011 <- !is.na(year) & year == 2011
  is_other <- !is.na(year) & !is_early & !is_2011

  out <- structure(rep(NA_real_, n), class = "Date")
  out[is_early] <- .clean_date_upto_2010(date[is_early])
  out[is_2011] <- .clean_date_2011(date[is_2011])
  out[is_other] <- .clean_date_other(date[is_other])
  out
}
玩具示例:
# Toy data: five raw date strings, all tagged with the same year (2010).
data <- data.frame(
  date = c("19052003", "29062012", "008//210", "05/2/001", "01011980"),
  year = rep(2010, 5)
)

# Clean the dates one row at a time; the year passed to the function is
# the single distinct value found in the global `data$year`.
data <- mutate(
  rowwise(data),
  date = cleandate_fn(date, unique(data$year))
)
按 year 分组,然后把每组 year 的第一个元素作为函数的输入参数,可能会更简单:
library(dplyr)
# Group by year so that each call to cleandate_fn() receives one group's
# dates together with a single scalar year (the group's first value).
df1 %>%
  group_by(year) %>%
  mutate(datenew = cleandate_fn(date, year[1L])) %>%
  ungroup()
输出:
# A tibble: 5 × 3
date year datenew
<chr> <dbl> <date>
1 19052003 2010 2003-05-19
2 29062012 2010 2012-06-29
3 008//210 2010 NA
4 05/2/001 2010 NA
5 01011980 2010 1980-01-01