我有一个来自美国地质调查局(USGS)网站的数据框,该网站的站点标识符(site_no)带有前导零。为了做进一步的分析,我需要保留 site_no 的前导零;问题在于 site_no 必须是 numeric 类,而不是 character 类。下面是一个重现该问题的示例数据框。
# Sample data: four sites with five daily water-temperature readings each.
# The last site id ("104") deliberately lacks the leading zero.
x <- c("site_no", "Date", "Wtemp")
df1 <- data.frame(
  rep(c("0101", "0102", "0103", "104"), each = 5),
  rep(
    seq(from = as.Date("2020-01-01"), to = as.Date("2020-01-05"), by = 1),
    times = 4
  ),
  c(
    10, NA, NA, NA, 15,
    20, NA, NA, 10, 16,
    2, 4, 6, 8, 10,
    12, 14, 16, 18, 20
  ),
  stringsAsFactors = FALSE
)
names(df1) <- x
我尝试了以下方法,但都没有成功:
# Attempt 1: coerce to numeric. The column becomes numeric, but the leading
# zero that needs to be maintained is dropped.
df1[["site_no"]] <- as.numeric(df1[["site_no"]])

library(dplyr)

# Attempt 2: prepend "0" on the first 15 rows. This brings the leading zero
# back, but the column ends up as class character.
df1 <- mutate(
  df1,
  site_no = ifelse(row_number() <= 15, paste0("0", site_no), site_no)
)
一种方法是使用 formattable 包中的 comma:
library(dplyr)
library(formattable)
# Convert site_no to numeric and wrap it with formattable::comma() so that it
# is displayed zero-padded to width 4 while still behaving as a number.
# `format` is spelled out in full rather than relying on partial argument
# matching (`f = "d"`).
out <- df1 %>%
  mutate(
    site_no = comma(
      as.numeric(site_no),
      width = 4, flag = "0", digits = 0, format = "d", big.mark = ""
    )
  )
# The column is still numeric ...
is.numeric(out$site_no)
#[1] TRUE
# ... prints with the leading zero ...
out$site_no
#[1] 0101 0101 0101 0101 0101 0102 0102 0102 0102 0102 0103 0103 0103 0103 0103 0104 0104 0104 0104 0104
# ... and arithmetic works on the underlying values.
sum(out$site_no)
#[1] 2050
输出如下:
# Print the resulting data frame; the console output is reproduced below
# (site_no is shown with its leading zero).
out
# site_no Date Wtemp
#1 0101 2020-01-01 10
#2 0101 2020-01-02 NA
#3 0101 2020-01-03 NA
#4 0101 2020-01-04 NA
#5 0101 2020-01-05 15
#6 0102 2020-01-01 20
#7 0102 2020-01-02 NA
#8 0102 2020-01-03 NA
#9 0102 2020-01-04 10
#10 0102 2020-01-05 16
#11 0103 2020-01-01 2
#12 0103 2020-01-02 4
#13 0103 2020-01-03 6
#14 0103 2020-01-04 8
#15 0103 2020-01-05 10
#16 0104 2020-01-01 12
#17 0104 2020-01-02 14
#18 0104 2020-01-03 16
#19 0104 2020-01-04 18
#20 0104 2020-01-05 20
或者,也可以让该列保持为 numeric,并为它创建 label 属性:
library(labelled)
library(stringr) # provides str_pad(), used below to build the label names

# Keep site_no as a numeric column and attach the zero-padded identifiers as
# value labels: the label names are the padded strings, the values are the
# numeric site numbers.
out <- df1 %>%
  mutate(
    site_no = as.numeric(site_no),
    site_no = labelled(
      site_no,
      setNames(
        unique(site_no),
        str_pad(unique(site_no), width = 4, pad = "0")
      )
    )
  )
str(out)
#'data.frame': 20 obs. of 3 variables:
# $ site_no: dbl+lbl [1:20] 101, 101, 101, 101, 101, 102, 102, 102, 102, 102, 103, 103, 103, 103, 103, 104, 104, 104, 104, 104
# ..@ labels: Named num 101 102 103 104
# .. ..- attr(*, "names")= chr [1:4] "0101" "0102" "0103" "0104"
# $ Date : Date, format: "2020-01-01" "2020-01-02" "2020-01-03" "2020-01-04" ...
# $ Wtemp : num 10 NA NA NA 15 20 NA NA 10 16 ...
或者,我们可以使用 str_pad,让该列保持为 character 类型:
library(stringr)
# Left-pad site_no with "0" to a width of 4; the column is a character vector.
mutate(
  df1,
  site_no = str_pad(site_no, width = 4, side = "left", pad = "0")
)
输出如下:
# site_no Date Wtemp
#1 0101 2020-01-01 10
#2 0101 2020-01-02 NA
#3 0101 2020-01-03 NA
#4 0101 2020-01-04 NA
#5 0101 2020-01-05 15
#6 0102 2020-01-01 20
#7 0102 2020-01-02 NA
#8 0102 2020-01-03 NA
#9 0102 2020-01-04 10
#10 0102 2020-01-05 16
#11 0103 2020-01-01 2
#12 0103 2020-01-02 4
#13 0103 2020-01-03 6
#14 0103 2020-01-04 8
#15 0103 2020-01-05 10
#16 0104 2020-01-01 12
#17 0104 2020-01-02 14
#18 0104 2020-01-03 16
#19 0104 2020-01-04 18
#20 0104 2020-01-05 20
或者使用 base R:先转换为数字,再用 sprintf 格式化:
# Base R alternative: convert to numeric, then format as a zero-padded,
# 4-wide integer string with sprintf().
df1[["site_no"]] <- sprintf("%04d", as.numeric(df1[["site_no"]]))