我使用格式的水质数据
Station_Name, Sample_Date, Value, Parameter, Units
F1001 2/04/2020 5.6 NO3 mg/L
F1001 4/05/2020 3.9 NO3 mg/L
F1001 8/06/2020 2.7 NO3 mg/L
F1002 6/03/2020 1.7 NO3 mg/L
F1002 3/04/2020 2.5 NO3 mg/L
F1003 1/03/2020 4.9 NO3 mg/L
F1003 5/04/2020 1.5 NO3 mg/L
F1003 6/05/2020 3.1 NO3 mg/L
F1004 4/05/2020 9.3 NO3 mg/L
F1004 5/06/2020 3.6 NO3 mg/L
excel文件中还有1000多行数据。
我目前使用的程序可以将数据收集到数据库中。在这个程序中,他们包含了R-Console。我需要做的是获取所有数据,在每个站点(而不是整个数据库(上运行Mann-Kendall测试,并给我结果。第7、8和9行是通过双击程序中所需的字段获得的,有趣的是,它是根据我给数据库的数据创建的。
package_list <- c("ggplot2", "Kendall", "dplyr")
lapply(package_list, require, character.only = TRUE)
library(Kendall)
library(dplyr)
library(purrr)
Station_Name = c(STATION[Name,StationGroup(-1)])
SampleDate = c(SAMPLE[SampleDate,StationGroup(-1)])
NO3 = c(PARAMETER[NO3,Unit(mg/L),StationGroup(-1)])
#Add it to a Data Frame
df = data.frame(Station_Name, NO3, SampleDate)
cv <- function(val) {sd(val)/mean(val)}
MK <- function(x){
MKfun <- possibly(Kendall::MannKendall,
list(tau = NA_real_,
sl = NA_real_,
S = NA_real_,
D = NA_real_,
varS = NA_real_))
out <- MKfun(x)
class(out) <- "list"
data.frame(out)
}
df %>%
group_by(Station_Name) %>%
summarise(N = n(),
MEAN = mean (Value),
STDEV = sd (Value),
CoV = cv (Value),
MAX = max (Value),
Q75 = quantile(Value, 0.75),
Q50 = quantile(Value, 0.50),
Q25 = quantile(Value, 0.25),
MIN = min (Value),
MK(Value))
print("")
print(df)
当我运行脚本时,它会显示sl=NA_real_的状态错误,:意外的','(第24行(
> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Matrix products: default
locale:
[1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252
[3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C
[5] LC_TIME=English_United Kingdom.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] reshape2_1.4.4 purrr_0.3.3 dplyr_0.8.5 Kendall_2.2 readxl_1.3.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.4 rstudioapi_0.11 magrittr_1.5 tidyselect_1.0.0 R6_2.4.1
[6] rlang_0.4.5 fansi_0.4.1 stringr_1.4.0 plyr_1.8.6 tools_3.6.3
[11] utf8_1.1.4 cli_2.0.2 assertthat_0.2.1 tibble_2.1.3 crayon_1.3.4
[16] vctrs_0.2.4 glue_1.3.2 stringi_1.4.6 compiler_3.6.3 pillar_1.4.3
[21] cellranger_1.1.0 boot_1.3-24 pkgconfig_2.0.3
是否需要外部函数?否则,您可以留下一个可读性很强的dplyr代码。。
数据:
df <- tibble::tribble(~Station_Name, ~Sample_Date, ~Value, ~Parameter, ~Units,
"F1001", "2/04/2020", 5.6, "NO3", "mg/L",
"F1001", "4/05/2020", 3.9, "NO3", "mg/L",
"F1001", "8/06/2020", 2.7, "NO3", "mg/L",
"F1002", "6/03/2020", 1.7, "NO3", "mg/L",
"F1002", "3/04/2020", 2.5, "NO3", "mg/L",
"F1003", "1/03/2020", 4.9, "NO3", "mg/L",
"F1003", "5/04/2020", 1.5, "NO3", "mg/L",
"F1003", "6/05/2020", 3.1, "NO3", "mg/L",
"F1004", "4/05/2020", 9.3, "NO3", "mg/L",
"F1004", "5/06/2020", 3.6, "NO3", "mg/L")
解决方案:
library(dplyr)
cv <- function(val) {sd(val)/mean(val)}
df %>%
group_by(Station_Name) %>%
summarise(MEAN = mean (Value),
STDEV = sd (Value),
CoV = cv (Value),
MAX = max (Value),
Q75 = quantile(Value, 0.75),
Q50 = quantile(Value, 0.50),
Q25 = quantile(Value, 0.25),
MIN = min (Value))
# # A tibble: 4 x 9
# Station_Name MEAN STDEV CoV MAX Q75 Q50 Q25 MIN
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 F1001 4.07 1.46 0.358 5.6 4.75 3.9 3.3 2.7
# 2 F1002 2.1 0.566 0.269 2.5 2.3 2.1 1.9 1.7
# 3 F1003 3.17 1.70 0.537 4.9 4 3.1 2.3 1.5
# 4 F1004 6.45 4.03 0.625 9.3 7.88 6.45 5.03 3.6
***********编辑***************
对于MannKendall:
library(Kendall)
library(dplyr)
library(purrr)
MK <- function(x){
# this provides a fallback in case of error
MKfun <- possibly(Kendall::MannKendall,
list(tau = NA_real_,
sl = NA_real_,
S = NA_real_,
D = NA_real_,
varS = NA_real_))
out <- MKfun(x)
# force to list and then to df to integrate it with the final tibble
class(out) <- "list"
data.frame(out)
}
df %>%
group_by(Station_Name) %>%
summarise(MK(Value))
# # A tibble: 4 x 6
# Station_Name tau sl S D varS
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 F1001 -1 1 -3 3 3.67
# 2 F1002 NA NA NA NA NA
# 3 F1003 -0.333 1 -1 3 3.67
# 4 F1004 NA NA NA NA NA
你会看到一些NA,因为只有2个观测值,无法计算MK。在你的数据中,你不应该有这个问题。万一有退路。
正如你所看到的,我保留了函数自动计算的所有统计数据,因为我不知道你是只需要结果还是其他的。
所有
我把曼肯德尔和你的数据放在这里。
library(Kendall)
library(dplyr)
library(purrr)
cv <- function(val) {sd(val)/mean(val)}
MK <- function(x){
MKfun <- possibly(Kendall::MannKendall,
list(tau = NA_real_,
sl = NA_real_,
S = NA_real_,
D = NA_real_,
varS = NA_real_))
out <- MKfun(x)
class(out) <- "list"
data.frame(out)
}
df %>%
group_by(Station_Name) %>%
summarise(N = n(),
MEAN = mean (Value),
STDEV = sd (Value),
CoV = cv (Value),
MAX = max (Value),
Q75 = quantile(Value, 0.75),
Q50 = quantile(Value, 0.50),
Q25 = quantile(Value, 0.25),
MIN = min (Value),
MK(Value))
# # A tibble: 4 x 15
# Station_Name N MEAN STDEV CoV MAX Q75 Q50 Q25 MIN tau sl S D varS
# <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 F1001 3 4.07 1.46 0.358 5.6 4.75 3.9 3.3 2.7 -1 1 -3 3 3.67
# 2 F1002 2 2.1 0.566 0.269 2.5 2.3 2.1 1.9 1.7 NA NA NA NA NA
# 3 F1003 3 3.17 1.70 0.537 4.9 4 3.1 2.3 1.5 -0.333 1 -1 3 3.67
# 4 F1004 2 6.45 4.03 0.625 9.3 7.88 6.45 5.03 3.6 NA NA NA NA NA