R语言 - 使用一组规则验证多个列



我刚刚开始熟悉validate包。不幸的是,一开始我就遇到了一个问题,我找不到正确的解决方案。我想创建一个稍后可以应用于多个变量的验证规则。我会用一个例子来说明。我有这样一个tibble:

library(tidyverse)
library(validate)
# Example data: 10 ids x 20 variables (v1..v20) filled with standard-normal
# draws. The values are generated in long form (one row per id/variable pair)
# and then spread so that each variable becomes its own column.
df <- pivot_wider(
  tibble(
    id = rep(seq_len(10), each = 20),
    name = rep(paste0("v", seq_len(20)), times = 10),
    value = rnorm(length(id))
  ),
  names_from = name,
  values_from = value
)

输出

# A tibble: 10 x 21
id     v1     v2      v3      v4     v5     v6      v7       v8      v9    v10
<int>  <dbl>  <dbl>   <dbl>   <dbl>  <dbl>  <dbl>   <dbl>    <dbl>   <dbl>  <dbl>
1     1  1.20   0.182 -1.53    2.73   -1.60  -0.976 -0.767  -2.28    -0.257   0.736
2     2  0.484  0.913 -0.873  -0.801   0.172  1.11  -1.71    0.0125   0.0450  0.374
3     3 -0.604 -0.405  0.482   0.998  -0.634  0.212  0.717   0.598   -0.876   0.139
4     4 -0.324 -1.83   0.0195 -1.70    0.506 -0.139  3.21   -0.00169 -0.200  -1.03 
5     5  0.268  1.40   0.349   0.667   1.76   0.926 -1.09   -0.487    2.03    0.203
6     6  0.646  0.516  0.849  -0.619  -2.18   0.126 -0.0956 -0.471    0.0342  0.530
7     7 -1.03  -1.27  -0.0716 -2.13   -0.340  1.20   0.746  -0.366   -2.82   -0.431
8     8  0.415  0.313  0.591  -0.0552  0.132  1.86  -0.427   0.390   -0.506  -0.470
9     9  0.309  1.13  -0.472   0.760  -0.549 -0.954 -0.219  -0.653    0.335  -0.870
10    10  1.06   1.30   1.12    0.646   0.279 -1.45  -0.891  -0.278    0.637   0.236
# ... with 10 more variables: v11 <dbl>, v12 <dbl>, v13 <dbl>, v14 <dbl>, v15 <dbl>,
#   v16 <dbl>, v17 <dbl>, v18 <dbl>, v19 <dbl>, v20 <dbl>

我可以使用以下规则验证一个变量:

# Validate a single column (v1): it must be numeric, must not exceed 10,
# must not fall below -10, and must not contain missing values.
summary(
  confront(
    df,
    validator(
      num.val = is.numeric(v1),
      big.val = !(v1 > 10),
      low.val = !(v1 < -10),
      NA.val = !is.na(v1)
    )
  )
)
#      name items passes fails nNA error warning     expression
# 1 num.val     1      1     0   0 FALSE   FALSE is.numeric(v1)
# 2 big.val    10     10     0   0 FALSE   FALSE       v1 <= 10
# 3 low.val    10     10     0   0 FALSE   FALSE      v1 >= -10
# 4  NA.val    10     10     0   0 FALSE   FALSE     !is.na(v1)

但是,我想使用一些简单的符号将此规则应用于多个列。不幸的是,下面的代码不能工作。

# Attempt to apply the same four rules to columns v1..v20 at once.
# NOTE(review): this does NOT validate each column. The summary printed after
# this snippet reports a single item per rule and warning = TRUE, so `v1:v20`
# is not being interpreted as a column range. Presumably it is evaluated as
# base R's `:` sequence operator on the two columns -- TODO confirm.
df %>% 
confront(
validator(
num.val = is.numeric(v1:v20),
big.val = !(v1:v20>10),
low.val = !(v1:v20< -10),
NA.val = !is.na(v1:v20)
)
) %>% summary()
#      name items passes fails nNA error warning         expression
# 1 num.val     1      1     0   0 FALSE    TRUE is.numeric(v1:v20)
# 2 big.val     1      1     0   0 FALSE    TRUE       v1:v20 <= 10
# 3 low.val     1      1     0   0 FALSE    TRUE      v1:v20 >= -10
# 4  NA.val     1      1     0   0 FALSE    TRUE     !is.na(v1:v20)

我知道我总是可以把我的数据转换成长格式。

# Reshape v1..v20 into long form (columns `name` and `value`) and validate
# the single `value` column. All variables are checked together, so the
# summary cannot tell which variable a failure came from.
summary(
  confront(
    pivot_longer(df, cols = v1:v20),
    validator(
      num.val = is.numeric(value),
      big.val = !(value > 10),
      low.val = !(value < -10),
      NA.val = !is.na(value)
    )
  )
)
#      name items passes fails nNA error warning        expression
# 1 num.val     1      1     0   0 FALSE   FALSE is.numeric(value)
# 2 big.val   200    200     0   0 FALSE   FALSE       value <= 10
# 3 low.val   200    200     0   0 FALSE   FALSE      value >= -10
# 4  NA.val   200    200     0   0 FALSE   FALSE     !is.na(value)

但是,在这种情况下,我将无法确定验证在哪个变量中失败。

关于如何轻松地将一个验证规则应用于许多选定变量,有什么建议吗?

如果在OP的代码中,于pivot_longer之后加上group_split(按name拆分),再对每一组分别进行验证,应该就可以了:

library(purrr)
library(dplyr)
library(tidyr)
# Validate every variable separately: reshape to long form, split the rows
# into one tibble per `name`, and run the same rule set on each piece.
# `out` is a list holding one validation summary per variable, accessed by
# position (out[[1]], out[[2]], ...).
out <- df %>%
  pivot_longer(cols = v1:v20) %>%
  group_split(name) %>%
  map(function(one_var) {
    checked <- confront(
      one_var,
      validator(
        num.val = is.numeric(value),
        big.val = !(value > 10),
        low.val = !(value < -10),
        NA.val = !is.na(value)
      )
    )
    summary(checked)
  })

输出如下:

> out[1:4]
[[1]]
name items passes fails nNA error warning        expression
1 num.val     1      1     0   0 FALSE   FALSE is.numeric(value)
2 big.val    10     10     0   0 FALSE   FALSE       value <= 10
3 low.val    10     10     0   0 FALSE   FALSE      value >= -10
4  NA.val    10     10     0   0 FALSE   FALSE     !is.na(value)
[[2]]
name items passes fails nNA error warning        expression
1 num.val     1      1     0   0 FALSE   FALSE is.numeric(value)
2 big.val    10     10     0   0 FALSE   FALSE       value <= 10
3 low.val    10     10     0   0 FALSE   FALSE      value >= -10
4  NA.val    10     10     0   0 FALSE   FALSE     !is.na(value)
[[3]]
name items passes fails nNA error warning        expression
1 num.val     1      1     0   0 FALSE   FALSE is.numeric(value)
2 big.val    10     10     0   0 FALSE   FALSE       value <= 10
3 low.val    10     10     0   0 FALSE   FALSE      value >= -10
4  NA.val    10     10     0   0 FALSE   FALSE     !is.na(value)
[[4]]
name items passes fails nNA error warning        expression
1 num.val     1      1     0   0 FALSE   FALSE is.numeric(value)
2 big.val    10     10     0   0 FALSE   FALSE       value <= 10
3 low.val    10     10     0   0 FALSE   FALSE      value >= -10
4  NA.val    10     10     0   0 FALSE   FALSE     !is.na(value)

下面这种方式利用了validate包自身的语法:用 . 代表整个数据集。但num.val得到的结果不同,因为is.numeric(.)检查的是整个数据框本身,而不是其中的各列。我查阅了《The Data Validation Cookbook》(validate包的文档),但没有找到选择多列的简单方法。

# Validate all value columns at once by dropping `id` and writing the rules
# against `.`, which (per the surrounding text) stands for the whole data set
# inside a validate rule.
# The summary printed after this snippet shows 200 items for big.val, low.val
# and NA.val, i.e. every cell is checked, but results are not reported per
# column.
# num.val is evaluated once on the whole data set and fails (1 item, 1 fail),
# unlike the per-column is.numeric() checks in the earlier snippets.
df %>% 
select(-id) %>%
confront(
validator(
num.val = is.numeric(.),
big.val = !(.>10),
low.val = !(.< -10),
NA.val = !is.na(.)
)
) %>% summary() 
name items passes fails nNA error warning    expression
1 num.val     1      0     1   0 FALSE   FALSE is.numeric(.)
2 big.val   200    200     0   0 FALSE   FALSE       . <= 10
3 low.val   200    200     0   0 FALSE   FALSE      . >= -10
4  NA.val   200    200     0   0 FALSE   FALSE     !is.na(.)