r语言 - 用 100+ 行的 3 个 SD 过滤 15 列的值



我有一个数据集,其中 15 列 col1 到 col15 是数字。我有 100 行数据,每行都附加了名称作为因素。我想为所有 15 列的每一行做一个摘要。

head(df2phcl[,c(1:16)])
col1 col2 col3 col4 col5 col6 col7 col8 col9 col10 col11 col12 col13 col14 col15             NAME
78   95  101  100   84   93   93   85   81    97    80    94    81    79   87         R04-001 
100   61   96   75   98   92   99   99  102    83    84    NA   101    93    96         R04-002 
81   84   82   83   77   86   90   92   92    78    86    91    59    80    84         R04-003 
91   84   87   95  103   93   92   95   86    92   107    96    94    87    97         R04-004 
72   79   66   98   84   75   85   83   75    80    91    65    90    81    73         R04-005 
72   75   68   44   79   64   83   71   81    82    85    63    87    94    60         R04-006 

我的代码是。

library(dplyr)
####Rachis
SUMCL <- df2phcl %>%
group_by(name) %>% 
summarise(CL = mean(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T),
CLMAX = max(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T),
CLMIN = min(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T),
CLSTD = sd(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T),
OUT = outliers(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T))
head(SUMCL)
tail(SUMCL)

我得到的分析结果是...

Error: 
Evaluation error: missing value where TRUE/FALSE needed.

我也试过这个...

df2phcl$col1+col2+col3+col4+col5+col6+col7+col8+col9+col10+col11+col12+col13+co114+col15[!df2phcl$col1+col2+col3+col4+col5+col6+col7+col8+col9+col10+col11+col12+col13+col14+col15%in%boxplot.stats(df2phcl$col1+col2+col3+col4+col5+col6+col7+col8+col9+col10+co111+col12+col13+col14+col15)$out]
This returns ....
Error: object 'col2' not found

不知道我做错了什么,这适用于平均值、最大值、最小值和 sd。

> head(SUMCL)
# A tibble: 6 x 11
# Groups:   ENTRY, NAME, HEADCODE, RHTGENES, HEAD, PL [6]
ENTRY             NAME HEADCODE RHTGENES   HEAD    PL    PH       CL CLMAX CLMIN     CLSTD
<int>           <fctr>   <fctr>   <fctr> <fctr> <dbl> <dbl>    <dbl> <dbl> <dbl>     <dbl>
1     1         R04-001       CAW     Rht1   Club   319    83 88.53333   101    78  7.989875
2     2         R04-002       LBW Wildtype Common   330   102 91.35714   102    61 11.770936
3     3         R04-003       CBW     Rht2   Club   230    82 83.00000    92    59  8.220184
4     4         R04-004       LBW     Rht1 Common   328   117 93.26667   107    84  6.192930
5     5         R04-005       CBW     Rht1   Club   280    97 79.80000    98    65  9.182281
6     6         R04-006       LAW     Rht1 Common   310    92 73.86667    94    44 12.749603

我只是想过滤 3 sd 或更高的异常值,然后使用 dplyr 打包进行统计......

我不确定您要做什么,所以请告诉我下面的代码是否在正确的轨道上。

下面的方法是将数据从宽格式转换为长格式,这使得为每个级别的name进行摘要变得更加容易。

library(tidyverse)
# Fake data
set.seed(2)
dat = as.data.frame(replicate(15, rnorm(100)))
names(dat) = paste0("col", 1:15)
dat$name = paste0(rep(LETTERS[1:10], each=10), rep(letters[1:10], 10))
# Convert data to long format, remove outliers and summarize
dat %>% 
gather(column, value, -name) %>%  # reshape from wide to long
group_by(name) %>%                # summarize by name
mutate(value = replace(value, abs(value - mean(value)) > 2*sd(value), NA)) %>%  # set outliers to NA
summarise(mean = mean(value, na.rm=TRUE),
max = max(value, na.rm=TRUE),
sd = sd(value, na.rm=TRUE))
name         mean      max        sd
1    Aa  0.007848188 1.238744 0.8510016
2    Ab -0.208536464 1.980401 1.2764606
3    Ac -0.152986713 1.587845 0.8443106
4    Ad -0.413543054 0.965692 0.7225872
5    Ae -0.112648322 1.178716 0.7269527
6    Af  0.442268890 2.048040 1.0350119
7    Ag  0.390627994 1.978260 0.8716681
8    Ah  0.080505879 2.396349 1.3128403
9    Ai  0.257925059 1.984474 1.0196722
10    Aj  0.137469703 1.470177 0.7192616
# ... with 90 more rows

我设法更改了一些col std.dev.;但是,我不确定它花了多少观察结果。我想以均匀的数量从分布的顶部和底部取出。就像修剪均值一样,它将从分布的顶部和底部取出 20% 的 obs。我好奇的是只留下分布顶部和底部(+-3 SD(的观察结果。

> SUMCL <- df2phcl %>% 
+   gather(column, value, -c(ENTRY, NAME, HEADCODE, RHTGENES, HEAD,PL,PH)) %>%  # reshape from wide to long
+   group_by(ENTRY, NAME, HEADCODE, RHTGENES, HEAD,PL,PH) %>%                # summarize by name
+   mutate(value = replace(value, abs(value - mean(value)) > 2*sd(value), NA)) %>%  # set outliers to NA
+   summarise(CL = mean(value, na.rm=TRUE),
+             CLMAX = max(value, na.rm=TRUE),
+             CLMIN = min(value, na.rm=TRUE),
+             N = sum(!is.na(value), na.rm=TRUE),
+             CLSTD= sd(value, na.rm=TRUE),
+             CLSE = (CLSTD / sqrt(N)))
> head(SUMCL)
# A tibble: 6 x 13
# Groups:   ENTRY, NAME, HEADCODE, RHTGENES, HEAD, PL [6]
ENTRY             NAME HEADCODE RHTGENES   HEAD    PL    PH       CL CLMAX CLMIN     N     CLSTD     CLSE
<int>           <fctr>   <fctr>   <fctr> <fctr> <dbl> <dbl>    <dbl> <dbl> <dbl> <int>     <dbl>    <dbl>
1     1         R04-001       CAW     Rht1   Club   319    83 88.53333   101    78    15  7.989875 2.062977
2     2         R04-002       LBW Wildtype Common   330   102 91.35714   102    61    14 11.770936 3.145915
3     3         R04-003       CBW     Rht2   Club   230    82 84.71429    92    77    14  5.029583 1.344213
4     4         R04-004       LBW     Rht1 Common   328   117 92.28571   103    84    14  5.075258 1.356420
5     5         R04-005       CBW     Rht1   Club   280    97 79.80000    98    65    15  9.182281 2.370855
6     6         R04-006       LAW     Rht1 Common   310    92 76.00000    94    60    14 10.076629 2.693093

最新更新