我正试图根据纳入和排除标准清理我的数据。索引和过滤函数都给了我32个变量的3787个观察值。但是,当我使用same()或all.equal()检查时,它们是不相同的。我遗漏了什么?我认为这与filter()有关。我花了一段时间才弄清楚我必须在filter()中显式地声明NA。
是否与"filter()只包含条件为TRUE的行;它排除了FALSE和NA值。如果您想保留缺失的值,请显式请求它们…"https://github.com/tidyverse/dplyr/issues/3196
analysis1 <- nhanes[(nhanes$Age >= 30 & nhanes$Age <= 79) &
!(nhanes$Cancer == 1 | nhanes$Cancer == 7 | nhanes$Cancer == 9) &
!(nhanes$Sex == 1 & (nhanes$WTkg > 160 | nhanes$WTkg < 50)) &
!(nhanes$Sex == 2 & (nhanes$WTkg > 150 | nhanes$WTkg < 45)) &
!(nhanes$HTcm > 190),]
nhanes1 <- nhanes %>%
filter((is.na(Age) | (Age >= 30 & Age <= 79 )) &
((is.na(Cancer) | !(Cancer == 1 | Cancer == 7 | Cancer == 9))) &
((is.na(Sex & WTkg) | !(Sex == 1 & (WTkg > 160 | WTkg < 50)))) &
((is.na(Sex & WTkg) | !(Sex == 2 & (WTkg > 150 | WTkg < 45)))) &
((is.na(HTcm) | !(HTcm > 190))))
identical(analysis1, nhanes1)
all.equal(analysis1, nhanes1)
输出:
[1] FALSE
[1] "Attributes: < Component “row.names”: Modes: character, numeric >"
[2] "Attributes: < Component “row.names”: target is character, current is numeric >"
[3] "Component “ID”: 'is.NA' value mismatch: 0 in current 229 in target"
[4] "Component “WBC”: 'is.NA' value mismatch: 334 in current 373 in target"
[5] "Component “RBC”: 'is.NA' value mismatch: 334 in current 373 in target"
[6] "Component “HGB”: 'is.NA' value mismatch: 334 in current 373 in target"
[7] "Component “HCT”: 'is.NA' value mismatch: 334 in current 373 in target"
[8] "Component “MCV”: 'is.NA' value mismatch: 334 in current 373 in target"
[9] "Component “MCH”: 'is.NA' value mismatch: 334 in current 373 in target"
[10] "Component “HbA1c”: 'is.NA' value mismatch: 340 in current 379 in target"
[11] "Component “HDL”: 'is.NA' value mismatch: 394 in current 433 in target"
[12] "Component “CRP”: 'is.NA' value mismatch: 408 in current 446 in target"
[13] "Component “TCHOL”: 'is.NA' value mismatch: 394 in current 433 in target"
[14] "Component “TRIG”: 'is.NA' value mismatch: 2141 in current 2163 in target"
[15] "Component “LDL”: 'is.NA' value mismatch: 2161 in current 2183 in target"
[16] "Component “Sex”: 'is.NA' value mismatch: 0 in current 229 in target"
[17] "Component “Age”: 'is.NA' value mismatch: 0 in current 229 in target"
[18] "Component “Race2”: 'is.NA' value mismatch: 0 in current 229 in target"
[19] "Component “Mstatus”: 'is.NA' value mismatch: 0 in current 229 in target"
[20] "Component “Fpoverty”: 'is.NA' value mismatch: 542 in current 698 in target"
[21] "Component “Income”: 'is.NA' value mismatch: 390 in current 556 in target"
[22] "Component “WTkg”: 'is.NA' value mismatch: 223 in current 229 in target"
[23] "Component “HTcm”: 'is.NA' value mismatch: 225 in current 229 in target"
[24] "Component “WAISTcm”: 'is.NA' value mismatch: 370 in current 376 in target"
[25] "Component “HIPcm”: 'is.NA' value mismatch: 362 in current 368 in target"
[26] "Component “SBP”: 'is.NA' value mismatch: 438 in current 470 in target"
[27] "Component “DBP”: 'is.NA' value mismatch: 438 in current 470 in target"
[28] "Component “CHF”: 'is.NA' value mismatch: 0 in current 229 in target"
[29] "Component “CHD”: 'is.NA' value mismatch: 0 in current 229 in target"
[30] "Component “MI”: 'is.NA' value mismatch: 0 in current 229 in target"
[31] "Component “Stroke”: 'is.NA' value mismatch: 0 in current 229 in target"
[32] "Component “COPD”: 'is.NA' value mismatch: 0 in current 229 in target"
[33] "Component “Cancer”: 'is.NA' value mismatch: 0 in current 229 in target"
数据有问题。如果我检查两个数据集中相同变量之间的相关系数,它们是不同的。
res_cor <- analysis1 %>%
cor_mat(Income, HGB, HCT, MCV, CRP, HDL, LDL, method = "spearman")
res_cor
res_cor1 <- nhanes1 %>%
cor_mat(Income, HGB, HCT, MCV, CRP, HDL, LDL, method = "spearman")
res_cor1
输出:
rowname
<chr>
Income HGB HCT MCV CRP HDL LDL
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Income 1.000 0.055 0.039 -0.060 -0.150 0.087 0.014
2 HGB 0.055 1.000 0.970 0.220 -0.120 -0.250 0.120
3 HCT 0.039 0.970 1.000 0.180 -0.097 -0.230 0.130
4 MCV -0.060 0.220 0.180 1.000 -0.150 0.150 -0.022
5 CRP -0.150 -0.120 -0.097 -0.150 1.000 -0.200 0.074
6 HDL 0.087 -0.250 -0.230 0.150 -0.200 1.000 -0.011
7 LDL 0.014 0.120 0.130 -0.022 0.074 -0.011 1.000
rowname
<chr>
Income HGB HCT MCV CRP HDL LDL
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Income 1.000 0.052 0.035 -0.057 -0.150 0.0910 0.0130
2 HGB 0.052 1.000 0.970 0.220 -0.120 -0.2500 0.1200
3 HCT 0.035 0.970 1.000 0.180 -0.099 -0.2300 0.1200
4 MCV -0.057 0.220 0.180 1.000 -0.150 0.1500 -0.0230
5 CRP -0.150 -0.120 -0.099 -0.150 1.000 -0.2000 0.0760
6 HDL 0.091 -0.250 -0.230 0.150 -0.200 1.0000 -0.0078
7 LDL 0.013 0.120 0.120 -0.023 0.076 -0.0078 1.0000
试试这个:首先对列进行排序,然后使用identical
:下面是一个mtcars数据集的示例,其中有和没有排序的数据:
或者你可以像@Gregor Thomas在这里提出的那样使用setidff
比较R中的2个数据名称是否相等
mtcars1 <- mtcars %>%
arrange(cyl)
identical(mtcars, mtcars)
identical(mtcars1, mtcars)
library(dplyr)
nrow(setdiff(mtcars, mtcars1)) == 0 & nrow(setdiff(mtcars1, mtcars)) == 0
> identical(mtcars, mtcars)
[1] TRUE
> mtcars1 <- mtcars %>%
+ arrange(cyl)
> identical(mtcars1, mtcars)
[1] FALSE
> nrow(setdiff(mtcars, mtcars1)) == 0 & nrow(setdiff(mtcars1, mtcars)) == 0
[1] TRUE