我对54名参与者进行了单词学习实验。实验设计是:每位参与者学习12个同源词(cognate)和12个非同源词(non-cognate)。但是,我不得不从数据集中删除一些观测,最终剩下1591条同源词观测和1816条非同源词观测。
现在,我想计算同源词得分与非同源词得分之间的相关性。这意味着我必须把1816条非同源词观测减少到1591条,因为计算相关性需要两组样本数量相等。
我可以直接删除第1592–1816行,但这并不理想,因为那样会丢失靠后参与者的全部非同源词数据(数据按参与者编号排序)。
我更希望做的是遍历所有参与者,对每位参与者删除恰好足够多的"多余"非同源词观测,使该参与者的同源词数量与非同源词数量相同。
下面是2名参与者的数据示例(请注意,单词是在多个时间点测试的,理想情况下删除数据时也应考虑这一点):
structure(list(Participant = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "34", "35", "36", "37",
"38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48",
"49", "50", "51", "52", "54", "55", "56", "57"), class = "factor"),
Word = structure(c(5L, 77L, 23L, 40L, 30L, 8L, 73L, 28L,
48L, 44L, 58L, 69L, 50L, 57L, 45L, 6L, 56L, 53L, 63L, 65L,
77L, 5L, 40L, 23L, 30L, 8L, 28L, 73L, 48L, 58L, 44L, 50L,
69L, 57L, 45L, 56L, 6L, 63L, 53L, 65L, 23L, 30L, 40L, 5L,
8L, 77L, 73L, 48L, 28L, 57L, 69L, 58L, 50L, 45L, 44L, 53L,
65L, 6L, 63L, 56L, 5L, 40L, 8L, 77L, 30L, 23L, 28L, 48L,
73L, 57L, 45L, 50L, 69L, 58L, 44L, 63L, 53L, 56L, 6L, 16L,
13L, 81L, 82L, 52L, 1L, 12L, 75L, 55L, 78L, 70L, 66L, 80L,
83L, 64L, 68L, 25L, 47L, 11L, 26L, 4L, 19L, 36L, 13L, 16L,
82L, 81L, 52L, 1L, 75L, 12L, 78L, 55L, 70L, 80L, 66L, 64L,
83L, 68L, 25L, 11L, 47L, 4L, 26L, 19L, 36L, 13L, 16L, 1L,
82L, 52L, 81L, 78L, 12L, 75L, 55L, 70L, 80L, 66L, 64L, 83L,
68L, 25L, 4L, 11L, 47L, 36L, 19L, 26L), .Label = c("aambeeld",
"bezem", "brandblusser", "broodrooster", "buis", "citruspers",
"dienblad", "dobber", "dweil", "emmer", "garde", "gesp",
"gieter", "gum", "heggenschaar", "hengel", "hes", "kaars",
"kapstok", "keppel", "kist", "klapper", "klos", "knikker",
"knuffel", "kooi", "kous", "kraag", "kroon", "kruiwagen",
"kruk", "kurk", "kussen", "kwast", "lantaarn", "lessenaar",
"mijter", "onderzetter", "pak", "passer", "peddel", "pet",
"pruik", "puntenslijper", "rammelaar", "reddingsvest", "rietje",
"rits", "romper", "sambabal", "schort", "schroef", "servet",
"skelter", "slab", "slang", "slinger", "speen", "speldje",
"spijker", "spuit", "staf", "stamper", "stelt", "stofzuiger",
"stokpaard", "stolp", "tamboerijn", "tol", "tooi", "toverstaf",
"tuinbroek", "tulband", "vergiet", "veter", "vijl", "vijzel",
"waaier", "wafelijzer", "wip", "zaag", "zeis", "zwemvleugel"
), class = "factor"), Cognate = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Cognate",
"Non-cognate"), class = "factor"), TestingMoment = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Main2",
"Main4", "Post", "FollowUp"), class = "factor"), Score = c(0,
1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0.71, 1, 1, 0.86,
1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
1, 0.86, 1, 0, 0, 0, 0, 1, 0, 0.43, 1, 1, 0, 0, 0, 0, 1,
1, 0.86, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0.75, 0, 0, 0, 0.57,
0, 0, 0, 0.45, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0.8,
1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
0, 0)), .Names = c("Participant", "Word", "Cognate", "TestingMoment",
"Score"), row.names = c(97L, 98L, 99L, 100L, 101L, 102L, 103L,
104L, 105L, 109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L,
118L, 120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 130L,
133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L, 142L, 144L,
145L, 146L, 147L, 148L, 149L, 150L, 152L, 154L, 155L, 157L, 158L,
159L, 160L, 161L, 162L, 163L, 164L, 165L, 166L, 168L, 169L, 170L,
171L, 172L, 173L, 174L, 175L, 178L, 180L, 181L, 182L, 183L, 184L,
185L, 186L, 188L, 189L, 190L, 191L, 193L, 194L, 195L, 196L, 197L,
198L, 199L, 200L, 201L, 202L, 204L, 205L, 206L, 207L, 208L, 209L,
210L, 211L, 212L, 213L, 214L, 215L, 216L, 217L, 218L, 219L, 220L,
221L, 222L, 223L, 224L, 225L, 226L, 228L, 229L, 230L, 231L, 232L,
233L, 234L, 235L, 236L, 237L, 238L, 239L, 240L, 241L, 242L, 243L,
244L, 245L, 246L, 247L, 248L, 250L, 251L, 252L, 253L, 254L, 255L,
256L, 257L, 258L, 259L, 260L, 261L, 262L, 263L, 264L), class = "data.frame")
最好的方法是什么?
一个更简短的答案。
# Balance the two `Cognate` categories within every participant by randomly
# discarding surplus rows from whichever category has more observations.
# Reads the data frame `df` (columns `Participant` and `Cognate`, both
# factors) and leaves the balanced result in `df2`; `df` itself is untouched.
df2 <- df
for (participant in levels(df2$Participant)) {
  # Row counts per Cognate level for this participant, in level order.
  counts <- as.vector(table(df2$Cognate[df2$Participant == participant]))
  if (counts[1] == counts[2]) {
    next  # already balanced (or no rows at all for this factor level)
  }

  # Index (1 or 2) of the over-represented and under-represented level.
  larger <- if (counts[1] > counts[2]) 1 else 2
  smaller <- 3 - larger

  # Randomly choose which of the larger category's rows to discard:
  # positions within that category, as many as the size of the surplus.
  drop_pos <- sample(counts[larger], counts[larger] - counts[smaller])

  # Translate those within-category positions into row numbers of df2.
  in_larger <- df2$Participant == participant &
    df2$Cognate == levels(df2$Cognate)[larger]
  drop_rows <- which(in_larger)[drop_pos]
  df2 <- df2[-drop_rows, ]
}
# Overall counts per category after balancing.
table(df2$Cognate)
这是我的答案,写法比较笨拙,但它会遍历每位参与者,检查 Cognate 或 Non-cognate 哪个更多,然后删除多出的行直到二者数量相等(请记得先把数据赋值给 data 变量):
# Balance the number of "Cognate" and "Non-cognate" rows within each
# participant by dropping the earliest surplus rows of whichever category is
# over-represented. Reads the data frame `data` (columns `Participant` and
# `Cognate`) and stores the balanced rows of all participants in `final_data`
# (NULL when `data` has no participants).
#
# Participants whose counts are already equal are kept unchanged. The earlier
# version used `break` in that case, which ended the loop and silently dropped
# that participant and every participant after it.
balanced_parts <- list()
for (ptcp in unique(data$Participant)) {
  # Rows belonging to the current participant.
  new_data <- data[which(data$Participant == ptcp), ]

  # Positions of each category within this participant's rows.
  non_cognate_rows <- which(new_data$Cognate == "Non-cognate")
  cognate_rows <- which(new_data$Cognate == "Cognate")

  # Positive: more Non-cognate rows; negative: more Cognate rows; 0: balanced.
  surplus <- length(non_cognate_rows) - length(cognate_rows)

  if (surplus > 0) {
    # Drop the first `surplus` Non-cognate rows.
    new_data <- new_data[-non_cognate_rows[seq_len(surplus)], ]
  } else if (surplus < 0) {
    # Drop the first `-surplus` Cognate rows.
    new_data <- new_data[-cognate_rows[seq_len(-surplus)], ]
  }

  # Collect per-participant pieces; bind once after the loop instead of
  # growing a data frame with rbind() on every iteration.
  balanced_parts[[length(balanced_parts) + 1L]] <- new_data
}
# do.call(rbind, list()) yields NULL, so an empty input still gives NULL.
final_data <- do.call(rbind, balanced_parts)