我想计算某个值在相邻试验之间的变化(trial-to-trial change),然后剔除那些超出一定区间的变化。我的数据附在问题末尾。
这是我尝试过的代码:
# Attempt to flag (and then drop) ID/trial groups whose baseline readings are
# all missing. The pipeline result is not assigned to anything.
# NOTE(review): this snippet does not run as written; see the notes below.
df_all_UK2 %>%
group_by(ID, trial) %>%
# Only baseline rows survive this step, so every later step sees baseline rows only.
filter(phase_bins == "baseline") %>%
# NOTE(review): `=` is not a comparison operator in R; `== 3` is presumably intended.
mutate(drop = case_when(sum(is.na(nosetip))= 3 ~ TRUE,
TRUE ~ FALSE)) %>%
# NOTE(review): select() chooses columns, not rows; filter(!drop) is presumably intended.
select(drop=="FALSE")
这段代码本应针对每个 `ID`,删除所有没有可用 `baseline` 值的 `trial`(不可用的试验是指在 `phase_bins == "baseline"` 中三个值全部为 `NA` 的试验),但我无法让它正常工作。
之后,我需要一个衡量 `nosetip` 相对于前一次测量的变化的差值(`diff`),并删除所有不在正确区间内的试验。
# The lagged difference below is only meaningful when the time points are
# sorted chronologically, so the factor levels are listed in that order.
# NOTE(review): the values are read from `df_all_UK` (no "2"); confirm that
# this is the intended source object.
df_all_UK2$time <- factor(
  df_all_UK$time,
  levels = c(
    "pre_60", "pre_30", "pre",
    "base1", "base2",
    "stim1", "stim2", "stim3", "stim4", "stim5", "stim6",
    "stim7", "stim8", "stim9", "stim10", "stim11", "stim12",
    "rec1", "rec2", "rec3", "rec4", "rec5", "rec6",
    "bre1", "bre2", "bre3", "bre4", "bre5"
  )
)

# Sort the rows first, then take the change from the previous row within each
# ID/trial group; the first row of every group gets NA.
df_all_UK2 <- df_all_UK2 %>%
  arrange(ID, cond_f, time) %>%
  group_by(ID, trial) %>%
  mutate(diff = nosetip - lag(nosetip, n = 1L, default = NA))
这一步可以正常运行,但接下来过滤掉过高或过低的值的那一步不起作用。
# Set nosetip to NA wherever the difference to the previous value is not
# between -0.5 and 0.5; all other values are kept unchanged.
# NOTE(review): the result of this pipeline is not assigned, so df_all_UK2
# itself is left unchanged after it runs.
# NOTE(review): the NA branches are logical while the fallback branch is
# nosetip (numeric in the sample data); dplyr releases before 1.1.0 reject that
# mix in case_when() and need NA_real_ instead. Confirm against the installed
# version.
df_all_UK2 %>%
mutate(nosetip = case_when(diff < -0.5 ~ NA,
diff > 0.5 ~ NA,
TRUE ~ nosetip))
如果有任何解决这个问题的思路,我将不胜感激!!
这是我的数据:
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("UK103", "UK104", "UK105", "UK106", "UK107",
"UK108", "UK110", "UK111", "UK112", "UK113", "UK114", "UK115",
"UK116", "UK117", "UK119", "UK122", "UK123", "UK126", "UK130",
"UK132", "UK135", "UK136", "UK138", "UK139", "UK140", "UK147",
"UK148", "UK150", "UK153", "UK155", "UK159", "UK160", "UK162",
"UK163", "UK164", "UK101", "UK102", "UK109", "UK118", "UK120",
"UK121", "UK124", "UK125", "UK127", "UK128", "UK129", "UK131",
"UK133", "UK134", "UK137", "UK141", "UK142", "UK143", "UK144",
"UK145", "UK146", "UK149", "UK151", "UK152", "UK154", "UK156",
"UK157", "UK158", "UK161", "UK166", "UK167", "UK168", "UK169",
"UKA102", "UKA103", "UKA104", "UKA105", "UKA106", "UKA107", "UKA108",
"UKA109", "UKA110", "UKA111", "UKA112", "UKA113", "UKA114", "UKA115",
"UKA116", "UKA117", "UKA119", "UKA120", "UKA121", "UKA122", "UKA101",
"UKA118"), class = "factor"), cond_f = structure(c(3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("artificial", "babble",
"cry", "laugh"), class = "factor"), trial = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2", "3", "4"
), class = "factor"), time = structure(3:12, .Label = c("pre_60",
"pre_30", "pre", "base1", "base2", "stim1", "stim2", "stim3",
"stim4", "stim5", "stim6", "stim7", "stim8", "stim9", "stim10",
"stim11", "stim12", "rec1", "rec2", "rec3", "rec4", "rec5", "rec6",
"bre1", "bre2", "bre3", "bre4", "bre5"), class = "factor"), nosetip = c(29.4,
29.1, 29.6, 29, 29.1, 29.2, 29.3, NA, NA, NA), phase_bins = structure(c(2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L), .Label = c("pre", "baseline",
"stim_bin1", "stim_bin2", "recovery", "break"), class = "factor")), row.names = c(NA,
-10L), groups = structure(list(ID = structure(1L, .Label = c("UK103",
"UK104", "UK105", "UK106", "UK107", "UK108", "UK110", "UK111",
"UK112", "UK113", "UK114", "UK115", "UK116", "UK117", "UK119",
"UK122", "UK123", "UK126", "UK130", "UK132", "UK135", "UK136",
"UK138", "UK139", "UK140", "UK147", "UK148", "UK150", "UK153",
"UK155", "UK159", "UK160", "UK162", "UK163", "UK164", "UK101",
"UK102", "UK109", "UK118", "UK120", "UK121", "UK124", "UK125",
"UK127", "UK128", "UK129", "UK131", "UK133", "UK134", "UK137",
"UK141", "UK142", "UK143", "UK144", "UK145", "UK146", "UK149",
"UK151", "UK152", "UK154", "UK156", "UK157", "UK158", "UK161",
"UK166", "UK167", "UK168", "UK169", "UKA102", "UKA103", "UKA104",
"UKA105", "UKA106", "UKA107", "UKA108", "UKA109", "UKA110", "UKA111",
"UKA112", "UKA113", "UKA114", "UKA115", "UKA116", "UKA117", "UKA119",
"UKA120", "UKA121", "UKA122", "UKA101", "UKA118"), class = "factor"),
trial = structure(2L, .Label = c("1", "2", "3", "4"), class = "factor"),
.rows = structure(list(1:10), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
我正好有一点时间,所以想给你一个完整的答案。先说明一下:我相信对于"三个基线值全部为 NA"的情况,一定有更简洁、更简单的过滤方法,不过下面是我自己的做法。
为了确认我筛选的结果是正确的,我需要一个比你分享的更大一些的数据集,所以我自己构造了一个,叫做 `dat`。我的做法是:先创建第二个名为 `dat_drop` 的数据框,其中只包含 `phase_bins == "baseline"` 的行。然后我统计每组的缺失值个数,只保留那些有三个 `NA` 的 ID 和试验。
接下来,我通过过滤 `dat` 得到 `dat2`:删除 ID 出现在 `dat_drop$ID` 列中、并且试验出现在 `dat_drop$trial` 中的所有行。因此,它应该只保留那些没有缺失全部三个基线值的 ID 和试验。不过,取决于你完整的数据框,我能想到这种做法可能会失效(例如,对于某些 ID 和试验,你只有两行基线数据,并且两行都是 `NA`)。
之后,我使用你的代码为 `dat3` 创建 `diff` 列。我还在 `dat3` 之后附上了筛选步骤,在那里我用 `filter()` 以 0.5 为阈值对 `diff` 进行筛选。我知道这个回答来得有些晚,它可能也不是最高效或最稳健的,但希望对你有帮助。或者,它也许能促使其他人给出更好的解决方案!
library(dplyr)
# Reproducible toy data: four participants, one condition, one trial and ten
# time points each. The first three time points of every participant form the
# baseline phase, the next six are stim_bin1 and the last one is stim_bin2.
set.seed(123)
dat <- data.frame(
  ID = rep(c("UK103", "UK104", "UK105", "UK106"), each = 10),
  cond_f = rep("cry", times = 40),
  trial = rep(2, times = 40),
  time = rep(
    c("pre", "base1", "base2", "stim1", "stim2",
      "stim3", "stim4", "stim5", "stim6", "stim7"),
    times = 4
  ),
  # The random draws are made left to right, so their order matters for
  # reproducibility under the seed set above.
  nosetip = c(
    rnorm(7, mean = 29), rep(NA, 3),
    rnorm(10, mean = 29), rep(NA, 3),
    rnorm(7, mean = 29), NA,
    rnorm(8, mean = 29), NA
  ),
  phase_bins = rep(
    rep(c("baseline", "stim_bin1", "stim_bin2"), times = c(3, 6, 1)),
    times = 4
  )
)
# Collect the ID/trial combinations that have no usable baseline reading.
# A trial counts as unusable when every one of its baseline rows is NA. The NA
# count is compared with the number of baseline rows rather than a fixed 3, so
# trials that only have two baseline rows (both NA) are caught as well.
# Result columns: ID, trial, n (number of NA baseline readings) and
# n_baseline (number of baseline rows).
dat_drop <- dat %>%
  filter(phase_bins == "baseline") %>%
  group_by(ID, trial) %>%
  summarise(
    n = sum(is.na(nosetip)),
    n_baseline = dplyr::n(),
    .groups = "drop"
  ) %>%
  filter(n == n_baseline)
# Remove every row of dat that belongs to an ID/trial pair listed in dat_drop.
# anti_join() matches the two key columns pair by pair. An element-wise test
# such as `ID == dat_drop$ID & trial == dat_drop$trial` would recycle
# dat_drop's columns along dat's rows, which is only correct when dat_drop has
# exactly one row and fails outright when it has none.
dat2 <- dat %>%
  anti_join(dat_drop, by = c("ID", "trial"))
# Chronological order of the time-point labels. `time` is a plain character
# column here, so sorting on it directly would be alphabetical ("base1" before
# "pre", "stim10" before "stim2") and the lagged difference would be taken
# between the wrong rows.
time_order <- c(
  "pre_60", "pre_30", "pre",
  "base1", "base2",
  "stim1", "stim2", "stim3", "stim4", "stim5", "stim6",
  "stim7", "stim8", "stim9", "stim10", "stim11", "stim12",
  "rec1", "rec2", "rec3", "rec4", "rec5", "rec6",
  "bre1", "bre2", "bre3", "bre4", "bre5"
)

# Sort chronologically (labels missing from time_order sort last), then
# compute the change in nosetip from the previous time point within each
# ID/trial group. The first row of every group gets NA. The `time` column
# itself is left unchanged.
dat3 <- dat2 %>%
  arrange(ID, cond_f, match(time, time_order)) %>%
  group_by(ID, trial) %>%
  mutate(diff = nosetip - lag(nosetip, n = 1L)) %>%
  ungroup()
# Keep only the rows whose change from the previous time point lies within
# [-0.5, 0.5]. Both bounds are needed: `diff < 0.5 | diff > 0.5` is TRUE for
# every value except exactly 0.5 and therefore removes nothing meaningful.
# Rows with an NA diff (for example the first row of each trial) are dropped
# too, because filter() discards rows where the condition evaluates to NA.
dat3 %>%
  filter(diff >= -0.5 & diff <= 0.5)