我有一个数据框,记录了某位参与者在情景任务中的行为。假设每个情景(episode)以触发值 90 开始,并在出现一个取值落在 40 多范围内的特定触发值(例如下面数据中的 41、42、44)时结束。下面是我构造的一个示例数据框,其中一列是行号,另一列是实际的触发值。
# Example data: ex1 is the row counter, ex2 holds the recorded trigger values.
ex1 <- as.numeric(seq_len(20))
ex2 <- c(
  41, 1, 1, 90, 1, 1, 1, 44, 1, 90,
  1, 2, 42, 1, 1, 1, 1, 90, 1, 41
)
df <- data.frame(ex1 = ex1, ex2 = ex2)
> df
ex1 ex2
1 1 41
2 2 1
3 3 1
4 4 90
5 5 1
6 6 1
7 7 1
8 8 44
9 9 1
10 10 90
11 11 1
12 12 2
13 13 42
14 14 1
15 15 1
16 16 1
17 17 1
18 18 90
19 19 1
20 20 41
现在,我想删除所有位于情景开始与结束之间以外的行:这些行记录的是打字行为,由于它们发生在情景之外,我并不关心。因此,我希望最终得到这样的数据框:
# Desired result: only the rows that belong to an episode (start 90 up to the
# closing trigger), plus the closing trigger found on the very first row.
ex1 <- c(1, 4:8, 10:13, 18:20)
ex2 <- c(
  41,
  90, 1, 1, 1, 44,
  90, 1, 2, 42,
  90, 1, 41
)
df <- data.frame(ex1 = ex1, ex2 = ex2)
> df
ex1 ex2
1 1 41
2 4 90
3 5 1
4 6 1
5 7 1
6 8 44
7 10 90
8 11 1
9 12 2
10 13 42
11 18 90
12 19 1
13 20 41
我一直在尝试使用 subset,但无法让它在“一个数值范围”和“一个具体数值”之间正常工作。
提前感谢!
设置值:
# Input data for the answer: ex1 is the row counter, ex2 the trigger values.
ex1 <- as.double(1:20)
ex2 <- c(
  41, 1, 1, 90, 1,
  1, 1, 44, 1, 90,
  1, 2, 42, 1, 1,
  1, 1, 90, 1, 41
)
before <- data.frame(ex1 = ex1, ex2 = ex2)
before
ex1 ex2
1 1 41
2 2 1
3 3 1
4 4 90
5 5 1
6 6 1
7 7 1
8 8 44
9 9 1
10 10 90
11 11 1
12 12 2
13 13 42
14 14 1
15 15 1
16 16 1
17 17 1
18 18 90
19 19 1
20 20 41
我写了一个应该能完成这项工作的函数。该函数是根据我对你问题的理解编写的,因此可能无法完全符合你的实际情况。不过,我相信你只需稍微调整这个函数,就能满足自己的需求。
library(dplyr)
#' Keep only the rows that belong to an episode
#'
#' An episode opens on a row whose `ex2` equals `start` and closes on the first
#' later row whose `ex2` lies in the inclusive window `[end, end + 10]`.
#' Closing rows that appear before the first start are kept as well, because
#' they close an episode that began before the recording.
#'
#' Edge cases:
#' * no start trigger at all: only the closing rows are returned;
#' * a start with no closing trigger after it: the episode is kept through the
#'   last row of `data` instead of raising an error;
#' * overlapping episodes (two starts before one end): each row is returned
#'   once, not duplicated.
#'
#' Uses base R only.
#'
#' @param start Trigger value that opens an episode (default 90).
#' @param end Lower bound of the closing-trigger window (default 40); the
#'   window is `end` to `end + 10`, both inclusive.
#' @param data A data frame with columns `ex1` (row counter, used for the
#'   final ordering) and `ex2` (trigger values).
#' @return A data frame with the retained rows, ordered by `ex1`, with row
#'   names renumbered from 1.
episode <- function(start = 90, end = 40, data) {
  stopifnot(
    is.data.frame(data),
    all(c("ex1", "ex2") %in% names(data))
  )

  trigger <- data$ex2
  n_rows <- nrow(data)

  # which() returns ascending indices and silently drops NA triggers.
  end_idx <- which(trigger >= end & trigger <= end + 10)
  start_idx <- which(trigger == start)

  # Closing rows that precede the first start (all of them if no start exists).
  first_start <- if (length(start_idx) > 0L) start_idx[1L] else n_rows + 1L
  leading_end <- end_idx[end_idx < first_start]

  # For each start, take the rows up to the first closing trigger after it.
  episode_rows <- lapply(start_idx, function(s) {
    later_ends <- end_idx[end_idx > s]
    stop_at <- if (length(later_ends) > 0L) later_ends[1L] else n_rows
    seq.int(s, stop_at)
  })

  # unique() guards against duplicated rows from overlapping episodes.
  keep <- sort(unique(c(leading_end, unlist(episode_rows))))

  clean_df <- data[keep, , drop = FALSE]
  # Sort by ex1, then renumber the rows.
  clean_df <- clean_df[order(clean_df$ex1), , drop = FALSE]
  rownames(clean_df) <- NULL
  clean_df
}
使用 episode 函数生成 after 数据集。
# Apply episode() to the example data; 90 opens an episode, 40 is the lower
# bound of the closing-trigger window.
after <- episode(data = before, start = 90, end = 40)
after
ex1 ex2
1 1 41
2 4 90
3 5 1
4 6 1
5 7 1
6 8 44
7 10 90
8 11 1
9 12 2
10 13 42
11 18 90
12 19 1
13 20 41
以及使用 base R 的解法:
# Example data for the base R approach: ex1 is the row counter, ex2 the
# trigger values.
ex1 <- as.numeric(1:20)
ex2 <- c(
  41, 1, 1, 90,
  1, 1, 1, 44,
  1, 90, 1, 2, 42,
  1, 1, 1, 1, 90, 1, 41
)
df <- data.frame(ex1 = ex1, ex2 = ex2)
找出情景开始(触发值 90)的索引;如果第一个开始不在第 1 行,则把它之前的行视为不完整的情景并剔除:
# Positions of the start trigger (90); everything before the first one is
# discarded.
start_idx <- with(df, which(ex2 == 90))
df <- df[seq.int(from = start_idx[1], to = nrow(df)), ]
重新计算开始的索引,并找出结束的索引(>= 40 且 < 90):
# Recompute the positions on the trimmed data frame: starts are exactly 90,
# ends are any value from 40 up to (but not including) 90.
start_idx <- with(df, which(ex2 == 90))
end_idx <- with(df, which(ex2 < 90 & ex2 >= 40))
创建一个空列表,并通过循环取出每个“开始:结束”区间内的行:
# Collect one data frame per episode.
# Each start is paired with the first end trigger that follows it, not with
# the end at the same position in end_idx: positional pairing misaligns (and
# can produce reversed ranges) when an end trigger repeats between two starts.
# seq_along() keeps the loop from running when there are no starts.
df_lst <- list()
for (k in seq_along(start_idx)) {
  later_ends <- end_idx[end_idx > start_idx[k]]
  if (length(later_ends) == 0L) {
    # No closing trigger after this start: incomplete episode, skip it.
    next
  }
  df_lst[[length(df_lst) + 1L]] <- df[start_idx[k]:later_ends[1L], ]
}
把它们放在一起
# Stack the per-episode data frames into one (the original call was missing
# the comma between the function and the argument list).
df2 <- do.call(rbind, df_lst)
df2
ex1 ex2
4 4 90
5 5 1
6 6 1
7 7 1
8 8 44
10 10 90
11 11 1
12 12 2
13 13 42
18 18 90
19 19 1
20 20 41
相当紧凑。