我有这样的数据:
# Example transcript: 14 rows, each with a zero-padded line id and the raw
# utterance annotation recorded on that line.
df <- structure(
  list(
    line = sprintf("%03d", 1:14),
    utterance = c(
      rep("((m: both hands", 2),
      rep("((i: DH=1, SZ=0", 2),
      "((s: Preface))",
      "((m: both hands",
      rep("((m: both hands clasped", 2),
      "((s: Background))",
      rep("((m: enumerating", 2),
      "((s: End))",
      "((i: DH=1, SZ=0",
      "((m: relax gesture))"
    )
  ),
  row.names = c(NA, 14L),
  class = "data.frame"
)
我想创建一个新列 `story`,并用 `utterance` 列中与正则表达式 `\(\(s` 匹配的值向下填充(`fill`)该列。但我希望填充在最后一个匹配该模式的值处停止,即 `((s: End))`。
下面这个 `fill` 命令不会在该模式处停止——如何让 `fill` 在该模式处停止?
library(dplyr)  # provides mutate() and the %>% pipe
library(tidyr)  # provides fill()

# Keep the utterance only on rows that contain a story marker "((s", then
# carry each marker downwards over the following NA rows.
# "(" is a regex metacharacter, so it is escaped; inside an R string literal
# the backslash itself must be doubled ("\\(").
# Note: fill() keeps filling to the last row, i.e. past "((s: End))".
df %>%
  mutate(story = ifelse(grepl("\\(\\(s", utterance), utterance, NA)) %>%
  fill(story, .direction = "down")
line utterance story
1 001 ((m: both hands <NA>
2 002 ((m: both hands <NA>
3 003 ((i: DH=1, SZ=0 <NA>
4 004 ((i: DH=1, SZ=0 <NA>
5 005 ((s: Preface)) ((s: Preface))
6 006 ((m: both hands ((s: Preface))
7 007 ((m: both hands clasped ((s: Preface))
8 008 ((m: both hands clasped ((s: Preface))
9 009 ((s: Background)) ((s: Background))
10 010 ((m: enumerating ((s: Background))
11 011 ((m: enumerating ((s: Background))
12 012 ((s: End)) ((s: End))
13 013 ((i: DH=1, SZ=0 ((s: End))
14 014 ((m: relax gesture)) ((s: End))
期望:
line utterance story
1 001 ((m: both hands <NA>
2 002 ((m: both hands <NA>
3 003 ((i: DH=1, SZ=0 <NA>
4 004 ((i: DH=1, SZ=0 <NA>
5 005 ((s: Preface)) ((s: Preface))
6 006 ((m: both hands ((s: Preface))
7 007 ((m: both hands clasped ((s: Preface))
8 008 ((m: both hands clasped ((s: Preface))
9 009 ((s: Background)) ((s: Background))
10 010 ((m: enumerating ((s: Background))
11 011 ((m: enumerating ((s: Background))
12 012 ((s: End)) ((s: End))
13 013 ((i: DH=1, SZ=0 <NA>
14 014 ((m: relax gesture)) <NA>
`tidyr::fill` 本身做不到这一点,但您可以再添加一个 `mutate`:
library(dplyr)  # mutate(), if_else(), last(), %>%
library(tidyr)  # fill()

df %>%
  # Keep the utterance only on story-marker rows; everything else becomes NA.
  # "(" must be escaped for the regex, and the backslash doubled for the
  # R string literal.
  mutate(
    story = if_else(grepl("\\(\\(s", utterance), utterance, NA_character_)
  ) %>%
  # Carry every marker downwards, including the last one.
  fill(story, .direction = "down") %>%
  # Undo the fill for the last marker: rows equal to the final story value are
  # reset to NA unless they are its first occurrence.
  mutate(
    story = if_else(
      story == last(story) & duplicated(story),
      NA_character_,
      story
    )
  )
# line utterance story
# 1 001 ((m: both hands <NA>
# 2 002 ((m: both hands <NA>
# 3 003 ((i: DH=1, SZ=0 <NA>
# 4 004 ((i: DH=1, SZ=0 <NA>
# 5 005 ((s: Preface)) ((s: Preface))
# 6 006 ((m: both hands ((s: Preface))
# 7 007 ((m: both hands clasped ((s: Preface))
# 8 008 ((m: both hands clasped ((s: Preface))
# 9 009 ((s: Background)) ((s: Background))
# 10 010 ((m: enumerating ((s: Background))
# 11 011 ((m: enumerating ((s: Background))
# 12 012 ((s: End)) ((s: End))
# 13 013 ((i: DH=1, SZ=0 <NA>
# 14 014 ((m: relax gesture)) <NA>
这会找出最后一个 `story` 值,并把它除第一次出现以外的所有重复项置为 `NA`。该做法假设行的顺序是有意义的,并且不要求最后一个 `story` 必须包含字面量 `s: End`;如有需要,您可以相应地调整逻辑。
仅供参考,我把 `ifelse` 换成了 `if_else`,因为后者是类型安全的(`base::ifelse` 不是)。它要求明确指定使用哪一种 `NA`(`NA` 有六种以上的变体)。
我们也可以使用 `zoo` 包中的 `na.locf0`:
library(dplyr)
library(zoo)

df %>%
  mutate(
    # Keep the utterance only on story-marker rows; everything else is NA.
    # "(" must be escaped for the regex, and the backslash doubled for the
    # R string literal.
    story = ifelse(grepl("\\(\\(s", utterance), utterance, NA),
    # Row position of the first "((s: End))" marker.
    ind = match("((s: End))", story),
    # Carry markers forward only within rows 1..ind; rows after the End
    # marker keep their NA.
    story = replace(
      story,
      seq_len(first(ind)),
      zoo::na.locf0(story[seq_len(first(ind))])
    ),
    # Drop the helper column again.
    ind = NULL
  )
输出:
line utterance story
1 001 ((m: both hands <NA>
2 002 ((m: both hands <NA>
3 003 ((i: DH=1, SZ=0 <NA>
4 004 ((i: DH=1, SZ=0 <NA>
5 005 ((s: Preface)) ((s: Preface))
6 006 ((m: both hands ((s: Preface))
7 007 ((m: both hands clasped ((s: Preface))
8 008 ((m: both hands clasped ((s: Preface))
9 009 ((s: Background)) ((s: Background))
10 010 ((m: enumerating ((s: Background))
11 011 ((m: enumerating ((s: Background))
12 012 ((s: End)) ((s: End))
13 013 ((i: DH=1, SZ=0 <NA>
14 014 ((m: relax gesture)) <NA>