将值减去另一列中最近的特定字符串

  • 本文关键字:最近 一列 字符串 r
  • 更新时间 :
  • 英文 :


假设我有一个数据df,如下所示。共有20行,列string中有四种类型的字符串:"A"、"B"、"C"one_answers"D"。

no  string  position
1   B   650
2   C   651
3   B   659
4   C   660
5   C   662
6   B   663
7   D   668
8   D   670
9   C   671
10  B   672
11  C   673
12  A   681
13  C   682
14  B   683
15  C   684
16  D   690
17  A   692
18  C   693
19  D   694
20  C   695

通过从前一行减去列position中的值,我可以通过执行以下命令获得第四列distance

df$distance <- ave(df$position, FUN=function(x) c(0, diff(x)))

这样我就可以得到从当前值到前一行的距离,如下所示:

no  string  position    distance
1   B   650 0
2   C   651 1
3   B   659 8
4   C   660 1
5   C   662 2
6   B   663 1
7   D   668 5
8   D   670 2
9   C   671 1
10  B   672 1
11  C   673 1
12  A   681 8
13  C   682 1
14  B   683 1
15  C   684 1
16  D   690 6
17  A   692 2
18  C   693 1
19  D   694 1
20  C   695 1

然而,我希望得到的是column position中每个字符串到最近的前一个字符串"C"的距离,例如下面7,8和17的变化:

no  string  position    distance
1   B   650 0
2   C   651 1
3   B   659 8
4   C   660 1
5   C   662 2
6   B   663 1
7   D   668 6
8   D   670 8
9   C   671 1
10  B   672 1
11  C   673 1
12  A   681 8
13  C   682 1
14  B   683 1
15  C   684 1
16  D   690 6
17  A   692 8
18  C   693 1
19  D   694 1
20  C   695 1

我该怎么做?顺便说一句,我能知道我该怎么做才能得到离string列中最近的下一个"C"的距离吗?

也许不是一个理想的解决方案,有一种方法可以简化它。

#Taken from your code
df$distance <- ave(df$position, FUN=function(x) c(0, diff(x)))
#logical values indicating occurrence of "C" 
c_occur = df$string == "C"
#We can ignore first two values in each group since, 
#First value is "C" and second value is correctly calculated from previous row
#Get the indices where we need to replace the values
inds_to_replace = which(ave(df$string, cumsum(c_occur), FUN = seq_along) > 2)
#Get the closest occurrence of "C" from the inds_to_replace
c_to_replace <- sapply(inds_to_replace, function(x) {
new_inds <- which(c_occur)
max(new_inds[(x - new_inds) > 0])
#To get distance from "nearest next "C" replace the above line with 
#new_inds[which.max(x - new_inds < 0)]
})
#Replace the values
df$distance[inds_to_replace] <- df$position[inds_to_replace] - 
df$position[c_to_replace]
df[inds_to_replace, ]
#   no string position distance
#7   7      D      668        6
#8   8      D      670        8
#17 17      A      692        8

下面的tidyverse方法再现您的预期输出。

问题描述:计算当前行的position前一行string = "C"的差;如果不存在先前的string = "C"行或者该行本身具有string = "C",则距离由当前行和先前行之间的position的差给出(与string无关(。

library(tidyverse)
df %>%
mutate(nC = cumsum(string == "C")) %>%
group_by(nC) %>%
mutate(dist = cumsum(c(0, diff(position)))) %>%
ungroup() %>%
mutate(dist = if_else(dist == 0, c(0, diff(position)), dist)) %>%
select(-nC)
## A tibble: 20 x 4
#      no string position  dist
#   <int> <fct>     <int> <dbl>
# 1     1 B           650    0.
# 2     2 C           651    1.
# 3     3 B           659    8.
# 4     4 C           660    1.
# 5     5 C           662    2.
# 6     6 B           663    1.
# 7     7 D           668    6.
# 8     8 D           670    8.
# 9     9 C           671    1.
#10    10 B           672    1.
#11    11 C           673    1.
#12    12 A           681    8.
#13    13 C           682    1.
#14    14 B           683    1.
#15    15 C           684    1.
#16    16 D           690    6.
#17    17 A           692    8.
#18    18 C           693    1.
#19    19 D           694    1.
#20    20 C           695    1.

样本数据

df <- read.table(text =
"no  string  position
1   B   650
2   C   651
3   B   659
4   C   660
5   C   662
6   B   663
7   D   668
8   D   670
9   C   671
10  B   672
11  C   673
12  A   681
13  C   682
14  B   683
15  C   684
16  D   690
17  A   692
18  C   693
19  D   694
20  C   695", header = T)

这里有一个data.table方式:

dtt[, distance := c(0, diff(position))]
dtt[cumsum(string == 'C') > 0,
distance := ifelse(seq_len(.N) == 1, distance, position - position[1]),
by = cumsum(string == 'C')]
#     no string position distance
#  1:  1      B      650        0
# 2:  2      C      651        1
# 3:  3      B      659        8
# 4:  4      C      660        1
# 5:  5      C      662        2
# 6:  6      B      663        1
# 7:  7      D      668        6
# 8:  8      D      670        8
# 9:  9      C      671        1
# 10: 10      B      672        1
# 11: 11      C      673        1
# 12: 12      A      681        8
# 13: 13      C      682        1
# 14: 14      B      683        1
# 15: 15      C      684        1
# 16: 16      D      690        6
# 17: 17      A      692        8
# 18: 18      C      693        1
# 19: 19      D      694        1
# 20: 20      C      695        1

这里是dtt:

structure(list(no = 1:20, string = c("B", "C", "B", "C", "C", 
"B", "D", "D", "C", "B", "C", "A", "C", "B", "C", "D", "A", "C", 
"D", "C"), position = c(650L, 651L, 659L, 660L, 662L, 663L, 668L, 
670L, 671L, 672L, 673L, 681L, 682L, 683L, 684L, 690L, 692L, 693L, 
694L, 695L)), row.names = c(NA, -20L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x1939260>)

如果您想为非C行获取到最近的下一个C的距离,请尝试以下操作:

dtt[, distance := c(0, diff(position))]
dtt[, g := rev(cumsum(rev(string == 'C')))]
dtt[g > 0, distance := ifelse(seq_len(.N) == .N, distance, abs(position - position[.N])), by = g]
dtt[, g := NULL]
#     no string position distance
#  1:  1      B      650        1
#  2:  2      C      651        1
#  3:  3      B      659        1
#  4:  4      C      660        1
#  5:  5      C      662        2
#  6:  6      B      663        8
#  7:  7      D      668        3
#  8:  8      D      670        1
#  9:  9      C      671        1
# 10: 10      B      672        1
# 11: 11      C      673        1
# 12: 12      A      681        1
# 13: 13      C      682        1
# 14: 14      B      683        1
# 15: 15      C      684        1
# 16: 16      D      690        3
# 17: 17      A      692        1
# 18: 18      C      693        1
# 19: 19      D      694        1
# 20: 20      C      695        1

最新更新