我有一个文本文件,其中的数据按案例(case)逐个堆叠在同一列中。我需要从每个案例中提取选定的行,并分别存入向量。我知道如何用循环逐行解析,但想知道在R中能否不使用循环来完成。
下面是一个演示数据帧:
# Demo data: three cases stacked in a single column `V1`, six lines per case
# (one "caseNN" header line followed by five "key: value" lines).
# stringsAsFactors = FALSE keeps V1 a character vector (not a factor) on
# R < 4.0 as well; on R >= 4.0 this is already the default.
demodat <- data.frame(
  V1 = c(
    "case01",
    "sid: 112905",
    "form3: 2",
    "form2: 0",
    "form1: An interesting comment",
    "form0: 8",
    "case02",
    "sid: 132788",
    "form3: 1",
    "form2: 1",
    "form1: Not sure about this",
    "form0: 17",
    "case03",
    "sid: 102296",
    "form3: 1",
    "form2: 0",
    "form1: This is obvious",
    "form0: 12"
  ),
  stringsAsFactors = FALSE
)
下面是我用来将case, form0和form1提取为向量的循环示例:
library(tidyverse)
datlines <- 6 # Number of rows per case

# Preallocate one character slot per case instead of growing the vectors
# from a logical NA inside the loop. ceiling() also covers a trailing,
# incomplete case.
n_cases <- ceiling(nrow(demodat) / datlines)
case <- rep(NA_character_, n_cases)
form0 <- rep(NA_character_, n_cases)
form1 <- rep(NA_character_, n_cases)

j <- 1 # Index of the case currently being filled
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:nrow(demodat) would iterate over c(1, 0).
for (i in seq_len(nrow(demodat))) {
  row_text <- demodat[i, 1]

  # Header line of a case, e.g. "case01": store it as-is.
  if (str_sub(row_text, 1, 4) == "case") {
    case[j] <- row_text
  }

  # "form0: ..." / "form1: ..." lines: store the value without its key.
  if (str_sub(row_text, 1, 6) == "form0:") {
    form0[j] <- str_replace(row_text, "form0: ", "")
  }
  if (str_sub(row_text, 1, 6) == "form1:") {
    form1[j] <- str_replace(row_text, "form1: ", "")
  }

  # After every `datlines` rows a case is complete: move to the next slot.
  if (i %% datlines == 0) {
    j <- j + 1
  }
}

case
form0
form1
这种方法是有效的,但是实际的数据框有数万行,而且我需要从每个案例中提取许多向量。我希望找到一种更高效的方法,避免循环遍历数据框的每一行。
我将非常感谢你的建议。
下面是一个使用 `scan` 和 `grep` 的简单 base R 方法。
# Tokenise every line on ":" so that each "key: value" line yields a key
# token immediately followed by its value token, while "caseNN" header
# lines yield a single token.
#   * `text =` makes scan() open and close its own text connection, so no
#     connection is left open.
#   * `quote = ""` stops apostrophes or double quotes inside free-text
#     comments from being treated as quoting characters.
# NOTE: a value that itself contains ":" is split into several tokens and
# only its first piece is picked up below.
s <- scan(text = demodat$V1, what = character(), sep = ":", quote = "")
s <- trimws(s) # values keep the blank that followed the ":" separator

# Anchor the patterns so that comment text which merely contains "case" or
# "form0"/"form1" is not mistaken for a header or a key.
case <- grep("^case", s, value = TRUE)
# The value of a key is the token right after it, hence the + 1L.
form0 <- s[grep("^form0$", s) + 1L]
form1 <- s[grep("^form1$", s) + 1L]
rm(s)
case
#> [1] "case01" "case02" "case03"
form0
#> [1] "8" "17" "12"
form1
#> [1] "An interesting comment" "Not sure about this" "This is obvious"
在2022-05-06由reprex包(v2.0.1)创建
library(dplyr)
library(tidyr)
# Split every "key: value" line into a `parameter` and a `value` column.
# The data already holds one term per row, so no row-splitting on "," is
# done (that would break free-text comments containing commas).
demodat %>%
  as_tibble() %>%
  separate(
    V1,
    into = c("parameter", "value"),
    sep = ":",
    extra = "merge", # keep any further ":" inside the value
    fill = "right"   # "caseNN" header rows have no value: NA, no warning
  )
## The result can now be filtered on `parameter`,
## e.g. parameter == "sid" or grepl("^case", parameter)
输出:
## # A tibble: 18 x 2
## parameter value
## <chr> <chr>
## 1 case01 NA
## 2 sid " 112905"
## 3 form3 " 2"
## 4 form2 " 0"
## 5 form1 " An interesting comment"
## 6 form0 " 8"
## 7 case02 NA
## ...
编辑:要跟踪 case ID,请在上述管道的末尾添加以下内容:
## ... %>%
mutate(
  # Header rows ("case01", ...) carry the case ID; every other row starts
  # as NA. The backreference must be written "\\1": in an R string "\1" is
  # an octal escape (character code 1), not a reference to the first group.
  case_id = ifelse(
    grepl("^case", parameter),
    sub("^case(.*)$", "\\1", parameter),
    NA_character_
  )
) %>%
  # Carry each case ID down onto the rows that belong to that case.
  fill(case_id, .direction = "down")
另一种 base R 方法:
# Rewrite each "caseNN" header line as a "case*: NN" field so that every
# line has the "key: value" form read.dcf() expects. The pattern is
# anchored so that "cas"/"case" inside comment text is left untouched.
con <- textConnection(sub("^case", "case*: ", demodat$V1))
demo2 <- read.dcf(con, all = TRUE)
close(con) # read.dcf() does not close a connection it was handed already open
> demo2
case* sid form3 form2
1 01, 02, 03 112905, 132788, 102296 2, 1, 1 0, 1, 0
form1 form0
1 An interesting comment, Not sure about this, This is obvious 8, 17, 12
> class(demo2)
[1] "data.frame"