删除R data.table中包围行的双引号



我有几个格式不正确的 CSV 文件:它们以制表符分隔,但每一行的首尾都被一对双引号包围。我可以用下面的代码读取它们,并在读取时忽略双引号(`"`):

library(data.table)
# Collect every CSV file in the working directory. `pattern` is a regular
# expression (not a shell glob), so the dot is escaped and the match anchored.
files <- list.files(pattern = "\\.csv$")
# The files are tab-delimited; quote = "" disables quote handling so the stray
# double quotes wrapping each row are read in as ordinary characters.
dt <- lapply(files, fread, sep = "\t", quote = "")
# Name each table after its file, dropping only the trailing ".csv" extension.
setattr(dt, "names", sub("\\.csv$", "", files))

但是,除了用单独的命令分别去除第一列和最后一列中的引号之外,是否有一种 R data.table 风格的方法来处理这些引号?

# Sample table: every value in the first column starts with a stray double
# quote and every value in the last column ends with one, mimicking rows that
# were wrapped in quotes before being split on tabs.
DT <- data.table(
  V1 = paste0('"', 1:5),
  V2 = c(1, 2, 5, 6, 8),
  V3 = c('a"', 'b"', 'c"', 'd"', 'e"')
)
# A list of tables, standing in for the list produced by reading several files.
dt <- list(DT, DT, DT)
# Base-R approach: gives the right result but does not use data.table syntax.
dt <- lapply(dt, function(tbl) {
  # Strip double quotes from the first column, then from the last column.
  for (k in c(1, ncol(tbl))) {
    tbl[[k]] <- gsub('"', '', tbl[[k]])
  }
  tbl
})
# magical mystery operation that doesn't work???
# NOTE(review): this is the failing attempt from the question, kept unchanged.
# It places `.SD` on the left-hand side of `:=` instead of the names of the
# columns to update, and passes the whole `.SD` to gsub() in a single call.
dt = lapply(dt, function(i){
i[, .SD := gsub('"', '', rep(.SD)), .SDcols=names(i)[c(1, ncol(i))]]
})

可以通过列索引或列名来选取目标列,再用 `:=` 赋值:

library(data.table)
lapply(dt, function(x) {
  # Names of the first and last columns, looked up by position.
  nm1 <- names(x)[c(1, length(x))]
  # .SD contains only the columns listed in .SDcols. lapply() runs gsub() over
  # each of them to remove the double quotes, and `(nm1) :=` assigns the
  # cleaned vectors back to those same columns.
  # The trailing [] makes the updated table print.
  x[, (nm1) := lapply(.SD, gsub, pattern = '"', replacement = ""),
    .SDcols = nm1][]
})

-输出

[[1]]
V1    V2     V3
<char> <num> <char>
1:      1     1      a
2:      2     2      b
3:      3     5      c
4:      4     6      d
5:      5     8      e
[[2]]
V1    V2     V3
<char> <num> <char>
1:      1     1      a
2:      2     2      b
3:      3     5      c
4:      4     6      d
5:      5     8      e
[[3]]
V1    V2     V3
<char> <num> <char>
1:      1     1      a
2:      2     2      b
3:      3     5      c
4:      4     6      d
5:      5     8      e

另一个选项是使用 `set`:

# set() updates each table in place, so the lapply() call is made only for its
# side effect; invisible() keeps the list of NULL return values from printing.
invisible(lapply(dt, function(x) {
  # Names of the first and last columns, looked up by position.
  nm1 <- names(x)[c(1, length(x))]
  # Overwrite each target column with its quote-free version (i = NULL means
  # all rows).
  for (j in nm1) {
    set(x, i = NULL, j = j, value = gsub('"', "", x[[j]]))
  }
}))

-输出

dt
[[1]]
V1    V2     V3
<char> <num> <char>
1:      1     1      a
2:      2     2      b
3:      3     5      c
4:      4     6      d
5:      5     8      e
[[2]]
V1    V2     V3
<char> <num> <char>
1:      1     1      a
2:      2     2      b
3:      3     5      c
4:      4     6      d
5:      5     8      e
[[3]]
V1    V2     V3
<char> <num> <char>
1:      1     1      a
2:      2     2      b
3:      3     5      c
4:      4     6      d
5:      5     8      e

最新更新