r - SQL 将许多 tsv 文件连接成数据库中的单个表,同时跟踪文件源 (MonetDBLite)



我正在使用MonetDBLite R包来创建MonetDB。我可以使用此处的说明使用以下代码创建数据库表:

library(DBI)
library(MonetDBLite)
# Write tsv file of mtcars
write.table(mtcars, "mtcars.tsv", row.names=FALSE, sep= "t")
# Initialize MonetDB
dbdir <- "/Users/admin/my_directory"
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
# Write table
dbWriteTable(con, "test4", "mtcars.tsv", delim="t")

以下查询给出

> dbGetQuery(con, "SELECT * FROM test4 LIMIT 3")
mpg cyl  disp  hp drat    wt  qsec vs am gear carb
1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
3  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1

目前为止,一切都好。但是,假设我有另一个具有不同 mpg 值的文件 mtcars2:

mtcars2 <- mtcars
mtcars2$mpg <- mtcars2$mpg + 5
write.table(mtcars2, "mtcars2.tsv", row.names= FALSE, sep = "t")

我可以将其加载到另一个表中:

dbWriteTable(con, "test5", "mtcars2.tsv", delim = "t")
> dbGetQuery(con, "SELECT * FROM test5 LIMIT 3")
mpg cyl disp  hp drat    wt  qsec vs am gear carb
1 26.0   6  160 110 3.90 2.620 16.46  0  1    4    4
2 26.0   6  160 110 3.90 2.875 17.02  0  1    4    4
3 27.8   4  108  93 3.85 2.320 18.61  1  1    4    1

也很好。但我的问题是:我想稍后查找所有 6cyl的汽车的mpg,并知道它来自哪个数据集(mtcars 或 mtcars2)。根据我对SQL索引的理解(这不是很多,基本上是我在这里读到的),我应该将所有数据放在一个表中,以便进行最有效的搜索。我尝试加载第一个 tsv 文件,然后使用ALTER TABLE test4 ADD dataset TEXTUPDATE test4 SET dataset = dataset1sql 命令添加了另一列-

dbSendQuery(con, "UPDATE test4 SET dataset = dataset1")
dbSendQuery(con, "UPDATE test4 SET dataset = 1")
> dbGetQuery(con, "SELECT * FROM test4 LIMIT 3")
mpg cyl disp  hp drat    wt  qsec vs am gear carb dataset
1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4       1
2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4       1
3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1       1

但是当我尝试将 mtcars2 附加到表中时,它的列数不同(正如我应该预料的那样,duh)。将许多具有相同列的 tsv 文件中的数据连接到单个表中,同时跟踪数据源的最佳方法是什么?

编辑 - 正如您可能已经猜到的那样,真正的数据不是 mtcars- 它是数百万行长的平面 tsv 文件,这意味着我想避免将整个文件读入内存并使用 R 进行操作。

按照xQbert的建议,我只使用SQL命令来解决(这是必要的,比bash命令更快,考虑到我的数据是10个文件,每个文件有数百万行长)。

library(DBI)
library(MonetDBLite)
# Write tsv file of mtcars
write.table(mtcars, "mtcars.tsv", row.names=FALSE, sep= "t")
# Write tsv of second mtcars
mtcars2 <- mtcars
mtcars2$mpg <- mtcars2$mpg + 5
write.table(mtcars2, "mtcars2.tsv", row.names= FALSE, sep = "t")
# Initialize MonetDB
dbdir <- "/Users/admin/"
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
# Write table
dbWriteTable(con, "test4", "mtcars.tsv", delim="t")
# Add data source information
dbSendQuery(con, "ALTER TABLE test4 ADD source TEXT")
dbSendQuery(con, "UPDATE test4 SET source = 'dataset1'")
# Write second dataset to a temporary table
dbWriteTable(con, "temptable", "mtcars2.tsv", delim="t")
# Add data source information
dbSendQuery(con, "ALTER TABLE temptable ADD source TEXT")
dbSendQuery(con, "UPDATE temptable SET source = 'dataset2'")
# Insert temp table into main table
dbSendQuery(con, "INSERT INTO test4 SELECT * FROM temptable")
# Drop temp table
dbSendQuery(con, "DROP TABLE temptable")
# Checking the data, truncated for clarity
> dbGetQuery(con, "SELECT * FROM test4")
mpg cyl  disp  hp drat    wt  qsec vs am gear carb   source
1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4 dataset1
2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4 dataset1
3  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1 dataset1
...
33 26.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4 dataset2
34 26.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4 dataset2
35 27.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1 dataset2
...
64 26.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2 dataset2

抱歉,如果我在问题中没有足够清楚地说明我的数据比 mtcars 大得多 - 如果您有中等大小的数据,data.tables包可能是比数据库更好的解决方案。

您应该能够在读取文件后执行dbWriteTable()在 data.frame 中创建一个新变量。像这样:

library(DBI)
library(MonetDBLite)
library(data.table)
# Write tsv file of mtcars
tmp <- tempfile()
write.table(mtcars, tmp, row.names=FALSE, sep= "t")
# Initialize MonetDB
dbdir <- "~/Desktop/temp"
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
test4df <- fread(tmp)
test4df$dataset <- 1
dbWriteTable(con, "test4", test4df)
dbReadTable(con, "test4")
test5df <- fread(tmp)
test5df$mpg <- test5df$mpg + 5
test5df$dataset <- 2
dbWriteTable(con, "test4", test5df, append = TRUE)
dbReadTable(con, "test4")

编辑(在不打开文件的情况下的建议)

如果您想在不一次打开文件的情况下工作,您可以执行类似操作来修改文件并附加另一个字段。正如我所写的,这将适用于带有bash的操作系统。

infile <- tmp
outfile <- tempfile()
# open connections
incon <- file(description = infile, open = "r")
outcon <- file(description = outfile, open = "w")
# count the number of lines (this will work only with Mac/Linux)
com <- paste("wc -l ", infile, " | awk '{ print $1 }'", sep="")
n <- system(command=com, intern=TRUE)
# work with the first line
txt <- scan(file = incon, what = character(), nlines=1, quiet=TRUE)
txt <- c(txt, "dataset")
cat(paste(txt, collapse = "t"), "n", file = outcon, sep = "")
# work with the rest of the file
for(i in 2:n) {
txt <- scan(file = incon, what = character(), nlines=1, quiet=TRUE)
txt <- c(txt, "1")
cat(paste(txt, collapse = "t"), "n", file = outcon, sep = "")
}
close(incon);close(outcon)
dbWriteTable(con, "test4", outfile, delim = "t")
# do the similar for other files

以下是我要做的,给定一组具有最终表中所需结构和文件名的文件,否则它是所有文件数据的组合:

# say we have those files
write.table(mtcars, "mtcars1.tsv", row.names=FALSE, sep= "t")
write.table(mtcars, "mtcars2.tsv", row.names=FALSE, sep= "t")
# write them individually, and add a column that contains the file name
dbWriteTable(con, "mtcars1", "mtcars1.tsv", delim="t")
dbSendQuery(con, "ALTER TABLE mtcars1 ADD COLUMN file STRING DEFAULT 'mtcars1.tsv';")
dbWriteTable(con, "mtcars2", "mtcars2.tsv", delim="t")
dbSendQuery(con, "ALTER TABLE mtcars2 ADD COLUMN file STRING DEFAULT 'mtcars2.tsv';")
# now combine into a new table
dbSendQuery(con, "CREATE TABLE mtcars_mat AS SELECT * FROM mtcars1 UNION ALL SELECT * FROM mtcars2")
# or a view if you don't need to modify the data in the mtcars table (faster)
dbSendQuery(con, "CREATE view mtcars AS SELECT * FROM mtcars1 UNION ALL SELECT * FROM mtcars2")

# and here is the same as a loop with a filename glob and some added robustness (handy if you have 1000 files)
files <- Sys.glob("/some/path/mtcars*.tsv")
tables <- dbQuoteIdentifier(con, tools::file_path_sans_ext(basename(files)))
dbBegin(con)
for (i in 1:length(files)) {
dbWriteTable(con, tables[i], files[i], delim="t", transaction=FALSE)
dbSendQuery(con, paste0("ALTER TABLE ", tables[i], " ADD COLUMN file STRING DEFAULT ",dbQuoteString(con, files[i]),";"))
}
dbSendQuery(con, paste0("CREATE TABLE somefinalresult AS ", paste0("SELECT * FROM ",tables, collapse=" UNION ALL ")))
# remove the parts again, optional
dbSendQuery(con, paste0("DROP TABLE ", tables, ";", collapse=" "))
dbCommit(con)

最新更新