r语言 - 直接使用 dplyr 改变数据库表中的变量 - r - Mutate variables in database tables directly using dplyr 小贝子编程网

这是MonetDBLite数据库文件中的mtcars数据。

library(MonetDBLite)
library(tidyverse)
library(DBI)
dbdir <- getwd()
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
dbWriteTable(conn = con, name = "mtcars_1", value = mtcars)
data_mt <- con %>% tbl("mtcars_1")

我想使用 dplyr mutate 创建新变量并将其添加(提交！类似的东西

data_mt %>% select(mpg, cyl) %>% mutate(var = mpg/cyl) %>% dbCommit(con)

当我们这样做时，所需的输出应该是相同的：

dbSendQuery(con, "ALTER TABLE mtcars_1 ADD COLUMN var DOUBLE PRECISION")
dbSendQuery(con, "UPDATE mtcars_1 SET var=mpg/cyl")

怎么能做到呢？

这里有几个函数，create和update.tbl_lazy。

他们分别实现了CREATE TABLE，这很简单，而ALTER TABLE/UPDATE对则不那么简单：

创造

create <- function(data,name){
DBI::dbSendQuery(data$src$con,
paste("CREATE TABLE", name,"AS", dbplyr::sql_render(data)))
dplyr::tbl(data$src$con,name)
}

例：

library(dbplyr)
library(DBI)
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:")
copy_to(con, head(iris,3),"iris")
tbl(con,"iris") %>% mutate(Sepal.Area= Sepal.Length * Sepal.Width) %>% create("iris_2")
# # Source:   table<iris_2> [?? x 6]
# # Database: sqlite 3.22.0 []
#   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Area
#          <dbl>       <dbl>        <dbl>       <dbl> <chr>        <dbl>
# 1          5.1         3.5          1.4         0.2 setosa        17.8
# 2          4.9         3            1.4         0.2 setosa        14.7
# 3          4.7         3.2          1.3         0.2 setosa        15.0

更新

update.tbl_lazy <- function(.data,...,new_type="DOUBLE PRECISION"){
quos <- rlang::quos(...)
dots <- rlang::exprs_auto_name(quos, printer = tidy_text)
# extract key parameters from query
sql <- dbplyr::sql_render(.data)
con  <- .data$src$con
table_name <-gsub(".*?(FROM (`|")(.+?)(`|")).*","\3",sql)
if(grepl("nWHERE ",sql)) where <-  regmatches(sql, regexpr("WHERE .*",sql))
else where <- ""
new_cols <- setdiff(names(dots),colnames(.data))
# Add empty columns to base table
if(length(new_cols)){
alter_queries <- paste("ALTER TABLE",table_name,"ADD COLUMN",new_cols,new_type)
purrr::walk(alter_queries, ~{
rs <- DBI::dbSendStatement(con, .)
DBI::dbClearResult(rs)})}
# translate unevaluated dot arguments to SQL instructions as character
translations  <- purrr::map_chr(dots, ~ translate_sql(!!! .))
# messy hack to make translations work
translations <- gsub("OVER \(\)","",translations) 
# 2 possibilities: called group_by or (called filter or called nothing)
if(identical(.data$ops$name,"group_by")){
# ERROR if `filter` and `group_by` both used
if(where != "") stop("Using both `filter` and `group by` is not supported")
# Build aggregated table
gb_cols   <- paste0('"',.data$ops$dots,'"',collapse=", ")
gb_query0 <- paste(translations,"AS", names(dots),collapse=", ")
gb_query  <- paste("CREATE TABLE TEMP_GB_TABLE AS SELECT",
gb_cols,", ",gb_query0,
"FROM", table_name,"GROUP BY", gb_cols)
rs <- DBI::dbSendStatement(con, gb_query)
DBI::dbClearResult(rs)
# Delete temp table on exit
on.exit({
rs <- DBI::dbSendStatement(con,"DROP TABLE TEMP_GB_TABLE")
DBI::dbClearResult(rs)
})
# Build update query
gb_on <- paste0(table_name,'."',.data$ops$dots,'" = TEMP_GB_TABLE."', .data$ops$dots,'"',collapse=" AND ")
update_query0 <- paste0(names(dots)," = (SELECT ", names(dots), " FROM TEMP_GB_TABLE WHERE ",gb_on,")",
collapse=", ")
update_query <- paste("UPDATE", table_name, "SET", update_query0)
rs <- DBI::dbSendStatement(con, update_query)
DBI::dbClearResult(rs)
} else {
# Build update query in case of no group_by and optional where
update_query0 <- paste(names(dots),'=',translations,collapse=", ")
update_query  <- paste("UPDATE", table_name,"SET", update_query0,where)
rs <- DBI::dbSendStatement(con, update_query)
DBI::dbClearResult(rs)
}
tbl(con,table_name)
}

示例 1，定义 2 个新的数字列：

tbl(con,"iris") %>% update(x=pmax(Sepal.Length,Sepal.Width),
y=pmin(Sepal.Length,Sepal.Width))
# # Source:   table<iris> [?? x 7]
# # Database: sqlite 3.22.0 []
#   Sepal.Length Sepal.Width Petal.Length Petal.Width Species     x     y
#          <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl> <dbl>
# 1          5.1         3.5          1.4         0.2 setosa    5.1   3.5
# 2          4.9         3            1.4         0.2 setosa    4.9   3  
# 3          4.7         3.2          1.3         0.2 setosa    4.7   3.2

例2，修改现有列，创建2个不同类型的新列：

tbl(con,"iris") %>%
update(x= Sepal.Length*Sepal.Width,
z= 2*y,
a= Species %||% Species,               
new_type = c("DOUBLE","VARCHAR(255)"))
# # Source:   table<iris> [?? x 9]
# # Database: sqlite 3.22.0 []
#   Sepal.Length Sepal.Width Petal.Length Petal.Width Species     x     y     z a           
#          <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl> <dbl> <dbl> <chr>       
# 1          5.1         3.5          1.4         0.2 setosa   17.8   3.5   7   setosasetosa
# 2          4.9         3            1.4         0.2 setosa   14.7   3     6   setosasetosa
# 3          4.7         3.2          1.3         0.2 setosa   15.0   3.2   6.4 setosasetosa

示例 3，更新位置：

tbl(con,"iris") %>% filter(Sepal.Width > 3) %>% update(a="foo")
# # Source:   table<iris> [?? x 9]
# # Database: sqlite 3.22.0 []
#   Sepal.Length Sepal.Width Petal.Length Petal.Width Species     x     y     z a           
#          <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl> <dbl> <dbl> <chr>       
# 1          5.1         3.5          1.4         0.2 setosa   17.8   3.5   7   foo         
# 2          4.9         3            1.4         0.2 setosa   14.7   3     6   setosasetosa
# 3          4.7         3.2          1.3         0.2 setosa   15.0   3.2   6.4 foo

示例 4：按组更新

tbl(con,"iris") %>%
group_by(Species, Petal.Width) %>%
update(new_col1 = sum(Sepal.Width,na.rm=TRUE), # using a R function
new_col2 = MAX(Sepal.Length))           # using native SQL
# # Source:   SQL [?? x 11]
# # Database: sqlite 3.22.0 []
#   Sepal.Length Sepal.Width Petal.Length Petal.Width Species        x     y     z a            new_col1 new_col2
#          <dbl>       <dbl>        <dbl>       <dbl> <chr>      <dbl> <dbl> <dbl> <chr>           <dbl>    <dbl>
# 1          5.1         3.5          1.4         0.2 setosa         1     2   7   foo               6.5      5.1
# 2          4.9         3            1.4         0.2 setosa         1     2   6   setosasetosa      6.5      5.1
# 3          7           3.2          4.7         1.4 versicolor     1     2   6.4 foo               3.2      7

一般说明

代码使用 usedbplyr::translate_sql，因此我们可以使用 R 函数或本机函数，就像在旧的mutate调用中一样。
update只能在一次filter调用或一次group_by调用或每次调用零次后使用，其他任何内容，您都会得到错误或意外结果。
group_by实现非常笨拙，因此没有动态定义列或按操作分组的空间，坚持基础知识。
update和create都返回tbl(con, table_name)，这意味着您可以根据需要链接任意数量的create或update调用，中间有适当数量的group_by和filter。事实上，我所有的 4 个例子都可以链接。
要敲钉子，create没有受到相同的限制，您可以在调用之前获得尽可能多的dbplyr乐趣。
我没有实现类型检测，所以我需要new_type参数，它在代码中alter_queries定义的paste调用中回收，因此它可以是单个值或向量。

解决后者的一种方法是从translations变量中提取变量，在dbGetQuery(con,"PRAGMA table_info(iris)")中找到它们的类型。然后我们需要所有现有类型之间的强制规则，我们设置好了。但是由于不同的DBMS具有不同的类型，我想不出一种通用的方法，而且我不知道MonetDBLite。

r语言 - 直接使用 dplyr 改变数据库表中的变量

相关内容

最新更新

热门标签：