r语言 - TBL和mutate dplyr:变量格式不正确



我目前正在使用dplyr::tbl与Azure中的SQL数据库进行交互。我喜欢延迟查询求值的能力,直到它需要,并在许多函数中使用它。

我有时需要做的一件事是修改tabl()将数据读取到R环境中的方式。然而,我刚刚注意到一些意想不到的行为,当使用突变来做到这一点。
# Load library    
library(dplyr)
library(DBI)
library(odbc)
library(purrr)

使用odbc连接数据库

odbc::dbConnect(odbc::odbc(),
Driver = "ODBC Driver 17 for SQL Server",
...)
> class(db_con)
[1] "Microsoft SQL Server"
attr(,"package")
[1] ".GlobalEnv"
> # Azure Database
> # Microsoft SQL Server Version: 12.00.2195

使用mtcar指定测试数据,将第一行替换为NAs。

> mtcars[1,] <- NA
> mtcars_mod <- as_tibble(mtcars)
> head(mtcars_mod)
# A tibble: 6 x 11
mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1  NA      NA    NA    NA NA    NA     NA      NA    NA    NA    NA
2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
4  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
5  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
6  18.1     6   225   105  2.76  3.46  20.2     1     0     3     1
> # See what will be mapped
> purrr::map(mtcars_mod, ~dbDataType(db_con, .))
$mpg
[1] "FLOAT"
$cyl
[1] "FLOAT"
$disp
[1] "FLOAT"
$hp
[1] "FLOAT"
$drat
[1] "FLOAT"
$wt
[1] "FLOAT"
$qsec
[1] "FLOAT"
$vs
[1] "FLOAT"
$am
[1] "FLOAT"
$gear
[1] "FLOAT"
$carb
[1] "FLOAT"
> 
> # write to database
> dbWriteTable(db_con, "mtcars_mod", mtcars_mod, overwrite = TRUE)

现在数据已经写入数据库,我想把它放回环境中。目前所有变量都是"float">

> # Returns the correct result using 
> dbReadTable(db_con, "mtcars_mod") %>% head()
mpg cyl disp  hp drat    wt  qsec vs am gear carb
1   NA  NA   NA  NA   NA    NA    NA NA NA   NA   NA
2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
> 
> # using dplyr::tbl - returns the correct result
> dplyr::tbl(db_con, "mtcars_mod") %>% collect()
# A tibble: 32 x 11
mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1  NA      NA   NA     NA NA    NA     NA      NA    NA    NA    NA
2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
# ... with 22 more rows

到目前为止,这一切都很好,但是如果我想在调用collect()之前更改数据读入环境的方式,该怎么办呢?让我们看看…

> # What if changing the format of a column?
> # This completely changes the expression of the variables
> # mpg is correctly aligned as chr, but for remaining variables 0s or other values
> # are returned for missing values and this changes on each call.
> 
> (call_1 <- tbl(db_con, "mtcars_mod") %>% 
+   mutate(mpg = as.character(mpg)) %>% 
+   collect())
# A tibble: 32 x 11
mpg         cyl      disp    hp   drat        wt  qsec        vs        am  gear      carb
<chr>     <dbl>     <dbl> <dbl>  <dbl>     <dbl> <dbl>     <dbl>     <dbl> <dbl>     <dbl>
1 NA    2.37e-307 2.73e-310   NaN NaN    -5.55e303 NaN   2.37e-307 8.49e-314     0 2.12e-314
2 21    6   e+  0 1.6 e+  2   110   3.9   2.88e  0  17.0 0         1   e+  0     4 4   e+  0
3 22.8  4   e+  0 1.08e+  2    93   3.85  2.32e  0  18.6 1   e+  0 1   e+  0     4 1   e+  0
4 21.4  6   e+  0 2.58e+  2   110   3.08  3.22e  0  19.4 1   e+  0 0             3 1   e+  0
5 18.7  8   e+  0 3.6 e+  2   175   3.15  3.44e  0  17.0 0         0             3 2   e+  0
6 18.1  6   e+  0 2.25e+  2   105   2.76  3.46e  0  20.2 1   e+  0 0             3 1   e+  0
7 14.3  8   e+  0 3.6 e+  2   245   3.21  3.57e  0  15.8 0         0             3 4   e+  0
8 24.4  4   e+  0 1.47e+  2    62   3.69  3.19e  0  20   1   e+  0 0             4 2   e+  0
9 22.8  4   e+  0 1.41e+  2    95   3.92  3.15e  0  22.9 1   e+  0 0             4 2   e+  0
10 19.2  6   e+  0 1.68e+  2   123   3.92  3.44e  0  18.3 1   e+  0 0             4 4   e+  0
# ... with 22 more rows
> 
> (call_2 <- tbl(db_con, "mtcars_mod") %>% 
+   mutate(mpg = as.character(mpg)) %>% 
+   collect())
# A tibble: 32 x 11
mpg         cyl      disp        hp  drat        wt  qsec        vs        am      gear      carb
<chr>     <dbl>     <dbl>     <dbl> <dbl>     <dbl> <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
1 NA    2.12e-314 2.37e-307 2.12e-314  0    2.12e-314 NaN   2.12e-314 2.37e-307 2.12e-314 8.49e-314
2 21    6   e+  0 1.6 e+  2 1.1 e+  2  3.9  2.88e+  0  17.0 0         1   e+  0 4   e+  0 4   e+  0
3 22.8  4   e+  0 1.08e+  2 9.3 e+  1  3.85 2.32e+  0  18.6 1   e+  0 1   e+  0 4   e+  0 1   e+  0
4 21.4  6   e+  0 2.58e+  2 1.1 e+  2  3.08 3.22e+  0  19.4 1   e+  0 0         3   e+  0 1   e+  0
5 18.7  8   e+  0 3.6 e+  2 1.75e+  2  3.15 3.44e+  0  17.0 0         0         3   e+  0 2   e+  0
6 18.1  6   e+  0 2.25e+  2 1.05e+  2  2.76 3.46e+  0  20.2 1   e+  0 0         3   e+  0 1   e+  0
7 14.3  8   e+  0 3.6 e+  2 2.45e+  2  3.21 3.57e+  0  15.8 0         0         3   e+  0 4   e+  0
8 24.4  4   e+  0 1.47e+  2 6.2 e+  1  3.69 3.19e+  0  20   1   e+  0 0         4   e+  0 2   e+  0
9 22.8  4   e+  0 1.41e+  2 9.5 e+  1  3.92 3.15e+  0  22.9 1   e+  0 0         4   e+  0 2   e+  0
10 19.2  6   e+  0 1.68e+  2 1.23e+  2  3.92 3.44e+  0  18.3 1   e+  0 0         4   e+  0 4   e+  0
# ... with 22 more rows
> 
> (call_3 <- tbl(db_con, "mtcars_mod") %>% 
+     mutate(mpg = as.character(mpg)) %>% 
+     collect())
# A tibble: 32 x 11
mpg     cyl  disp    hp   drat     wt  qsec    vs    am  gear  carb
<chr> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 NA        0  NaN    NaN NaN    NaN    NaN     NaN   NaN   NaN   NaN
2 21        6  160    110   3.9    2.88  17.0     0     1     4     4
3 22.8      4  108     93   3.85   2.32  18.6     1     1     4     1
4 21.4      6  258    110   3.08   3.22  19.4     1     0     3     1
5 18.7      8  360    175   3.15   3.44  17.0     0     0     3     2
6 18.1      6  225    105   2.76   3.46  20.2     1     0     3     1
7 14.3      8  360    245   3.21   3.57  15.8     0     0     3     4
8 24.4      4  147.    62   3.69   3.19  20       1     0     4     2
9 22.8      4  141.    95   3.92   3.15  22.9     1     0     4     2
10 19.2      6  168.   123   3.92   3.44  18.3     1     0     4     4
# ... with 22 more rows

可以看出,在不同的调用中,R环境中的变量表达式在每次调用时都会发生变化,即使我只修改了一列变量类型(从数字到字符)。

此外,当我使用show_query()生成的SQL查询并直接在MSSQL中运行时,问题不存在。这似乎是R和/或dplyr::tbl的直接问题。

> sessionInfo()
R version 4.0.3 (2020-10-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)
Matrix products: default
locale:
[1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252    LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
[5] LC_TIME=English_Australia.1252    
attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     
other attached packages:
[1] tibble_3.0.6  keyring_1.1.0 purrr_0.3.4   odbc_1.3.0    DBI_1.1.1     dplyr_1.0.7  
loaded via a namespace (and not attached):
[1] Rcpp_1.0.7          rstudioapi_0.13     magrittr_2.0.1      hms_1.0.0           tidyselect_1.1.0    bit_4.0.4           R6_2.5.0           
[8] rlang_0.4.10        fansi_0.4.2         blob_1.2.1          tools_4.0.3         utf8_1.1.4          cli_2.3.0           withr_2.4.1        
[15] dbplyr_2.1.1        remotes_2.4.0       ellipsis_0.3.2      bit64_4.0.5         assertthat_0.2.1    lifecycle_1.0.0     neon_0.1.20        
[22] crayon_1.4.0        BiocManager_1.30.16 vctrs_0.3.8         glue_1.4.2          compiler_4.0.3      pillar_1.6.2        generics_0.1.0     
[29] lubridate_1.7.10    renv_0.12.5         pkgconfig_2.0.3 

这是ODBC包而不是dplyr的问题。

从1.3.0更新到1.3.2修复了这个问题。请看这里:https://cran.r-project.org/web/packages/odbc/news/news.html

最新更新