From 7f6197b1e388ed0376c635553d5a31e254aec7a4 Mon Sep 17 00:00:00 2001 From: Dongdong Kong Date: Fri, 15 Dec 2023 12:36:07 +0800 Subject: [PATCH] add duckdb --- .Rbuildignore | 1 - DESCRIPTION | 3 ++ NAMESPACE | 7 ++++ R/R6_dbase.R | 57 +++++++++++++++++++++++++++++++ R/edit_db_config.R | 5 +++ R/tool_dbase.R | 4 ++- scripts/database/test-db_perm.Rmd | 14 ++++++-- scripts/speed_duckdb.R | 8 +++++ scripts/test1_duckdb.qmd | 38 +++++++++++++++++++++ vignettes/database_config.Rmd | 2 -- vignettes/database_tidydata.Rmd | 26 ++++++++++++++ 11 files changed, 159 insertions(+), 6 deletions(-) create mode 100644 R/R6_dbase.R create mode 100644 R/edit_db_config.R create mode 100644 scripts/speed_duckdb.R create mode 100644 scripts/test1_duckdb.qmd create mode 100644 vignettes/database_tidydata.Rmd diff --git a/.Rbuildignore b/.Rbuildignore index 93219cd..7e9f3fb 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,5 +7,4 @@ ^INPUT$ ^Figures$ ^OUTPUT$ -^inst$ ^tidydb2\.Rproj$ diff --git a/DESCRIPTION b/DESCRIPTION index eed8fd4..e349868 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,14 +8,17 @@ Description: What the package does (one paragraph). 
License: MIT + file LICENSE Imports: methods, + R6, magrittr, purrr, dbplyr, dplyr, DBI, + duckdb, RMySQL, crayon Suggests: knitr, RMariaDB, + usethis, rmarkdown, testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 8367521..39cdcea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,15 +3,22 @@ export(dbRemoveTables_like) export(db_append) export(db_info) +export(dbase) +export(edit_db_config) export(get_dbInfo) export(import_table_large) export(open_mariadb) export(open_mysql) export(tbl_copy) import(DBI) +import(R6) import(crayon) import(dplyr) import(magrittr) +importFrom(duckdb,duckdb) +importFrom(duckdb,duckdb_shutdown) importFrom(methods,new) importFrom(purrr,`%||%`) importFrom(utils,str) +importMethodsFrom(DBI,dbSendQuery) +importMethodsFrom(RMySQL,dbSendQuery)
diff --git a/R/R6_dbase.R b/R/R6_dbase.R
new file mode 100644
index 0000000..dd389fc
--- /dev/null
+++ b/R/R6_dbase.R
@@ -0,0 +1,57 @@
+#' R6 handle on a read-only DuckDB database and one of its tables
+#'
+#' `dbase$new(db, table, type)` connects to `db` read-only and stores the DBI
+#' connection in `$con` and the lazy `tbl()` of `table` in `$tbl`. Defaults
+#' for `db` and `table` come from the first entry of `get_dbInfo("duckdb")`.
+#' @import R6
+#' @importFrom duckdb duckdb duckdb_shutdown
+#' @export
+dbase <- R6Class("duckdb_base", list(
+  db = NULL,
+  table = NULL,
+  type = "duckdb",
+  con = NULL,
+  tbl = NULL,
+  initialize = function(db = NULL, table = NULL, type = c("duckdb", "sqlite")) {
+    dbinfo <- get_dbInfo("duckdb")[[1]] # the first configured entry is the default
+    if (is.null(db)) db <- dbinfo$db
+    self$db <- db
+    # TODO: the type could be guessed from the file extension
+    self$type <- match.arg(type)
+    # NOTE(review): "sqlite" is opened with the duckdb driver too, as before.
+    self$con <- dbConnect(duckdb(), dbdir = self$db, read_only = TRUE)
+    # No explicit table: first configured table, else first table in the db.
+    if (is.null(table)) table <- dbinfo$table[1] %||% DBI::dbListTables(self$con)[1]
+    self$table <- table
+    self$tbl <- tbl(self$con, self$table)
+  },
+  print = function(...) {
+    cat(sprintf("db : %s\n", self$db))
+    cat(sprintf("table: %s\n", self$table))
+    print(self$tbl)
+  },
+  finalize = function() {
+    message("close datadb ...")
+    self$close(force = TRUE)
+  },
+  # Disconnect `$con` and shut its database down; a no-op once it is closed.
+  # `force` is accepted for backward compatibility: the finalizer passes
+  # `force = TRUE` and must release this object's own connection as well.
+  close = function(force = FALSE) {
+    if (!is.null(self$con) && DBI::dbIsValid(self$con)) {
+      DBI::dbDisconnect(self$con, shutdown = TRUE)
+    }
+  },
+  # Collect the rows of `$tbl` whose `site` column equals `site_id`; prints
+  # the elapsed time when `verbose` is TRUE.
+  read_data = function(site_id = 50349L, verbose = TRUE) {
+    suppressWarnings({
+      t <- system.time({
+        d <- self$tbl |> filter(site == site_id) |> collect()
+      })
+      if (verbose) print(t)
+      d
+    })
+  }
+  ## TODO: add an option to write tables
+))
diff --git a/R/edit_db_config.R b/R/edit_db_config.R
new file mode 100644
index 0000000..52ad3d7
--- /dev/null
+++ b/R/edit_db_config.R
@@ -0,0 +1,5 @@
+#' Edit the user-level database config `~/.db.yml`
+#' @export
+edit_db_config <- function() {
+  usethis::edit_file(normalizePath("~/.db.yml", mustWork = FALSE)) # may not exist yet
+}
diff --git a/R/tool_dbase.R b/R/tool_dbase.R index c86315e..1677b20 100644 --- a/R/tool_dbase.R +++ b/R/tool_dbase.R @@ -10,7 +10,10 @@ # library(tidydb) # pak::pkg_install(c("rpkgs/tidydb", "rpkgs/tidymet")) #' @import DBI dplyr crayon +#' @importMethodsFrom DBI dbSendQuery +#' @importMethodsFrom RMySQL dbSendQuery setMethod("dbSendQuery", c("MySQLConnection", "character"), + # import S4 method from RMySQL function(conn, statement, ...) 
{ RMySQL:::checkValid(conn) @@ -116,7 +119,6 @@ db_append <- function(con, tbl, values) { dbWriteTable(con, tbl, values, append = TRUE) } - #' @export dbRemoveTables_like <- function(con, pattern="dbplyr", del=TRUE) { tbls_bad = dbListTables(con) %>% .[grep(pattern, .)] diff --git a/scripts/database/test-db_perm.Rmd b/scripts/database/test-db_perm.Rmd index 29ae6a5..644c9bc 100644 --- a/scripts/database/test-db_perm.Rmd +++ b/scripts/database/test-db_perm.Rmd @@ -19,10 +19,21 @@ con_mariadb = open_mariadb(dbinfo, 1) # con ``` +## 测试读取数据 + ```{r} -table = tbl(con, "China_Mete2000_hourly_2020_2022") +con <- open_mariadb() ``` +```{r} +table = tbl(con, "China_Mete2000_hourly_2020_2022") + +system.time({ + d <- table |> + filter(site == 50136) |> + collect() +}) +``` ## 测试基础命令 @@ -40,7 +51,6 @@ t1 = system.time({ t2 = system.time({ copy_to(con_mariadb, df, "temp02", overwrite=TRUE, temporary = FALSE) }) - ```
diff --git a/scripts/speed_duckdb.R b/scripts/speed_duckdb.R
new file mode 100644
index 0000000..89a7102
--- /dev/null
+++ b/scripts/speed_duckdb.R
@@ -0,0 +1,8 @@
+library(dplyr)
+library(duckdb)
+# dbinfo$hourly_2020_2022
+
+db <- dbase$new()
+d <- db$read_data() # first run takes about 20s; 16s is enough to read all the data
+d <- db$read_data(50246) # 0.6s
+# db$close(force=TRUE)
diff --git a/scripts/test1_duckdb.qmd b/scripts/test1_duckdb.qmd
new file mode 100644
index 0000000..9d5748e
--- /dev/null
+++ b/scripts/test1_duckdb.qmd
@@ -0,0 +1,38 @@
+```{r}
+library(duckdb)
+library(data.table)
+library(dplyr)
+
+db <- "z:/DATA/China/ChinaMet_hourly_mete2000/data/China_Mete2000_hourly_full_2020-2022_tidy.duckdb"
+
+f <- "z:/DATA/China/ChinaMet_hourly_mete2000/data/China_Mete2000_hourly_full_2020-2022_tidy.csv"
+
+```
+
+# 写入数据
+
+```{r}
+# Create a connection to DuckDB
+con <- dbConnect(duckdb::duckdb(db))
+
+df <- fread(f)
+# Import the CSV file into DuckDB
+duckdb::dbWriteTable(con, "China_Mete2000_hourly_2020_2022", df)
+
+# Close the connection and release the database file
+dbDisconnect(con, shutdown = TRUE)
+```
+
+# 读取数据
+
+```{r}
+con <- dbConnect(duckdb::duckdb(db)) # reconnect: the chunk above closed `con`
+# Index the table written above on `site` (the column used for lookups)
+dbExecute(con, "CREATE INDEX idx_site ON China_Mete2000_hourly_2020_2022 (site)")
+
+dbDisconnect(con, shutdown = TRUE)
+```
+
+```{r}
+
+```
diff --git a/vignettes/database_config.Rmd b/vignettes/database_config.Rmd index 5bf499e..8128c68 100644 --- a/vignettes/database_config.Rmd +++ b/vignettes/database_config.Rmd @@ -56,5 +56,3 @@ dbinfo = get_dbInfo() # see which db to read con_mysql = open_mysql() con_mariadb = open_mariadb(1, dbinfo) ``` - - diff --git a/vignettes/database_tidydata.Rmd b/vignettes/database_tidydata.Rmd new file mode 100644 index 0000000..c783d67 --- /dev/null +++ b/vignettes/database_tidydata.Rmd @@ -0,0 +1,26 @@ +--- +title: "database_tidydata" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{database_tidydata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## 测试读取和清洗数据 + +```{r setup} +library(tidydb2) +``` + +```{r} + +``` +