noctua 1.10.0

New Feature

library(DBI)
library(noctua)
con <- dbConnect(athena())
dbGetPartition(con, "test_df2", .format = T)
# Info: (Data scanned: 0 Bytes)
#    year month day
# 1: 2020    11  17
dbGetPartition(con, "test_df2")
# Info: (Data scanned: 0 Bytes)
#                    partition
# 1: year=2020/month=11/day=17

library(DBI)
con <- dbConnect(noctua::athena(), bigint = "numeric")

When switching between the different file parsers, the bigint setting is converted to the representation used by the chosen file parser, i.e. data.table: “integer64” -> vroom: “I”.
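
For example (a minimal sketch; the conversion happens automatically when the file parser is switched, and the connection below is purely illustrative):

library(DBI)
library(noctua)

# bigint returned using data.table's representation
con <- dbConnect(athena(), bigint = "integer64")

# switching the file parser to vroom maps the bigint setting to
# vroom's equivalent representation: "integer64" -> "I"
noctua_options("vroom")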

Bug Fix:

Documentation:

noctua 1.9.1

Note:

Error: write_parquet requires the arrow package, please install it first and try again

Bug Fix:

Error in seq.default(1, length(l), 1000) : wrong sign in 'by' argument
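
The error arose because seq() was called over an empty collection of S3 keys; a minimal reproduction with an empty list l of keys:

l <- list()              # no S3 objects were found
seq(1, length(l), 1000)  # Error in seq.default(1, length(l), 1000) :
                         #   wrong sign in 'by' argument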

Now a warning message will be returned:

Warning message:
Failed to remove AWS S3 files from: "s3://{bucket}/{prefix}/". Please check if AWS S3 files exist.

noctua 1.9.0

Minor Change:

library(DBI)
library(data.table)

X <- 1010
value <- data.table(x = 1:X,
                    y = sample(letters, X, replace = T), 
                    z = sample(c(TRUE, FALSE), X, replace = T))

con <- dbConnect(noctua::athena())

# create a removable table with 1010 parquet files in AWS S3.
dbWriteTable(con, "rm_tbl", value, file.type = "parquet", overwrite = T, max.batch = 1)

# old method: delete_object
system.time({dbRemoveTable(con, "rm_tbl", confirm = T)})
# user  system elapsed 
# 31.004   8.152 115.906 

# new method: delete_objects
system.time({dbRemoveTable(con, "rm_tbl", confirm = T)})
# user  system elapsed 
# 17.319   0.370  22.709 

New Feature

Bug Fix:

Info: The S3 objects in prefix will be deleted:
  s3://bucket/path/schema/table
Info: The S3 objects in prefix will be deleted:
  s3://bucket/path/schema/table

To overcome this dbRemoveTable now opts for paws::s3()$list_objects_v2 instead of paws::s3()$list_objects when listing S3 objects to be deleted. This allows noctua to iterate over the AWS S3 prefix using continuation tokens, instead of deleting objects in chunks.
* s3_upload_location simplified how the S3 location is built. The s3.location parameter is no longer altered; only the additional components (e.g. name, schema and partition) are appended.
* dbplyr v2.0.0's in_schema function now wraps strings in quotes, which breaks db_query_fields.AthenaConnection. db_query_fields.AthenaConnection now removes any quotation from the string so that it can search AWS Glue for table metadata. (#117)
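
A rough sketch of the token-based iteration and batch deletion (not noctua's exact implementation; the bucket, prefix and key handling below are illustrative placeholders):

s3 <- paws::s3()
bucket <- "bucket"
prefix <- "path/schema/table/"

# list every key under the prefix, following continuation tokens
keys <- character()
token <- NULL
repeat {
  resp <- if (is.null(token)) {
    s3$list_objects_v2(Bucket = bucket, Prefix = prefix)
  } else {
    s3$list_objects_v2(Bucket = bucket, Prefix = prefix, ContinuationToken = token)
  }
  keys <- c(keys, vapply(resp$Contents, function(x) x$Key, character(1)))
  if (!isTRUE(resp$IsTruncated)) break
  token <- resp$NextContinuationToken
}

# delete in batches of up to 1000 keys (the delete_objects limit)
if (length(keys) > 0) {
  for (i in seq(1, length(keys), 1000)) {
    batch <- keys[i:min(length(keys), i + 999)]
    s3$delete_objects(
      Bucket = bucket,
      Delete = list(Objects = lapply(batch, function(k) list(Key = k)))
    )
  }
}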

noctua 1.8.1

Bug Fix

noctua 1.8.0

New Feature

# Stop AWS Athena when R has been interrupted:

con <- dbConnect(noctua::athena())

# Let AWS Athena keep running when R has been interrupted:

con <- dbConnect(noctua::athena(),
                 keyboard_interrupt = F)

noctua 1.7.1

Minor Change

Documentation:

noctua 1.7.0

New Feature

library(DBI)
con <- dbConnect(noctua::athena())
res <- dbExecute(con, "select * from some_big_table limit 10000")
dbFetch(res, 5000)
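
To read an entire result set in chunks, the fetch can sit in a loop; a sketch assuming the standard DBI generics dbHasCompleted and dbClearResult track noctua's chunked fetching as expected:

chunks <- list()
while (!dbHasCompleted(res)) {
  chunks[[length(chunks) + 1]] <- dbFetch(res, 5000)
}
dbClearResult(res)
full_result <- do.call(rbind, chunks)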

Bug Fix

Documentation

noctua 1.6.0

New Feature

library(DBI)

con = dbConnect(noctua::athena())

# upload iris dataframe for removal test
dbWriteTable(con, "iris2", iris)

# Athena method
system.time(dbRemoveTable(con, "iris2", confirm = T))
# user  system elapsed 
# 0.247   0.091   2.243 

# upload iris dataframe for removal test
dbWriteTable(con, "iris2", iris)

# Glue method
system.time(dbRemoveTable(con, "iris2", confirm = T))
# user  system elapsed 
# 0.110   0.045   1.094 

library(DBI)
con = dbConnect(noctua::athena())
dbWriteTable(con, "iris2", iris, file.type = "json")
dbGetQuery(con, "select * from iris2")

Bug Fix

Documentation

Unit tests:

noctua 1.5.1

Bug Fix

library(readr)
library(microbenchmark)

# creating some dummy data for testing
X <- 1e8
df <- 
data.frame(
    w = runif(X),
    x = 1:X,
    y = sample(letters, X, replace = T), 
    z = sample(c(TRUE, FALSE), X, replace = T))
write_csv(df, "test.csv")

# read in text file into raw format
obj <- readBin("test.csv", what = "raw", n = file.size("test.csv"))

format(object.size(obj), units = "auto")
# 3.3 Gb

# writeBin in a loop
write_bin <- function(
  value,
  filename,
  chunk_size = 2L ^ 20L) {
  
  total_size <- length(value)
  split_vec <- seq(1, total_size, chunk_size)
  
  con <- file(filename, "a+b")
  on.exit(close(con))
  
  sapply(split_vec, function(x){writeBin(value[x:min(total_size,(x+chunk_size-1))],con)})
  invisible(TRUE)
}


microbenchmark(R_loop = write_bin(obj, tempfile()),
               readr = write_file(obj, tempfile()),
               times = 5)

# Unit: seconds
# expr       min       lq      mean    median        uq       max neval
# R_loop 41.463273 41.62077 42.265778 41.908908 42.022042 44.313893     5
# readr  2.291571  2.40495  2.496871  2.542544  2.558367  2.686921     5

library(dbplyr)

# Before
translate_sql("2019-01-01", con = con)
# '2019-01-01'

# Now
translate_sql("2019-01-01", con = con)
# DATE '2019-01-01'

# R code:
paste("hi", "bye", sep = "-")

# SQL translation:
('hi'||'-'||'bye')

library(DBI)
library(dplyr)

con <- dbConnect(noctua::athena())

tbl(con, "iris") %>%
  compute(name = "temp.iris")

New Feature

library(DBI)
library(dplyr)

con <- dbConnect(noctua::athena())

# ident method:
t1 <- system.time(tbl(con, "iris"))

# sub query method:
t2 <- system.time(tbl(con, sql("select * from iris")))

# ident method
# user  system elapsed 
# 0.082   0.012   0.288 

# sub query method
# user  system elapsed 
# 0.993   0.138   3.660 

Unit test

noctua 1.5.0

New Feature

library(noctua)

noctua_options("vroom")

Unit tests

Documentation

noctua 1.4.0

Major Change

warning('Appended `file.type` is not compatible with the existing Athena DDL file type and has been converted to "', File.Type,'".', call. = FALSE)
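
For illustration, a hedged sketch of the scenario that raises this warning (the table name and file types are hypothetical): appending to an existing Athena table with a mismatching file.type converts the upload to the table's existing DDL file type.

library(DBI)
con <- dbConnect(noctua::athena())

# existing table stored as parquet
dbWriteTable(con, "iris_prq", iris, file.type = "parquet")

# appending with a different file.type triggers the warning above and
# the data is written as "parquet" to match the existing table DDL
dbWriteTable(con, "iris_prq", iris, append = TRUE, file.type = "csv")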

Bug fix

Unit Tests

New Feature

Minor Change

noctua 1.3.0

Major Change

Performance results

library(DBI)
X <- 1e8
df <- data.frame(w = runif(X),
                 x = 1:X,
                 y = sample(letters, X, replace = T), 
                 z = sample(c(TRUE, FALSE), X, replace = T))
con <- dbConnect(noctua::athena())
# upload dataframe with different splits
dbWriteTable(con, "test_split1", df, compress = T, max.batch = nrow(df), overwrite = T) # no splits
dbWriteTable(con, "test_split2", df, compress = T, max.batch = 0.05 * nrow(df), overwrite = T) # 20 splits
dbWriteTable(con, "test_split3", df, compress = T, max.batch = 0.1 * nrow(df), overwrite = T) # 10 splits

AWS Athena performance results from AWS console (query executed: select count(*) from .... ):

library(DBI)
X <- 1e8
df <- data.frame(w = runif(X),
                 x = 1:X,
                 y = sample(letters, X, replace = T), 
                 z = sample(c(TRUE, FALSE), X, replace = T))
con <- dbConnect(noctua::athena())
dbWriteTable(con, "test_split1", df, compress = T, overwrite = T) # default will now split compressed file into 20 equal size files.

Added an information message to inform the user what files have been added to the S3 location when overwriting an Athena table.

Minor Change

Bug Fix

Unit tests

noctua 1.2.1

New Features:

Bug fixed

noctua 1.2.0

Minor Change

Backend Change

library(DBI)

con <- dbConnect(noctua::athena())

dbWriteTable(con, "iris", iris)

Bug Fix

Unit Tests

New Feature

Minor Change

noctua 1.1.0

New Features

Bug fix

Unit Tests

Minor Change

Major Change

noctua 1.0.0

New Features

DBI

Athena lower level api