Inserting a binary blob to SciDB from the R package


#1

Do we have an example of inserting a binary blob from the R package?


#2

Here is an example (note the special indexing after download from SciDB):

# Build a small 3 x 4 matrix of random values to use as the payload.
genexp_mat <- matrix(runif(3 * 4), nrow = 3)
print(dim(genexp_mat))
# [1] 3 4

# Serialize the matrix to a raw vector; with a NULL connection, serialize()
# returns the bytes instead of writing them anywhere.
x <- serialize(genexp_mat, connection = NULL)

library(scidb)
scidbconnect()

# Upload the raw vector; the printed handle shows a single <val:binary> cell.
xx <- as.scidb(x)
# SciDB expression  R_arraye83d2464ea9b1480256928078527...
# SciDB schema  <val:binary> [i=0:0,1,0]
#   variable dimension   type nullable start end chunk
# 1        i      TRUE  int64    FALSE     0   0     1
# 2      val     FALSE binary     TRUE

# Round-trip check: xx[] brings the array back into R; [[2]] selects the second
# column (`val` in the schema listing) and [[1]] its only element.
identical(x, xx[][[2]][[1]])

#3

I revisited this topic while working on a larger upload (the variable `es` in the code below is about 400 MB when serialized). Here is the updated code:

library(scidb)
db <- scidbconnect()

# The R object to upload is called `es` (it is not defined in this snippet).

# Report the size of the serialized object, i.e. the bytes that get uploaded.
format(object.size(serialize(es, connection = NULL)), units = "Mb")
# [1] "409 Mb"

Next, define some helper functions:

# Serialize an R object to a raw vector and upload it with as.scidb().
#
# db:          connection object, passed as the first argument to as.scidb().
# object_in_R: the R object to serialize and upload.
#
# Returns the value of as.scidb() (a handle to the uploaded array).
upload_as_binary <- function(db, object_in_R) {
  raw_payload <- serialize(object_in_R, connection = NULL)
  as.scidb(db, raw_payload)
}

# Serialize an R object, base64-encode the bytes, and upload the resulting
# character string with as.scidb().
#
# db:          connection object, passed as the first argument to as.scidb().
# object_in_R: the R object to serialize, encode and upload.
#
# Returns the value of as.scidb() (a handle to the uploaded array).
upload_as_base64_encoded_binary <- function(db, object_in_R) {
  raw_payload <- serialize(object_in_R, connection = NULL)
  encoded_payload <- jsonlite::base64_enc(raw_payload)
  as.scidb(db, encoded_payload)
}

# Download an array uploaded by upload_as_binary() and rebuild the R object.
#
# object_in_db: handle to the SciDB array, as accepted by as.R().
#
# Returns the unserialized R object stored in the first element of `val`.
download_from_binary <- function(object_in_db) {
  downloaded <- as.R(object_in_db)
  raw_payload <- downloaded$val[[1]]
  unserialize(raw_payload)
}

# Download an array uploaded by upload_as_base64_encoded_binary(), decode the
# base64 string back to raw bytes, and rebuild the R object.
#
# object_in_db: handle to the SciDB array, as accepted by as.R().
#
# Returns the unserialized R object stored in the first element of `val`.
download_from_base64_encoded_binary <- function(object_in_db) {
  downloaded <- as.R(object_in_db)
  encoded_payload <- downloaded$val[[1]]
  raw_payload <- jsonlite::base64_dec(encoded_payload)
  unserialize(raw_payload)
}

Let us time the uploads and downloads. The second approach (base64 encoding) uploads the data as character (string in SciDB) and is much slower.

# Round trip 1: raw binary upload, then download.
system.time({
  object_in_db <- upload_as_binary(db, object_in_R = es)
})
#  user  system elapsed
# 1.240   0.711   7.719
system.time({
  es_downloaded <- download_from_binary(object_in_db = object_in_db)
})
#  user  system elapsed
# 1.639   1.055   7.909

# Round trip 2: base64-encoded string upload, then download.
system.time({
  object_in_db2 <- upload_as_base64_encoded_binary(db, object_in_R = es)
})
#  user  system elapsed
# 2.867   1.250  15.471
system.time({
  es_downloaded2 <- download_from_base64_encoded_binary(
    object_in_db = object_in_db2
  )
})
#  user  system elapsed
# 8.181  62.658  77.729

The SciDB data types of the uploaded arrays:

# Print the handle from the raw binary upload; the attribute `val` has type
# binary.
object_in_db
# SciDB expression  R_array2c52330bcb315499692774629100...
# SciDB schema  <val:binary> [i=0:0:0:1]
#   variable dimension   type nullable start end chunk
# 1        i      TRUE  int64    FALSE     0   0     1
# 2      val     FALSE binary     TRUE

# Print the handle from the base64 upload; here `val` has type string.
object_in_db2
# SciDB expression  R_array2c5692c6a1615499692774629100...
# SciDB schema  <val:string> [i=0:0:0:1]
#   variable dimension   type nullable start end chunk
# 1        i      TRUE  int64    FALSE     0   0     1
# 2      val     FALSE string     TRUE