First some settings…

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

R - tidyverse

library(tidyverse)

charts <- read_csv("data/charts.csv", n_max = 100000)
head(charts)
charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(
    n_dates = n_distinct(date), 
    n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)

R - duckdb

library(DBI)
library(duckdb)
con = dbConnect(duckdb(), dbdir="duckdb_r_db")
dbExecute(con,"drop table if exists charts;")
## [1] 0
dbExecute(con, "
  create table charts as 
  select * from read_csv_auto('data/charts.csv');
")
## [1] 25001801
dbGetQuery(con, "select * from charts limit 10")
dbGetQuery(con, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")
duckdb_charts <- tbl(con, "charts")
head(duckdb_charts)
duckdb_charts <- tbl(con, "charts")

duckdb_charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(n_dates = n_distinct(date), n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)
dbDisconnect(con, shutdown=TRUE)

R - spark

library(sparklyr)
library(tidyverse)
sc <- spark_connect(master = "local")
spark_charts <- spark_read_csv(sc, "charts", "data/charts.csv")
head(spark_charts)
spark_charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(n_dates = n_distinct(date), n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)
dbGetQuery(sc, "select * from charts limit 10")
dbGetQuery(sc, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")
spark_disconnect(sc)

R - Snowflake

library("RJDBC")
library("DBI")

jdbcDriver <- JDBC(driverClass="com.snowflake.client.jdbc.SnowflakeDriver", 
                   classPath=Sys.getenv("SNOWFLAKE_CLASS_PATH"))
con <- dbConnect(jdbcDriver, "jdbc:snowflake://kc75442.us-east-1.snowflakecomputing.com", 
                 Sys.getenv("SNOWFLAKE_USER"),Sys.getenv("SNOWFLAKE_PASSWORD"))

dbSendQuery(con, "use role data_scientist")
## <JDBCResult>
dbSendQuery(con, "use schema analytics.ds")
## <JDBCResult>
dbGetQuery(con, "select * from charts limit 10") 
dbGetQuery(con, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")
dbGetQuery(con, "
  select *
  from charts
  where artist = 'Journey'
  order by date desc
  limit 100
")
dbDisconnect(con)
## [1] TRUE