First some settings…
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
R - tidyverse
library(tidyverse)
charts <- read_csv("data/charts.csv", n_max = 100000)
head(charts)
charts %>%
filter(chart == 'top200', region == 'Sweden') %>%
group_by(artist) %>%
summarise(
n_dates = n_distinct(date),
n_songs = n_distinct(title)) %>%
arrange(desc(n_dates)) %>%
head(10)
R - duckdb
library(DBI)
library(duckdb)
con = dbConnect(duckdb(), dbdir="duckdb_r_db")
dbExecute(con,"drop table if exists charts;")
## [1] 0
dbExecute(con, "
create table charts as
select * from read_csv_auto('data/charts.csv');
")
## [1] 25001801
dbGetQuery(con, "select * from charts limit 10")
dbGetQuery(con, "
select
artist,
count(distinct date) n_dates,
count(distinct title) n_songs
from charts
where chart = 'top200' and region = 'Sweden'
group by artist
order by n_dates desc
limit 10
")
duckdb_charts <- tbl(con, "charts")
head(duckdb_charts)
duckdb_charts <- tbl(con, "charts")
duckdb_charts %>%
filter(chart == 'top200', region == 'Sweden') %>%
group_by(artist) %>%
summarise(n_dates = n_distinct(date), n_songs = n_distinct(title)) %>%
arrange(desc(n_dates)) %>%
head(10)
dbDisconnect(con, shutdown=TRUE)
R - spark
library(sparklyr)
library(tidyverse)
sc <- spark_connect(master = "local")
spark_charts <- spark_read_csv(sc, "charts", "data/charts.csv")
head(spark_charts)
spark_charts %>%
filter(chart == 'top200', region == 'Sweden') %>%
group_by(artist) %>%
summarise(n_dates = n_distinct(date), n_songs = n_distinct(title)) %>%
arrange(desc(n_dates)) %>%
head(10)
dbGetQuery(sc, "select * from charts limit 10")
dbGetQuery(sc, "
select
artist,
count(distinct date) n_dates,
count(distinct title) n_songs
from charts
where chart = 'top200' and region = 'Sweden'
group by artist
order by n_dates desc
limit 10
")
spark_disconnect(sc)
R - Snowflake
library("RJDBC")
library("DBI")
jdbcDriver <- JDBC(driverClass="com.snowflake.client.jdbc.SnowflakeDriver",
classPath=Sys.getenv("SNOWFLAKE_CLASS_PATH"))
con <- dbConnect(jdbcDriver, "jdbc:snowflake://kc75442.us-east-1.snowflakecomputing.com",
Sys.getenv("SNOWFLAKE_USER"),Sys.getenv("SNOWFLAKE_PASSWORD"))
dbSendQuery(con, "use role data_scientist")
## <JDBCResult>
dbSendQuery(con, "use schema analytics.ds")
## <JDBCResult>
dbGetQuery(con, "select * from charts limit 10")
dbGetQuery(con, "
select
artist,
count(distinct date) n_dates,
count(distinct title) n_songs
from charts
where chart = 'top200' and region = 'Sweden'
group by artist
order by n_dates desc
limit 10
")
dbGetQuery(con, "
select *
from charts
where artist = 'Journey'
order by date desc
limit 100
")
dbDisconnect(con)
## [1] TRUE