import pandas as pd
pd.set_option('display.width', 100)
import os
os.environ['SPARK_HOME'] = '/Users/rasmus/spark/spark-3.0.0-bin-hadoop2.7'
import pandas as pd
charts = pd.read_csv("data/charts.csv", nrows=100000)
charts.head()
title | rank | date | artist | url | region | chart | trend | streams | |
---|---|---|---|---|---|---|---|---|---|
0 | Despacito (Featuring Daddy Yankee) | 1 | 2017-02-01 | Luis Fonsi | https://open.spotify.com/track/4aWmUDTfIPGksMN... | Argentina | top200 | SAME_POSITION | 376615.0 |
1 | Reggaetón Lento (Bailemos) | 2 | 2017-02-01 | CNCO | https://open.spotify.com/track/3AEZUABDXNtecAO... | Argentina | top200 | SAME_POSITION | 195906.0 |
2 | Chantaje (feat. Maluma) | 3 | 2017-02-01 | Shakira | https://open.spotify.com/track/6mICuAdrwEjh6Y6... | Argentina | top200 | SAME_POSITION | 170048.0 |
3 | Cuando Se Pone a Bailar | 4 | 2017-02-01 | Rombai | https://open.spotify.com/track/1MpKZi1zTXpERKw... | Argentina | top200 | SAME_POSITION | 154424.0 |
4 | Otra vez (feat. J Balvin) | 5 | 2017-02-01 | Zion & Lennox | https://open.spotify.com/track/3QwBODjSEzelZyV... | Argentina | top200 | SAME_POSITION | 123791.0 |
(charts
.query('chart == "top200" and region == "Sweden"')
.groupby('artist', as_index=False)
.agg(
n_dates=('date', 'nunique'),
n_songs=('title', 'nunique'))
.sort_values(by='n_dates', ascending=False)
.head(10))
artist | n_dates | n_songs | |
---|---|---|---|
0 | A R I Z O N A | 8 | 2 |
173 | Olivia Holt | 8 | 1 |
148 | Marcus & Martinus, Katastrofe | 8 | 1 |
70 | Gabrielle | 8 | 1 |
151 | Maroon 5, Kendrick Lamar | 8 | 1 |
152 | Martin Garrix, Bebe Rexha | 8 | 1 |
153 | Martin Garrix, Dua Lipa | 8 | 1 |
66 | Flo Rida, 99 Percent | 8 | 1 |
154 | Martin Jensen | 8 | 2 |
63 | FRENSHIP, Emily Warren | 8 | 1 |
import duckdb
con = duckdb.connect(database='duckdb_python_db')
con.execute('drop table if exists charts')
con.execute("""
create table charts as
select * from read_csv_auto('data/charts.csv')
""")
<duckdb.DuckDBPyConnection at 0x1619eb930>
con.execute("select * from charts limit 10").fetchdf()
title | rank | date | artist | url | region | chart | trend | streams | |
---|---|---|---|---|---|---|---|---|---|
0 | Despacito (Featuring Daddy Yankee) | 1 | 2017-02-01 | Luis Fonsi | https://open.spotify.com/track/4aWmUDTfIPGksMN... | Argentina | top200 | SAME_POSITION | 376615 |
1 | Reggaetón Lento (Bailemos) | 2 | 2017-02-01 | CNCO | https://open.spotify.com/track/3AEZUABDXNtecAO... | Argentina | top200 | SAME_POSITION | 195906 |
2 | Chantaje (feat. Maluma) | 3 | 2017-02-01 | Shakira | https://open.spotify.com/track/6mICuAdrwEjh6Y6... | Argentina | top200 | SAME_POSITION | 170048 |
3 | Cuando Se Pone a Bailar | 4 | 2017-02-01 | Rombai | https://open.spotify.com/track/1MpKZi1zTXpERKw... | Argentina | top200 | SAME_POSITION | 154424 |
4 | Otra vez (feat. J Balvin) | 5 | 2017-02-01 | Zion & Lennox | https://open.spotify.com/track/3QwBODjSEzelZyV... | Argentina | top200 | SAME_POSITION | 123791 |
5 | El Amante | 6 | 2017-02-01 | Nicky Jam | https://open.spotify.com/track/3umS4y3uQDkqekN... | Argentina | top200 | MOVE_UP | 120908 |
6 | Shape of You | 7 | 2017-02-01 | Ed Sheeran | https://open.spotify.com/track/7qiZfU4dY1lWllz... | Argentina | top200 | MOVE_UP | 119201 |
7 | Vente Pa' Ca (feat. Maluma) | 8 | 2017-02-01 | Ricky Martin | https://open.spotify.com/track/7DM4BPaS7uofFul... | Argentina | top200 | MOVE_DOWN | 116437 |
8 | Safari | 9 | 2017-02-01 | J Balvin, Pharrell Williams, BIA, Sky | https://open.spotify.com/track/6rQSrBHf7HlZjtc... | Argentina | top200 | MOVE_DOWN | 113658 |
9 | Vacaciones | 10 | 2017-02-01 | Wisin | https://open.spotify.com/track/3dQDid3IUNhZy1O... | Argentina | top200 | SAME_POSITION | 108785 |
con.execute("""
select
artist,
count(distinct date) n_dates,
count(distinct title) n_songs
from charts
where chart = 'top200' and region = 'Sweden'
group by artist
order by n_dates desc
limit 10
""").fetchdf()
artist | n_dates | n_songs | |
---|---|---|---|
0 | Hov1 | 1664 | 46 |
1 | Avicii | 1540 | 52 |
2 | Ed Sheeran | 1514 | 59 |
3 | Laleh | 1484 | 18 |
4 | Molly Sandén | 1468 | 37 |
5 | Journey | 1431 | 1 |
6 | Miriam Bryant | 1291 | 18 |
7 | James Arthur | 1271 | 16 |
8 | Post Malone | 1249 | 39 |
9 | Billie Eilish | 1205 | 40 |
con.close()
import findspark
findspark.init()
import pyspark
import pyspark.sql.functions as F
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark.sparkContext.uiWebUrl
'http://ip-192-168-1-24.ec2.internal:4040'
charts = spark.read.csv('data/charts.csv', header=True)
charts.limit(10).show()
+--------------------+----+----------+--------------------+--------------------+---------+------+-------------+-------+ | title|rank| date| artist| url| region| chart| trend|streams| +--------------------+----+----------+--------------------+--------------------+---------+------+-------------+-------+ |Despacito (Featur...| 1|2017-02-01| Luis Fonsi|https://open.spot...|Argentina|top200|SAME_POSITION| 376615| |Reggaetón Lento (...| 2|2017-02-01| CNCO|https://open.spot...|Argentina|top200|SAME_POSITION| 195906| |Chantaje (feat. M...| 3|2017-02-01| Shakira|https://open.spot...|Argentina|top200|SAME_POSITION| 170048| |Cuando Se Pone a ...| 4|2017-02-01| Rombai|https://open.spot...|Argentina|top200|SAME_POSITION| 154424| |Otra vez (feat. J...| 5|2017-02-01| Zion & Lennox|https://open.spot...|Argentina|top200|SAME_POSITION| 123791| | El Amante| 6|2017-02-01| Nicky Jam|https://open.spot...|Argentina|top200| MOVE_UP| 120908| | Shape of You| 7|2017-02-01| Ed Sheeran|https://open.spot...|Argentina|top200| MOVE_UP| 119201| |Vente Pa' Ca (fea...| 8|2017-02-01| Ricky Martin|https://open.spot...|Argentina|top200| MOVE_DOWN| 116437| | Safari| 9|2017-02-01|J Balvin, Pharrel...|https://open.spot...|Argentina|top200| MOVE_DOWN| 113658| | Vacaciones| 10|2017-02-01| Wisin|https://open.spot...|Argentina|top200|SAME_POSITION| 108785| +--------------------+----+----------+--------------------+--------------------+---------+------+-------------+-------+
(charts
.filter('chart = "top200" and region = "Sweden"')
.groupBy('artist')
.agg(
F.countDistinct('date').alias('n_dates'),
F.countDistinct('title').alias('n_songs'))
.sort('n_dates', ascending=False)
.limit(10)
.show())
+-------------+-------+-------+ | artist|n_dates|n_songs| +-------------+-------+-------+ | Hov1| 1664| 46| | Avicii| 1540| 52| | Ed Sheeran| 1514| 59| | Laleh| 1484| 18| | Molly Sandén| 1468| 37| | Journey| 1431| 1| |Miriam Bryant| 1291| 18| | James Arthur| 1271| 16| | Post Malone| 1249| 39| |Billie Eilish| 1205| 40| +-------------+-------+-------+
charts.registerTempTable("charts")
spark.sql('select * from charts limit 10').show()
+--------------------+----+----------+--------------------+--------------------+---------+------+-------------+-------+ | title|rank| date| artist| url| region| chart| trend|streams| +--------------------+----+----------+--------------------+--------------------+---------+------+-------------+-------+ |Despacito (Featur...| 1|2017-02-01| Luis Fonsi|https://open.spot...|Argentina|top200|SAME_POSITION| 376615| |Reggaetón Lento (...| 2|2017-02-01| CNCO|https://open.spot...|Argentina|top200|SAME_POSITION| 195906| |Chantaje (feat. M...| 3|2017-02-01| Shakira|https://open.spot...|Argentina|top200|SAME_POSITION| 170048| |Cuando Se Pone a ...| 4|2017-02-01| Rombai|https://open.spot...|Argentina|top200|SAME_POSITION| 154424| |Otra vez (feat. J...| 5|2017-02-01| Zion & Lennox|https://open.spot...|Argentina|top200|SAME_POSITION| 123791| | El Amante| 6|2017-02-01| Nicky Jam|https://open.spot...|Argentina|top200| MOVE_UP| 120908| | Shape of You| 7|2017-02-01| Ed Sheeran|https://open.spot...|Argentina|top200| MOVE_UP| 119201| |Vente Pa' Ca (fea...| 8|2017-02-01| Ricky Martin|https://open.spot...|Argentina|top200| MOVE_DOWN| 116437| | Safari| 9|2017-02-01|J Balvin, Pharrel...|https://open.spot...|Argentina|top200| MOVE_DOWN| 113658| | Vacaciones| 10|2017-02-01| Wisin|https://open.spot...|Argentina|top200|SAME_POSITION| 108785| +--------------------+----+----------+--------------------+--------------------+---------+------+-------------+-------+
spark.sql("""
select
artist,
count(distinct date) n_dates,
count(distinct title) n_songs
from charts
where chart = 'top200' and region = 'Sweden'
group by artist
order by n_dates desc
limit 10
""").show()
+-------------+-------+-------+ | artist|n_dates|n_songs| +-------------+-------+-------+ | Hov1| 1664| 46| | Avicii| 1540| 52| | Ed Sheeran| 1514| 59| | Laleh| 1484| 18| | Molly Sandén| 1468| 37| | Journey| 1431| 1| |Miriam Bryant| 1291| 18| | James Arthur| 1271| 16| | Post Malone| 1249| 39| |Billie Eilish| 1205| 40| +-------------+-------+-------+
spark.sparkContext.stop()
spark.stop()
import os
from sqlalchemy import create_engine
import pandas as pd
# This creates a SQLAlchemy connection to snowflake
user = os.environ['SNOWFLAKE_USER']
password = os.environ['SNOWFLAKE_PASSWORD']
account = 'kc75442.us-east-1'
connection_string = f'snowflake://{user}:{password}@{account}/'
engine = create_engine(connection_string)
con = engine.connect()
con.execute("use role data_scientist")
con.execute("use schema analytics.ds")
<sqlalchemy.engine.result.ResultProxy at 0x168dfceb0>
pd.read_sql_query("select * from charts limit 10", con)
title | rank | date | artist | url | region | chart | trend | streams | |
---|---|---|---|---|---|---|---|---|---|
0 | 60 années | 105 | 2018-09-06 | Damso | https://open.spotify.com/track/3mqq0Z92V15UZX4... | France | top200 | MOVE_DOWN | 40571 |
1 | Te Boté - Remix | 80 | 2018-06-14 | Nio Garcia, Casper Magico, Bad Bunny, Darell, ... | https://open.spotify.com/track/3V8UKqhEK5zBkBb... | Netherlands | top200 | MOVE_DOWN | 37864 |
2 | Sofiane | 81 | 2018-06-14 | Boef | https://open.spotify.com/track/7aK0LpCyxzN5K9p... | Netherlands | top200 | MOVE_UP | 37845 |
3 | Reborn | 82 | 2018-06-14 | KIDS SEE GHOSTS | https://open.spotify.com/track/4RVbK6cV0VqWdpC... | Netherlands | top200 | MOVE_DOWN | 37715 |
4 | This Is America | 83 | 2018-06-14 | Childish Gambino | https://open.spotify.com/track/0b9oOr2ZgvyQu88... | Netherlands | top200 | MOVE_DOWN | 36994 |
5 | Does It Matter | 84 | 2018-06-14 | Janieck | https://open.spotify.com/track/7FxidSR6IoL3qMR... | Netherlands | top200 | MOVE_UP | 36562 |
6 | Where Were You In The Morning? | 85 | 2018-06-14 | Shawn Mendes | https://open.spotify.com/track/51wrmElSLxFzc0v... | Netherlands | top200 | MOVE_DOWN | 36393 |
7 | Vuurwerk | 86 | 2018-06-14 | Spanker, 3robi, Josylvio, Lijpe | https://open.spotify.com/track/6qN7yPBgF9t6x2T... | Netherlands | top200 | MOVE_UP | 35956 |
8 | I'm Upset | 87 | 2018-06-14 | Drake | https://open.spotify.com/track/54oMZ3blwWwuPN4... | Netherlands | top200 | MOVE_UP | 35317 |
9 | Without You (feat. Sandro Cavazza) | 88 | 2018-06-14 | Avicii | https://open.spotify.com/track/6WbADFqMvR8N5u0... | Netherlands | top200 | MOVE_DOWN | 35137 |
pd.read_sql_query("""
select
artist,
count(distinct date) n_dates,
count(distinct title) n_songs
from charts
where chart = 'top200' and region = 'Sweden'
group by artist
order by n_dates desc
limit 10
""", con)
artist | n_dates | n_songs | |
---|---|---|---|
0 | Hov1 | 1664 | 46 |
1 | Avicii | 1540 | 52 |
2 | Ed Sheeran | 1514 | 59 |
3 | Laleh | 1484 | 18 |
4 | Molly Sandén | 1468 | 37 |
5 | Journey | 1431 | 1 |
6 | Miriam Bryant | 1291 | 18 |
7 | James Arthur | 1271 | 16 |
8 | Post Malone | 1249 | 39 |
9 | Billie Eilish | 1205 | 40 |
pd.read_sql_query("""
select *
from charts
where artist = 'Journey'
order by date desc
limit 100
""", con)
title | rank | date | artist | url | region | chart | trend | streams | |
---|---|---|---|---|---|---|---|---|---|
0 | Don't Stop Believin' | 173 | 2021-09-30 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | United Kingdom | top200 | MOVE_UP | 49942 |
1 | Don't Stop Believin' | 139 | 2021-09-30 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | Australia | top200 | MOVE_UP | 34883 |
2 | Don't Stop Believin' | 137 | 2021-09-30 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | Ireland | top200 | MOVE_DOWN | 6211 |
3 | Don't Stop Believin' | 196 | 2021-09-30 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | Austria | top200 | NEW_ENTRY | 4557 |
4 | Don't Stop Believin' | 103 | 2021-09-30 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | Canada | top200 | MOVE_UP | 39446 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | Don't Stop Believin' | 172 | 2021-09-17 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | Sweden | top200 | NEW_ENTRY | 21383 |
96 | Don't Stop Believin' | 136 | 2021-09-17 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | Australia | top200 | MOVE_UP | 39369 |
97 | Don't Stop Believin' | 98 | 2021-09-17 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | New Zealand | top200 | MOVE_UP | 8723 |
98 | Don't Stop Believin' | 193 | 2021-09-16 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | Austria | top200 | MOVE_DOWN | 4544 |
99 | Don't Stop Believin' | 194 | 2021-09-16 | Journey | https://open.spotify.com/track/4bHsxqR3GMrXTxE... | South Africa | top200 | NEW_ENTRY | 3142 |
100 rows × 9 columns
con.close()
engine.dispose()