from pathlib import Path
from IPython.display import Image, display
import pandas as pd

# project root
repo_root = Path().resolve().parent  

# database path relative to project root
DB_PATH = repo_root / "data" / "Chinook_Sqlite.sqlite"

assert DB_PATH.exists(), f"Database file not found: {DB_PATH}"

# SQLAlchemy engine
from sqlalchemy import create_engine
engine = create_engine(f"sqlite:///{DB_PATH}")

# image path relative to repo root
image_path = repo_root / "input" / "ChinookDBStructure.png"

# check it exists
assert image_path.exists(), f"Image file not found: {image_path}"

# display
display(Image(filename=str(image_path), width=600, height=400))

query = """

SELECT TRIM(c.FirstName || ' ' || c.LastName) as customer_name,
       i.InvoiceId,
       i.Total AS invoice_amount
FROM customer AS c
JOIN invoice AS i ON c.CustomerId = i.CustomerId
ORDER BY customer_name ;

"""
df = pd.read_sql_query(query, engine)
df

query = """

SELECT c.Country, -- using customer country
       COALESCE(SUM(i.Total),0) AS total_revenue
FROM customer AS c
JOIN invoice AS i ON c.CustomerId = i.CustomerId
GROUP BY c.Country
ORDER BY total_revenue DESC ;

"""
df = pd.read_sql_query(query, engine)
df

query = """

SELECT ar.Name AS artist,
       COUNT(tr.TrackId) AS track_count
FROM artist AS ar
JOIN album AS al ON ar.ArtistId = al.ArtistId
JOIN track AS tr ON al.AlbumId = tr.AlbumId
GROUP BY ar.ArtistId, ar.Name
ORDER BY track_count DESC ;

"""
df = pd.read_sql_query(query, engine)
df

# LIMIT version

query = """

SELECT TRIM(c.FirstName || ' ' || c.LastName) AS customer_name,
       COALESCE(SUM(i.Total),0) AS total_invoice_amount
FROM customer AS c
JOIN invoice AS i ON c.CustomerId = i.CustomerId
GROUP BY c.CustomerId, c.FirstName, c.LastName
ORDER BY total_invoice_amount DESC
LIMIT 5 ;

"""
df = pd.read_sql_query(query, engine)
df

# CTE version

query = """

WITH ranked_customers AS(

SELECT TRIM(c.FirstName || ' ' || c.LastName) AS customer_name,
       COALESCE(SUM(i.Total),0) AS total_invoice_amount,
       RANK() OVER(ORDER BY COALESCE(SUM(i.Total),0) DESC) AS customer_rank
FROM customer AS c
JOIN invoice AS i ON c.CustomerId = i.CustomerId
GROUP BY c.CustomerId, c.FirstName, c.LastName
ORDER BY total_invoice_amount DESC
)
SELECT *
FROM ranked_customers
WHERE customer_rank <= 5
ORDER BY total_invoice_amount DESC;

"""
df = pd.read_sql_query(query, engine)
df

query = """

SELECT TRIM(e.FirstName || ' ' || e.LastName) AS employee_name,
       COUNT(i.InvoiceId) AS invoice_count
FROM employee AS e
JOIN customer AS c ON e.EmployeeId = c.SupportRepId
JOIN invoice AS i ON c.CustomerId = i.CustomerId
GROUP BY e.EmployeeId, e.FirstName, e.LastName
ORDER BY employee_name

"""
df = pd.read_sql_query(query, engine)
df

query = """

SELECT g.Name AS genre_name,
       COALESCE(SUM(il.UnitPrice * il.Quantity),0) AS total_revenue
FROM genre AS g
JOIN track AS t ON g.GenreId = t.GenreId
JOIN invoiceline AS il ON t.TrackId = il.TrackId
GROUP BY g.GenreId, g.Name
ORDER BY total_revenue DESC

"""
df = pd.read_sql_query(query, engine)
df

# LIMIT version

query = """

SELECT ar.Name AS artist_name,
       COUNT(tr.TrackId) AS track_count
FROM artist AS ar
JOIN album AS al ON ar.ArtistId = al.ArtistId
JOIN track AS tr ON al.AlbumId = tr.AlbumId
GROUP BY ar.ArtistId
ORDER BY track_count DESC
LIMIT 55 ; -- manually determine number of rows that have 20+ track counts

"""
df = pd.read_sql_query(query, engine)
df

# CTE version
query = """

WITH artist_track_counts AS(
SELECT ar.Name AS artist_name,
       COUNT(tr.TrackId) AS track_count
FROM artist AS ar
JOIN album AS al ON ar.ArtistId = al.ArtistId
JOIN track AS tr ON al.AlbumId = tr.AlbumId
GROUP BY ar.ArtistId
ORDER BY track_count DESC 
)
SELECT *
FROM artist_track_counts
WHERE track_count >= 20 -- filter for artists that have 20+ track counts
ORDER BY track_count DESC ;

"""
df = pd.read_sql_query(query, engine)
df

	customer_name	InvoiceId	invoice_amount
0	Aaron Mitchell	50	1.98
1	Aaron Mitchell	61	13.86
2	Aaron Mitchell	116	8.91
3	Aaron Mitchell	245	1.98
4	Aaron Mitchell	268	3.96
...	...	...	...
407	Wyatt Girard	83	0.99
408	Wyatt Girard	204	3.98
409	Wyatt Girard	215	13.86
410	Wyatt Girard	270	8.91
411	Wyatt Girard	399	1.98

	Country	total_revenue
0	USA	523.06
1	Canada	303.96
2	France	195.10
3	Brazil	190.10
4	Germany	156.48
5	United Kingdom	112.86
6	Czech Republic	90.24
7	Portugal	77.24
8	India	75.26
9	Chile	46.62
10	Ireland	45.62
11	Hungary	45.62
12	Austria	42.62
13	Finland	41.62
14	Netherlands	40.62
15	Norway	39.62
16	Sweden	38.62
17	Spain	37.62
18	Poland	37.62
19	Italy	37.62
20	Denmark	37.62
21	Belgium	37.62
22	Australia	37.62
23	Argentina	37.62

	artist	track_count
0	Iron Maiden	213
1	U2	135
2	Led Zeppelin	114
3	Metallica	112
4	Deep Purple	92
...	...	...
199	Mela Tenenbaum, Pro Musica Prague & Richard Kapp	1
200	Emerson String Quartet	1
201	C. Monteverdi, Nigel Rogers - Chiaroscuro; Lon...	1
202	Nash Ensemble	1
203	Philip Glass Ensemble	1

	customer_name	total_invoice_amount
0	Helena Holý	49.62
1	Richard Cunningham	47.62
2	Luis Rojas	46.62
3	Ladislav Kovács	45.62
4	Hugh O'Reilly	45.62

	customer_name	total_invoice_amount	customer_rank
0	Helena Holý	49.62	1
1	Richard Cunningham	47.62	2
2	Luis Rojas	46.62	3
3	Ladislav Kovács	45.62	4
4	Hugh O'Reilly	45.62	4

SQL Examples: Joins and Aggregation Queries¶

Setup¶

Query Examples¶

1) List all invoices with the customer name and total invoice amount.¶

2) Show total revenue per country.¶

3) Count the number of tracks each artist has.¶

4) Find the top 5 customers by total invoice amount.¶

5) List the number of invoices per employee.¶

6) Show total revenue per genre.¶

7) Find artists with more than 20 tracks in the database.¶

	genre_name	total_revenue
0	Rock	826.65
1	Latin	382.14
2	Metal	261.36
3	Alternative & Punk	241.56
4	TV Shows	93.53
5	Jazz	79.20
6	Blues	60.39
7	Drama	57.71
8	R&B/Soul	40.59
9	Classical	40.59
10	Sci Fi & Fantasy	39.80
11	Reggae	29.70
12	Pop	27.72
13	Soundtrack	19.80
14	Comedy	17.91
15	Hip Hop/Rap	16.83
16	Bossa Nova	14.85
17	Alternative	13.86
18	World	12.87
19	Science Fiction	11.94
20	Heavy Metal	11.88
21	Electronica/Dance	11.88
22	Easy Listening	9.90
23	Rock And Roll	5.94

	employee_name	invoice_count
0	Jane Peacock	146
1	Margaret Park	140
2	Steve Johnson	126

	artist_name	track_count
0	Iron Maiden	213
1	U2	135
2	Led Zeppelin	114
3	Metallica	112
4	Lost	92
5	Deep Purple	92
6	Pearl Jam	67
7	Lenny Kravitz	57
8	Various Artists	56
9	The Office	53
10	Van Halen	52
11	Faith No More	52
12	Os Paralamas Do Sucesso	49
13	Red Hot Chili Peppers	48
14	Eric Clapton	48
15	Queen	45
16	Foo Fighters	44
17	Guns N' Roses	42
18	The Rolling Stones	41
19	R.E.M.	41
20	Creedence Clearwater Revival	40
21	Audioslave	40
22	Titãs	38
23	Miles Davis	37
24	Chico Science & Nação Zumbi	36
25	Kiss	35
26	Smashing Pumpkins	34
27	Green Day	34
28	Chico Buarque	34
29	Ozzy Osbourne	32
30	Jamiroquai	32
31	Gilberto Gil	32
32	Legião Urbana	31
33	Cidade Negra	31
34	Antônio Carlos Jobim	31
35	Tim Maia	30
36	The Cult	30
37	Cássia Eller	30
38	Nirvana	29
39	Lulu Santos	28

	artist_name	track_count
0	Iron Maiden	213
1	U2	135
2	Led Zeppelin	114
3	Metallica	112
4	Lost	92
5	Deep Purple	92
6	Pearl Jam	67
7	Lenny Kravitz	57
8	Various Artists	56
9	The Office	53
10	Van Halen	52
11	Faith No More	52
12	Os Paralamas Do Sucesso	49
13	Red Hot Chili Peppers	48
14	Eric Clapton	48
15	Queen	45
16	Foo Fighters	44
17	Guns N' Roses	42
18	The Rolling Stones	41
19	R.E.M.	41
20	Creedence Clearwater Revival	40
21	Audioslave	40
22	Titãs	38
23	Miles Davis	37
24	Chico Science & Nação Zumbi	36
25	Kiss	35
26	Smashing Pumpkins	34
27	Green Day	34
28	Chico Buarque	34
29	Ozzy Osbourne	32
30	Jamiroquai	32
31	Gilberto Gil	32
32	Legião Urbana	31
33	Cidade Negra	31
34	Antônio Carlos Jobim	31
35	Tim Maia	30
36	The Cult	30
37	Cássia Eller	30
38	Nirvana	29
39	Lulu Santos	28