from pathlib import Path
from IPython.display import Image, display
import pandas as pd

# project root
repo_root = Path().resolve().parent  

# database path relative to project root
DB_PATH = repo_root / "data" / "Chinook_Sqlite.sqlite"

assert DB_PATH.exists(), f"Database file not found: {DB_PATH}"

# SQLAlchemy engine
from sqlalchemy import create_engine
engine = create_engine(f"sqlite:///{DB_PATH}")

# image path relative to repo root
image_path = repo_root / "input" / "ChinookDBStructure.png"

# check it exists
assert image_path.exists(), f"Image file not found: {image_path}"

# display
display(Image(filename=str(image_path), width=600, height=400))

query = """

WITH top_10_customers AS (
  SELECT 
      c.CustomerId,
      TRIM(c.FirstName || ' ' || c.LastName) AS customer_name,
      SUM(i.Total) AS total_invoice_revenue
  FROM customer AS c
  JOIN invoice AS i 
      ON c.CustomerId = i.CustomerId
  GROUP BY c.CustomerId, c.FirstName, c.LastName
  ORDER BY total_invoice_revenue DESC
  LIMIT 10
)
SELECT 
    ttc.customer_name,
    i.InvoiceId,
    i.InvoiceDate,
    i.Total
FROM invoice AS i
JOIN top_10_customers AS ttc 
    ON i.CustomerId = ttc.CustomerId
ORDER BY ttc.customer_name, i.InvoiceDate ;

"""
df = pd.read_sql_query(query, engine)
df

query = """

SELECT 
     g.name AS genre_name,
     AVG(t.UnitPrice) AS average_track_price
FROM track AS t
JOIN genre AS g
  ON t.GenreId = g.GenreId
GROUP BY g.GenreId, g.name
  HAVING AVG(t.UnitPrice) = (
    SELECT MAX(average_price) --filtering by genres that have average max price
    FROM (
        SELECT 
             AVG(UnitPrice) AS average_price
        FROM track
        GROUP BY GenreId 
     ) AS genre_averages
);

"""
df = pd.read_sql_query(query, engine)
df

query = """

WITH country_invoice_total AS (
    SELECT 
         c.Country,
         COALESCE(SUM(i.Total),0) AS total_invoice_amount
    FROM invoice AS i
    JOIN customer AS c
      ON i.CustomerId = c.CustomerId
    GROUP BY c.Country
)
SELECT 
     Country,
     total_invoice_amount
FROM country_invoice_total
WHERE total_invoice_amount > 150
ORDER BY total_invoice_amount DESC ;

"""
df = pd.read_sql_query(query, engine)
df

query = """

SELECT 
     Name AS track_name,
     ROUND(Milliseconds/(60 * 1000), 1) AS track_length_minutes 
FROM track
WHERE Milliseconds > (
    SELECT AVG(Milliseconds)
    FROM track
)
ORDER BY track_length_minutes DESC

"""
df = pd.read_sql_query(query, engine)
df

query = """

WITH track_counts AS (   
    SELECT 
         ar.ArtistId,
         ar.Name AS artist_name,
         COUNT(tr.TrackId) as track_count
    FROM artist AS ar
    LEFT JOIN album AS al 
      ON ar.ArtistId = al.ArtistId
    JOIN track AS tr
      ON al.AlbumId = tr.AlbumId
    GROUP BY ar.ArtistId, ar.Name
)
SELECT 
    ArtistId,
    artist_name,
    track_count,
    DENSE_RANK() OVER(ORDER BY track_count DESC, artist_name) AS rank_by_track_counts
FROM track_counts
ORDER BY track_count DESC

"""
df = pd.read_sql_query(query, engine)
df

query = """

SELECT DISTINCT
    TRIM(c.FirstName || ' ' || c.LastName) AS customer_name
FROM customer AS c
JOIN invoice AS i
  ON c.CustomerId = i.CustomerId
WHERE i.Total > (SELECT AVG(Total) FROM invoice)
ORDER BY customer_name

"""
df = pd.read_sql_query(query, engine)
df.head(10) # showing just the top 10 customer_names

# subquery version

query = """
  SELECT *
  FROM (
    SELECT
      g.Name AS genre_name,
      t.Name AS track_name,
      COUNT(il.Quantity) AS track_invoice_count,
      DENSE_RANK() OVER(PARTITION BY g.Name ORDER BY COUNT(il.Quantity) DESC) AS track_rank_by_genre
    FROM genre AS g
    JOIN track AS t
      ON g.GenreId = t.GenreId
    JOIN invoiceline AS il 
      ON t.TrackId = il.TrackId
    GROUP BY g.Name, t.Name
) AS ranked_tracks
WHERE track_rank_by_genre <= 3

"""
df = pd.read_sql_query(query, engine)
df

# CTE version
query = """

WITH top_tracks_by_genre AS(
  SELECT
     g.Name AS genre_name,
     t.Name AS track_name,
     COUNT(il.Quantity) AS track_invoice_count,
     DENSE_RANK() OVER(PARTITION BY g.Name ORDER BY COUNT(il.Quantity) DESC) AS genre_track_rank
  FROM genre AS g
  JOIN track AS t
    ON g.GenreId = t.GenreId
  JOIN invoiceline AS il 
    ON t.TrackId = il.TrackId
  GROUP BY g.Name, t.Name
)
SELECT 
     genre_name,
     track_name,
     track_invoice_count,
     genre_track_rank
FROM top_tracks_by_genre
WHERE genre_track_rank <= 3

"""
df = pd.read_sql_query(query, engine)
df

	customer_name	InvoiceId	InvoiceDate	Total
0	Astrid Gruber	78	2021-12-08 00:00:00	1.98
1	Astrid Gruber	89	2022-01-18 00:00:00	18.86
2	Astrid Gruber	144	2022-09-18 00:00:00	8.91
3	Astrid Gruber	273	2024-04-24 00:00:00	1.98
4	Astrid Gruber	296	2024-07-27 00:00:00	3.96
...	...	...	...	...
65	Victor Stevens	190	2023-04-18 00:00:00	1.98
66	Victor Stevens	201	2023-05-29 00:00:00	18.86
67	Victor Stevens	256	2024-01-27 00:00:00	8.91
68	Victor Stevens	385	2025-09-02 00:00:00	1.98
69	Victor Stevens	408	2025-12-05 00:00:00	3.96

	genre_name	average_track_price
0	Science Fiction	1.99
1	TV Shows	1.99
2	Sci Fi & Fantasy	1.99
3	Drama	1.99
4	Comedy	1.99

	Country	total_invoice_amount
0	USA	523.06
1	Canada	303.96
2	France	195.10
3	Brazil	190.10
4	Germany	156.48

	track_name	track_length_minutes
0	Occupation / Precipice	88.0
1	Through a Looking Glass	84.0
2	Battlestar Galactica, Pt. 1	49.0
3	Battlestar Galactica, Pt. 2	49.0
4	The Man With Nine Lives	49.0
...	...	...
489	Concerto for Clarinet in A Major, K. 622: II. ...	6.0
490	Karelia Suite, Op.11: 2. Ballade (Tempo Di Men...	6.0
491	Wellington's Victory or the Battle Symphony, O...	6.0
492	Rehab (Hot Chip Remix)	6.0
493	You Sent Me Flying / Cherry	6.0

	ArtistId	artist_name	track_count	rank_by_track_counts
0	90	Iron Maiden	213	1
1	150	U2	135	2
2	22	Led Zeppelin	114	3
3	50	Metallica	112	4
4	58	Deep Purple	92	5
...	...	...	...	...
199	259	The 12 Cellists of The Berlin Philharmonic	1	200
200	231	Ton Koopman	1	201
201	211	Wilhelm Kempff	1	202
202	255	Yehudi Menuhin	1	203
203	212	Yo-Yo Ma	1	204

SQL Examples: Common Table Expressions (CTEs) and Subqueries¶

Setup¶

Query Examples¶

1) Create a CTE for top 10 customers by revenue and join it to get their invoices.¶

2) Using a subquery, find the genres whose tracks are, on average, are the most expensive.¶

3) Write a CTE to compute total invoice amount per country, then select countries with revenue > 150.¶

4) Find tracks longer than the average track length (subquery in WHERE).¶

5) Use a CTE to rank artists by number of tracks.¶

6) Find customers whose invoices exceed the average invoice total.¶

7) List the top 3 most popular tracks per genre using a subquery.¶

	customer_name
0	Aaron Mitchell
1	Alexandre Rocha
2	Astrid Gruber
3	Bjørn Hansen
4	Camille Bernard
5	Daan Peeters
6	Dan Miller
7	Diego Gutiérrez
8	Dominique Lefebvre
9	Eduardo Martins

	genre_name	track_name	track_invoice_count	track_rank_by_genre
0	Alternative	Until We Fall	1	1
1	Alternative	Sound of a Gun	1	1
2	Alternative	Slowness	1	1
3	Alternative	Show Me How to Live (Live at the Quart Festival)	1	1
4	Alternative	Shape of Things to Come	1	1
...	...	...	...	...
1321	World	Din Din Wo (Little Child)	1	1
1322	World	Demorou!	1	1
1323	World	Cafezinho	1	1
1324	World	Aos Leões	1	1
1325	World	A Moça e a Chuva	1	1