from pathlib import Path
from IPython.display import Image, display
import pandas as pd

# project root
repo_root = Path().resolve().parent  

# database path relative to project root
DB_PATH = repo_root / "data" / "Chinook_Sqlite.sqlite"

assert DB_PATH.exists(), f"Database file not found: {DB_PATH}"

# SQLAlchemy engine
from sqlalchemy import create_engine
engine = create_engine(f"sqlite:///{DB_PATH}")

# image path relative to repo root
image_path = repo_root / "input" / "ChinookDBStructure.png"

# check it exists
assert image_path.exists(), f"Image file not found: {image_path}"

# display
display(Image(filename=str(image_path), width=600, height=400))

# aggregating over CustomerId with sum to calculate each customer's total_revenue
# using DESNSE_RANK to rank customers by their total revenue from invoices

query = """

SELECT 
     c.CustomerId,
     TRIM(c.FirstName || ' ' || c.LastName) AS customer_name,
     COALESCE(SUM(i.Total),0) AS total_revenue,
     DENSE_RANK() OVER(ORDER BY COALESCE(SUM(i.Total), 0) DESC) AS total_revenue_rank
FROM customer AS c
LEFT JOIN invoice AS i
  ON c.CustomerId = i.CustomerId
GROUP BY c.CustomerId, c.FirstName, c.LastName
ORDER BY total_revenue DESC ;

"""
df = pd.read_sql_query(query, engine)
df.head(10) # showing top ten

# CTE version for strict SQL environements

query = """

WITH customer_invoice_totals AS(
SELECT 
     c.CustomerId,
     TRIM(c.FirstName || ' ' || c.LastName) AS customer_name,
     COALESCE(SUM(i.Total),0) AS total_revenue   
FROM customer AS c
LEFT JOIN invoice AS i
  ON c.CustomerId = i.CustomerId
GROUP BY c.CustomerId, c.FirstName, c.LastName
)
SELECT 
     CustomerId,
     customer_name,
     total_revenue,
     DENSE_RANK() OVER(ORDER BY total_revenue DESC) AS total_revenue_rank
FROM customer_invoice_totals
ORDER BY total_revenue DESC ;

"""
df = pd.read_sql_query(query, engine)
df.head(10) # showing top ten

# cte with revenue by year followed with a cumulative sum windows function

query = """

WITH customer_revenue_by_year AS(
    SELECT
         TRIM(c.FirstName || ' ' || c.LastName) AS customer_name,
         CAST(strftime('%Y', i.InvoiceDate) AS INTEGER) AS year, 
         COALESCE(SUM(i.Total), 0) AS total_revenue
FROM customer AS c
JOIN invoice AS i
  ON c.CustomerId = i.CustomerId
GROUP BY strftime('%Y', i.InvoiceDate), c.FirstName, c.LastName
)
SELECT 
     customer_name,
     year,
     total_revenue,
     SUM(total_revenue) OVER(PARTITION BY customer_name ORDER BY year) AS cumulative_revenue
FROM customer_revenue_by_year
ORDER BY customer_name, year

"""
df = pd.read_sql_query(query, engine)
df.head(10) # showing top ten

query = """

SELECT 
     c.CustomerId,
     TRIM(c.FirstName || ' ' || c.LastName) AS customer_name,
     i.InvoiceDate,
     LAG(i.InvoiceDate) OVER(
       PARTITION BY c.CustomerId 
       ORDER BY i.InvoiceDate, i.InvoiceId
     ) AS previous_invoice_date
FROM customer AS c
    JOIN invoice AS i
      ON c.CustomerId = i.CustomerId
ORDER BY c.CustomerId, c.FirstName, c.LastName, i.InvoiceDate ;

"""
df = pd.read_sql_query(query, engine)
df.head(10) # showing top ten

	CustomerId	customer_name	total_revenue	total_revenue_rank
0	6	Helena Holý	49.62	1
1	26	Richard Cunningham	47.62	2
2	57	Luis Rojas	46.62	3
3	45	Ladislav Kovács	45.62	4
4	46	Hugh O'Reilly	45.62	4
5	24	Frank Ralston	43.62	5
6	28	Julia Barnett	43.62	5
7	37	Fynn Zimmermann	43.62	5
8	7	Astrid Gruber	42.62	6
9	25	Victor Stevens	42.62	6

	CustomerId	customer_name	total_revenue	total_revenue_rank
0	6	Helena Holý	49.62	1
1	26	Richard Cunningham	47.62	2
2	57	Luis Rojas	46.62	3
3	45	Ladislav Kovács	45.62	4
4	46	Hugh O'Reilly	45.62	4
5	24	Frank Ralston	43.62	5
6	28	Julia Barnett	43.62	5
7	37	Fynn Zimmermann	43.62	5
8	7	Astrid Gruber	42.62	6
9	25	Victor Stevens	42.62	6

	customer_name	year	total_revenue	cumulative_revenue
0	Aaron Mitchell	2021	15.84	15.84
1	Aaron Mitchell	2022	8.91	24.75
2	Aaron Mitchell	2023	1.98	26.73
3	Aaron Mitchell	2024	9.90	36.63
4	Aaron Mitchell	2025	0.99	37.62
5	Alexandre Rocha	2021	15.84	15.84
6	Alexandre Rocha	2022	8.91	24.75
7	Alexandre Rocha	2024	11.88	36.63
8	Alexandre Rocha	2025	0.99	37.62
9	Astrid Gruber	2021	1.98	1.98

	CustomerId	customer_name	InvoiceDate	previous_invoice_date
0	1	Luís Gonçalves	2022-03-11 00:00:00	None
1	1	Luís Gonçalves	2022-06-13 00:00:00	2022-03-11 00:00:00
2	1	Luís Gonçalves	2022-09-15 00:00:00	2022-06-13 00:00:00
3	1	Luís Gonçalves	2023-05-06 00:00:00	2022-09-15 00:00:00
4	1	Luís Gonçalves	2024-10-27 00:00:00	2023-05-06 00:00:00
5	1	Luís Gonçalves	2024-12-07 00:00:00	2024-10-27 00:00:00
6	1	Luís Gonçalves	2025-08-07 00:00:00	2024-12-07 00:00:00
7	2	Leonie Köhler	2021-01-01 00:00:00	None
8	2	Leonie Köhler	2021-02-11 00:00:00	2021-01-01 00:00:00
9	2	Leonie Köhler	2021-10-12 00:00:00	2021-02-11 00:00:00

SQL Examples: Window Functions¶

Setup¶

Query Examples¶

1) Assign a rank to customers based on total revenue.¶

2) Compute cumulative revenue over time per customer.¶

3) Find the previous invoice date for each customer (LAG).¶

4) Show the next track each customer purchased (LEAD).¶

5) Compute running totals of revenue per month.¶

6) Rank tracks within each genre by price.¶

7) Show the percentile rank of customers by total purchase amount.¶