Code for SQL Analytics at Lightning Speed: Getting Started with DuckDB in Python Tutorial


View on Github

duckdb_tutorial.py

"""
DuckDB + Python — Complete Tutorial Code
=========================================
SQL Analytics at Lightning Speed with DuckDB

Requirements: pip install duckdb pandas polars pyarrow numpy

This script covers:
  1. Basic DuckDB connection and SQL queries
  2. Querying CSV files directly (no import needed!)
  3. DuckDB vs Pandas performance comparison
  4. Querying Parquet files
  5. Window functions for ranking
  6. Hybrid workflow: DuckDB → Pandas → Polars
  7. Persistent databases (.duckdb files)
  8. Exporting results to CSV and Parquet
"""
import duckdb
import pandas as pd
import polars as pl
import numpy as np
import time
import os

print(f"DuckDB version: {duckdb.__version__}")

# ═══════════════════════════════════════════════════════════════
# 1. GENERATE SAMPLE DATA
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("GENERATING SAMPLE DATA (500K rows)")
print("=" * 60)

np.random.seed(42)
n = 500_000

regions = ["North", "South", "East", "West"]
products = ["Widget A", "Widget B", "Gadget X", "Gadget Y", "Doohickey Z"]
categories = ["Electronics", "Home", "Office", "Electronics", "Office"]

df_sales = pd.DataFrame({
    "order_id": range(1, n + 1),
    "region": np.random.choice(regions, n),
    "product": np.random.choice(products, n),
    "category": np.random.choice(categories, n),
    "quantity": np.random.randint(1, 20, n),
    "unit_price": np.round(np.random.uniform(5, 500, n), 2),
    "order_date": pd.date_range("2025-01-01", periods=n, freq="90s"),
})

df_sales["total_amount"] = df_sales["quantity"] * df_sales["unit_price"]
df_sales["customer_id"] = np.random.randint(1000, 5000, n)

csv_path = "sales_data.csv"
parquet_path = "sales_data.parquet"
df_sales.to_csv(csv_path, index=False)
df_sales.to_parquet(parquet_path, index=False)

csv_size = os.path.getsize(csv_path) / (1024 * 1024)
pq_size = os.path.getsize(parquet_path) / (1024 * 1024)
print(f"CSV saved:  {csv_size:.1f} MB ({n:,} rows)")
print(f"Parquet saved: {pq_size:.1f} MB ({n:,} rows)")

# ═══════════════════════════════════════════════════════════════
# 2. BASIC DUCKDB: IN-MEMORY CONNECTION
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("BASIC DUCKDB: Creating tables & querying")
print("=" * 60)

conn = duckdb.connect()  # in-memory database

conn.execute("""
    CREATE TABLE employees (
        id INTEGER,
        name VARCHAR,
        department VARCHAR,
        salary DECIMAL(10, 2)
    )
""")

conn.execute("""
    INSERT INTO employees VALUES
    (1, 'Alice', 'Engineering', 95000),
    (2, 'Bob', 'Engineering', 87000),
    (3, 'Charlie', 'Marketing', 72000),
    (4, 'Diana', 'Marketing', 78000),
    (5, 'Eve', 'Engineering', 105000),
    (6, 'Frank', 'Sales', 65000),
    (7, 'Grace', 'Sales', 71000)
""")

print("\nAll employees (ordered by salary):")
print(conn.execute("SELECT * FROM employees ORDER BY salary DESC").fetchdf())

print("\nAverage salary by department:")
print(conn.execute("""
    SELECT department,
           ROUND(AVG(salary), 2) AS avg_salary,
           COUNT(*) AS headcount
    FROM employees
    GROUP BY department
    ORDER BY avg_salary DESC
""").fetchdf())

# ═══════════════════════════════════════════════════════════════
# 3. QUERY CSV DIRECTLY — THE KILLER FEATURE
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("QUERYING CSV DIRECTLY (No pd.read_csv() needed!)")
print("=" * 60)

t0 = time.time()
result = conn.execute(f"""
    SELECT
        region,
        category,
        COUNT(*) AS num_orders,
        ROUND(SUM(total_amount), 2) AS revenue,
        ROUND(AVG(total_amount), 2) AS avg_order_value
    FROM read_csv('{csv_path}', AUTO_DETECT=TRUE)
    GROUP BY region, category
    ORDER BY revenue DESC
    LIMIT 10
""").fetchdf()
duckdb_time = time.time() - t0
print(f"DuckDB direct CSV query: {duckdb_time:.3f}s")
print(result)

# ═══════════════════════════════════════════════════════════════
# 4. DUCKDB vs PANDAS — PERFORMANCE SHOWDOWN
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("DUCKDB vs PANDAS — Same query, who wins?")
print("=" * 60)

t0 = time.time()
df = pd.read_csv(csv_path)
pandas_result = (df.groupby(["region", "category"])
                 .agg(
                     num_orders=("order_id", "count"),
                     revenue=("total_amount", "sum"),
                     avg_order_value=("total_amount", "mean")
                 )
                 .sort_values("revenue", ascending=False)
                 .head(10)
                 .round(2))
pandas_time = time.time() - t0

print(f"Pandas read_csv + groupby: {pandas_time:.3f}s")
print(f"DuckDB direct query:        {duckdb_time:.3f}s")
print(f"Speedup: {pandas_time/duckdb_time:.1f}x faster with DuckDB!")

# ═══════════════════════════════════════════════════════════════
# 5. QUERY PARQUET FILES
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("QUERYING PARQUET FILES")
print("=" * 60)

t0 = time.time()
result = conn.execute(f"""
    SELECT
        product,
        ROUND(SUM(total_amount), 2) AS total_revenue,
        COUNT(*) AS units_sold,
        ROUND(AVG(quantity), 1) AS avg_qty_per_order
    FROM read_parquet('{parquet_path}')
    GROUP BY product
    ORDER BY total_revenue DESC
""").fetchdf()
pq_time = time.time() - t0
print(f"Parquet query: {pq_time:.3f}s")
print(result)

# ═══════════════════════════════════════════════════════════════
# 6. WINDOW FUNCTIONS — Top 3 products per region
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("WINDOW FUNCTIONS — Top 3 Products per Region")
print("=" * 60)

result = conn.execute(f"""
    WITH ranked AS (
        SELECT
            region,
            product,
            ROUND(SUM(total_amount), 2) AS revenue,
            ROW_NUMBER() OVER (
                PARTITION BY region
                ORDER BY SUM(total_amount) DESC
            ) AS rank
        FROM read_parquet('{parquet_path}')
        GROUP BY region, product
    )
    SELECT * FROM ranked WHERE rank <= 3
    ORDER BY region, rank
""").fetchdf()
print(result)

# ═══════════════════════════════════════════════════════════════
# 7. HYBRID WORKFLOW: DuckDB → Pandas → Polars
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("HYBRID WORKFLOW: DuckDB → Pandas → Polars")
print("=" * 60)

# Step 1: DuckDB does the heavy aggregation
print("Step 1: DuckDB aggregates 500K rows → summary...")
t0 = time.time()
summary = conn.execute(f"""
    SELECT
        region,
        category,
        DATE_TRUNC('month', order_date) AS month,
        COUNT(*) AS order_count,
        ROUND(SUM(total_amount), 2) AS monthly_revenue
    FROM read_parquet('{parquet_path}')
    GROUP BY region, category, DATE_TRUNC('month', order_date)
""").fetchdf()
print(f"   Done in {time.time() - t0:.3f}s → {len(summary)} rows")

# Step 2: Pandas for pivot table
print("\nStep 2: Pandas pivot table...")
t0 = time.time()
pivot = summary.pivot_table(
    index="month",
    columns="region",
    values="monthly_revenue",
    aggfunc="sum"
).round(2)
print(f"   Done in {time.time() - t0:.3f}s")
print(pivot.head(6))

# Step 3: Polars for final polish
print("\nStep 3: Polars for final formatting...")
t0 = time.time()
pl_df = pl.from_pandas(summary)
top_month = (pl_df
             .group_by("region")
             .agg(pl.col("monthly_revenue").max().alias("best_month_revenue"))
             .sort("best_month_revenue", descending=True))
print(f"   Done in {time.time() - t0:.3f}s")
print(top_month)

# ═══════════════════════════════════════════════════════════════
# 8. PERSISTENT DATABASE
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("PERSISTENT DATABASE — Save to .duckdb file")
print("=" * 60)

db_path = "analytics.duckdb"
persistent_conn = duckdb.connect(db_path)

persistent_conn.execute(f"""
    CREATE OR REPLACE TABLE sales AS
    SELECT * FROM read_parquet('{parquet_path}')
""")

row_count = persistent_conn.execute("SELECT COUNT(*) FROM sales").fetchone()[0]
db_size = os.path.getsize(db_path) / (1024 * 1024)
print(f"Database file: {db_path} ({db_size:.1f} MB)")
print(f"Sales table: {row_count:,} rows persisted")

print("\nTop 5 customers by lifetime value:")
print(persistent_conn.execute("""
    SELECT
        customer_id,
        COUNT(*) AS orders,
        ROUND(SUM(total_amount), 2) AS lifetime_value
    FROM sales
    GROUP BY customer_id
    ORDER BY lifetime_value DESC
    LIMIT 5
""").fetchdf())

persistent_conn.close()

# ═══════════════════════════════════════════════════════════════
# 9. EXPORT RESULTS
# ═══════════════════════════════════════════════════════════════
print("\n" + "=" * 60)
print("EXPORTING RESULTS")
print("=" * 60)

conn.execute(f"""
    COPY (
        SELECT region, product, ROUND(SUM(total_amount), 2) AS revenue
        FROM read_parquet('{parquet_path}')
        GROUP BY region, product
        ORDER BY revenue DESC
    ) TO 'revenue_summary.csv' (HEADER, DELIMITER ',')
""")

conn.execute(f"""
    COPY (
        SELECT region, product, ROUND(SUM(total_amount), 2) AS revenue
        FROM read_parquet('{parquet_path}')
        GROUP BY region, product
        ORDER BY revenue DESC
    ) TO 'revenue_summary.parquet' (FORMAT PARQUET)
""")

print("Exported: revenue_summary.csv")
print("Exported: revenue_summary.parquet")

exported = pd.read_csv("revenue_summary.csv")
print(f"\nExported CSV preview ({len(exported)} rows):")
print(exported.head())

# ═══════════════════════════════════════════════════════════════
# CLEANUP
# ═══════════════════════════════════════════════════════════════
conn.close()

print("\n" + "=" * 60)
print("DONE! All examples completed successfully.")
print("=" * 60)