Lukas Schwarzlmüller

Get started with DuckDB

Get started with DuckDB

DuckDB is a fast, in-process analytics DB that queries CSV/Parquet/data frames with near-zero setup. We’ll use uv for clean, reproducible runs.

1) Project setup
mkdir duckdb-starter && cd duckdb-starter
uv init
uv python pin 3.11 
uv add duckdb pyarrow pandas
2) First query
import duckdb
con = duckdb.connect()  # in-memory
con.execute("""
CREATE TABLE sales AS
SELECT * FROM (VALUES
 (1,'2025-01-01','book',3,12.99),
 (2,'2025-01-02','pen',5,1.49),
 (3,'2025-01-02','book',2,12.99),
 (4,'2025-01-03','stapler',1,6.49)
) AS t(id, day, item, qty, price);
""")
print(con.execute("""
SELECT day, item, SUM(qty) AS total_qty, SUM(qty*price) AS revenue
FROM sales GROUP BY 1,2 ORDER BY 1,2
""").fetchdf())
Run:
uv run python hello_duckdb.py
3) Query files directly (no ETL)
cat > data.csv << 'EOF'
id,city,temp_c,ts
1,Vienna,19.5,2025-04-01
2,Vienna,21.2,2025-04-02
3,Graz,17.9,2025-04-01
4,Graz,18.1,2025-04-02
EOF
# query_files.py
import duckdb
con = duckdb.connect()
q = """
SELECT city, AVG(temp_c) AS avg_temp, COUNT(*) AS readings
FROM read_csv_auto('data.csv')
GROUP BY city ORDER BY avg_temp DESC
"""
print(con.execute(q).fetchdf())
Run:
uv run python query_files.py
4) Use DataFrames
# df_roundtrip.py
import duckdb, pandas as pd
df = pd.DataFrame({"product":["book","pen","pen"],"qty":[3,5,1],"price":[12.99,1.49,1.49]})
con = duckdb.connect()
con.register("df_view", df)
print(con.execute("""
SELECT product, SUM(qty) AS total_qty, ROUND(SUM(qty*price),2) AS revenue
FROM df_view GROUP BY 1 ORDER BY revenue DESC
""").fetchdf())
5) Persist or export
# persist_db.py
import duckdb
con = duckdb.connect("analytics.duckdb")
con.execute("CREATE TABLE events AS SELECT * FROM (VALUES (1,'signup'),(2,'purchase')) t(id, kind)")
con.close()
# export examples
import duckdb; con = duckdb.connect()
con.execute("CREATE TABLE t AS SELECT * FROM read_csv_auto('data.csv')")
con.execute("COPY (SELECT city, AVG(temp_c) avg_temp FROM t GROUP BY city) TO 'city_avgs.parquet' (FORMAT PARQUET)")
con.execute("COPY t TO 'all_readings.csv' (HEADER)")