$ pip install duckdb pandas
import pandas as pd
import numpy as np
import duckdb
import time

# Generate a 10M-row synthetic transactions dataset
np.random.seed(42)
n = 10_000_000

df = pd.DataFrame({
    'transaction_id': range(n),
    'user_id': np.random.randint(1, 100000, n),
    'amount': np.round(np.random.exponential(scale=500, size=n), 2),
    'region': np.random.choice(['North', 'South', 'East', 'West', 'Central'], n),
    'category': np.random.choice(['Retail', 'BFSI', 'Healthcare', 'Tech', 'Logistics'], n),
    'is_fraud': np.random.choice([0, 1], n, p=[0.998, 0.002]),
    'timestamp': pd.date_range('2024-01-01', periods=n, freq='1s'),
})

print(f"Dataset size: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB")
# Dataset size: 0.78 GB
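Regenerating 10 million rows on every run gets tedious. A minimal sketch of persisting the frame once, assuming a Parquet engine such as pyarrow is installed (the file name transactions.parquet is illustrative):

# Optional: write the dataset to Parquet so later sessions can skip generation.
# Assumes a Parquet engine (e.g. pyarrow); the file name is arbitrary.
df.to_parquet('transactions.parquet', index=False)

# DuckDB can query a Parquet file directly by its path, with no load step.
print(duckdb.sql("SELECT COUNT(*) FROM 'transactions.parquet'").df())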
# Connect DuckDB to the DataFrame directly (zero-copy!)
con = duckdb.connect()
con.register('transactions', df)

start = time.time()
result = con.execute("""
    SELECT
        region,
        category,
        COUNT(*) AS total_transactions,
        SUM(amount) AS total_volume,
        AVG(amount) AS avg_transaction,
        SUM(CASE WHEN is_fraud = 1 THEN 1 ELSE 0 END) AS fraud_count,
        ROUND(
            SUM(CASE WHEN is_fraud = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 4
        ) AS fraud_rate_pct,
        PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY amount) AS p95_amount
    FROM transactions
    WHERE timestamp >= '2024-03-01'
    GROUP BY region, category
    ORDER BY total_volume DESC
""").df()
end = time.time()

print(f"✅ Query completed in {end - start:.2f} seconds")
print(result)
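Registering the DataFrame is not strictly required: the DuckDB Python client can also resolve a local variable by name through its replacement-scan mechanism. A minimal sketch, assuming df from the generation step is still in scope:

# Query the DataFrame by its Python variable name -- DuckDB's replacement
# scans find 'df' in the local scope, so no register() call is needed.
quick = duckdb.sql("""
    SELECT region, COUNT(*) AS txns, SUM(amount) AS volume
    FROM df
    GROUP BY region
    ORDER BY volume DESC
""").df()
print(quick)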
# Pandas equivalent (for comparison)
start = time.time()
pandas_result = (
    df[df['timestamp'] >= '2024-03-01']
    .groupby(['region', 'category'])
    .agg(
        total_transactions=('transaction_id', 'count'),
        total_volume=('amount', 'sum'),
        avg_transaction=('amount', 'mean'),
        fraud_count=('is_fraud', 'sum'),
    )
    .reset_index()
)
pandas_result['fraud_rate_pct'] = (
    pandas_result['fraud_count'] / pandas_result['total_transactions'] * 100
).round(4)
end = time.time()

print(f"Pandas: {end - start:.2f} seconds")
# Pandas: 248.7 seconds (4.1 minutes!)
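Before trusting a speed comparison, it is worth checking that both engines return the same numbers. A minimal cross-check, assuming result and pandas_result from the snippets above are in scope (p95_amount is skipped because the pandas version doesn't compute it):

# Align the two result frames on the group keys and compare shared metrics.
merged = result.merge(pandas_result, on=['region', 'category'],
                      suffixes=('_duck', '_pd'))
assert (merged['total_transactions_duck'] == merged['total_transactions_pd']).all()
assert np.allclose(merged['total_volume_duck'], merged['total_volume_pd'])
assert np.allclose(merged['fraud_count_duck'], merged['fraud_count_pd'])
print("DuckDB and pandas aggregates match")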
The benchmark dataset:
- 10,000,000 rows
- Fields: transaction_id, user_id, amount, region, category, timestamp, is_fraud

Where DuckDB shines:
- Fraud Detection: Scanning 10M+ daily transactions for anomaly patterns
- MTD/LMTD Reporting: Running time-intelligence queries on financial datasets (see the sketch after this list)
- ETL Pre-processing: Cleaning and transforming data before Power BI ingestion
- Ad-hoc Analysis: Replacing heavy Spark jobs for datasets under 500M rows

Where it doesn't fit:
- ❌ Multi-user concurrent writes → Use PostgreSQL
- ❌ 100GB+ datasets → Use Spark or BigQuery
- ❌ Real-time streaming → Use Kafka + Flink
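To make the MTD/LMTD item concrete, here is a sketch against the transactions view registered earlier. The as-of date 2024-04-15 is an assumption chosen to land inside the synthetic timestamp range; live reporting would derive it from CURRENT_DATE instead:

# Month-to-date vs. last-month-to-date volume at a fixed, illustrative
# as-of date (the synthetic timestamps span 2024-01-01 to late April 2024).
mtd_lmtd = con.execute("""
    SELECT
        SUM(amount) FILTER (
            WHERE timestamp >= DATE '2024-04-01'
              AND timestamp <  TIMESTAMP '2024-04-15 00:00:00'
        ) AS mtd_volume,
        SUM(amount) FILTER (
            WHERE timestamp >= DATE '2024-03-01'
              AND timestamp <  TIMESTAMP '2024-03-15 00:00:00'
        ) AS lmtd_volume
    FROM transactions
""").df()
print(mtd_lmtd)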