Database Design & Optimization Patterns
Comprehensive database design, optimization, and performance patterns for SQL and NoSQL databases.
SQL Database Design Patterns
Schema Design Best Practices
Normalization (3NF)
-- Properly normalized schema
CREATE TABLE users (
id SERIAL PRIMARY KEY,
email VARCHAR(255) UNIQUE NOT NULL,
username VARCHAR(50) UNIQUE NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE user_profiles (
user_id INTEGER PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
first_name VARCHAR(100),
last_name VARCHAR(100),
bio TEXT,
avatar_url VARCHAR(500)
);
CREATE TABLE posts (
id SERIAL PRIMARY KEY,
user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
title VARCHAR(255) NOT NULL,
content TEXT,
published_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE tags (
id SERIAL PRIMARY KEY,
name VARCHAR(50) UNIQUE NOT NULL,
slug VARCHAR(50) UNIQUE NOT NULL
);
CREATE TABLE post_tags (
post_id INTEGER REFERENCES posts(id) ON DELETE CASCADE,
tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
PRIMARY KEY (post_id, tag_id)
);
Denormalization for Performance
-- Denormalized for read performance
CREATE TABLE post_view (
id SERIAL PRIMARY KEY,
user_id INTEGER NOT NULL,
username VARCHAR(50) NOT NULL,
user_avatar VARCHAR(500),
post_id INTEGER NOT NULL,
post_title VARCHAR(255) NOT NULL,
post_content TEXT,
post_published_at TIMESTAMP,
tags TEXT[], -- Array of tag names
comment_count INTEGER DEFAULT 0,
like_count INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Materialized view for complex aggregations
CREATE MATERIALIZED VIEW user_statistics AS
SELECT
u.id,
u.username,
COUNT(DISTINCT p.id) as post_count,
COUNT(DISTINCT c.id) as comment_count,
COUNT(DISTINCT l.id) as like_count,
MAX(p.created_at) as last_post_at
FROM users u
LEFT JOIN posts p ON u.id = p.user_id
LEFT JOIN comments c ON u.id = c.user_id
LEFT JOIN likes l ON u.id = l.user_id
GROUP BY u.id, u.username;
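-- REFRESH ... CONCURRENTLY (below) requires at least one unique index on the view; for example:
CREATE UNIQUE INDEX idx_user_statistics_id ON user_statistics(id);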
-- Refresh materialized view
REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics;
Indexing Strategies
B-Tree Indexes (Default)
-- Single column index
CREATE INDEX idx_posts_user_id ON posts(user_id);
-- Composite index (order matters!)
CREATE INDEX idx_posts_user_published ON posts(user_id, published_at DESC);
-- Partial index (for specific conditions)
CREATE INDEX idx_posts_published ON posts(published_at)
WHERE published_at IS NOT NULL;
-- Unique index
CREATE UNIQUE INDEX idx_users_email_lower ON users(LOWER(email));
Specialized Indexes
-- GIN index for full-text search
CREATE INDEX idx_posts_content_fts ON posts
USING GIN(to_tsvector('english', content));
-- Search using full-text index
SELECT * FROM posts
WHERE to_tsvector('english', content) @@ to_tsquery('english', 'database & optimization');
-- JSONB GIN index
CREATE TABLE settings (
user_id INTEGER PRIMARY KEY,
preferences JSONB NOT NULL DEFAULT '{}'
);
CREATE INDEX idx_settings_preferences ON settings USING GIN(preferences);
-- Query JSONB efficiently
SELECT * FROM settings
WHERE preferences @> '{"theme": "dark"}';
-- GiST index for geometric and range types
CREATE INDEX idx_events_date_range ON events USING GIST(date_range);
-- Hash index (PostgreSQL 10+, for equality only)
CREATE INDEX idx_users_uuid ON users USING HASH(uuid);
Index Maintenance
-- Analyze index usage
SELECT
schemaname,
relname AS table_name,
indexrelname AS index_name,
idx_scan,
idx_tup_read,
idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan ASC;
-- Find unused indexes (verify they don't back a PRIMARY KEY or UNIQUE constraint before dropping)
SELECT
schemaname,
relname AS table_name,
indexrelname AS index_name
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND indexrelname NOT LIKE 'pg_toast%';
-- Rebuild bloated indexes without blocking writes (CONCURRENTLY requires PostgreSQL 12+)
REINDEX INDEX CONCURRENTLY idx_posts_user_id;
REINDEX TABLE CONCURRENTLY posts;
Query Optimization
EXPLAIN ANALYZE
-- Analyze query execution plan
EXPLAIN ANALYZE
SELECT
u.username,
p.title,
COUNT(c.id) as comment_count
FROM users u
JOIN posts p ON u.id = p.user_id
LEFT JOIN comments c ON p.id = c.post_id
WHERE p.published_at > NOW() - INTERVAL '30 days'
GROUP BY u.username, p.title
ORDER BY comment_count DESC
LIMIT 10;
-- Key metrics to look for:
-- - Seq Scan vs Index Scan
-- - Nested Loop vs Hash Join vs Merge Join
-- - Actual vs estimated row counts (large mismatches suggest stale statistics)
-- - Buffers (shared hit ratio)
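-- To include buffer statistics (shared hit vs read) in the plan, enable the BUFFERS option, e.g.:
EXPLAIN (ANALYZE, BUFFERS)
SELECT p.title
FROM posts p
WHERE p.published_at > NOW() - INTERVAL '30 days';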
Avoiding N+1 Queries
-- Bad: N+1 query problem
-- Query 1: Get all posts
SELECT * FROM posts LIMIT 10;
-- Query 2-11: Get author for each post (N queries)
SELECT * FROM users WHERE id = ?;
-- Good: Use JOIN to fetch in one query
SELECT
p.*,
u.username,
u.email
FROM posts p
JOIN users u ON p.user_id = u.id
LIMIT 10;
-- Good: Use a CTE to batch-fetch the authors of one page of posts
WITH recent_posts AS (
SELECT id, user_id FROM posts LIMIT 10
)
SELECT u.* FROM users u
WHERE u.id IN (SELECT user_id FROM recent_posts);
Query Optimization Techniques
-- Use EXISTS instead of COUNT when checking existence
-- Bad
SELECT * FROM users u
WHERE (SELECT COUNT(*) FROM posts WHERE user_id = u.id) > 0;
-- Good
SELECT * FROM users u
WHERE EXISTS (SELECT 1 FROM posts WHERE user_id = u.id);
-- Use DISTINCT ON for getting first row per group (PostgreSQL)
SELECT DISTINCT ON (user_id)
user_id,
created_at,
content
FROM posts
ORDER BY user_id, created_at DESC;
-- Use window functions instead of correlated subqueries
-- Get each user's latest post (a window alias can't be filtered in WHERE, so wrap it in a subquery)
SELECT user_id, title, created_at
FROM (
SELECT
user_id,
title,
created_at,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) as rn
FROM posts
) ranked
WHERE rn = 1;
-- Batch updates instead of row-by-row
-- Bad
UPDATE posts SET view_count = view_count + 1 WHERE id = ?; -- Called N times
-- Good
UPDATE posts
SET view_count = view_count + v.increment
FROM (VALUES (1, 5), (2, 3), (3, 10)) AS v(id, increment)
WHERE posts.id = v.id;
Connection Pooling
Node.js (pg pool)
import { Pool } from 'pg';
const pool = new Pool({
host: 'localhost',
port: 5432,
database: 'myapp',
user: 'postgres',
password: 'password',
max: 20, // Maximum pool size
idleTimeoutMillis: 30000,
connectionTimeoutMillis: 2000,
});
// Use pool for queries
async function getUserById(id: number) {
const client = await pool.connect();
try {
const result = await client.query('SELECT * FROM users WHERE id = $1', [id]);
return result.rows[0];
} finally {
client.release(); // Always release back to pool
}
}
// Or use pool.query directly (handles acquire/release)
async function getUsers() {
const result = await pool.query('SELECT * FROM users LIMIT 100');
return result.rows;
}
// Transaction with pool
async function transferFunds(fromId: number, toId: number, amount: number) {
const client = await pool.connect();
try {
await client.query('BEGIN');
await client.query(
'UPDATE accounts SET balance = balance - $1 WHERE user_id = $2',
[amount, fromId]
);
await client.query(
'UPDATE accounts SET balance = balance + $1 WHERE user_id = $2',
[amount, toId]
);
await client.query('COMMIT');
} catch (error) {
await client.query('ROLLBACK');
throw error;
} finally {
client.release();
}
}
Python (SQLAlchemy)
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import QueuePool
# Create engine with connection pool
engine = create_engine(
'postgresql://user:password@localhost/dbname',
poolclass=QueuePool,
pool_size=10,
max_overflow=20,
pool_pre_ping=True, # Verify connections before using
pool_recycle=3600, # Recycle connections after 1 hour
)
Session = sessionmaker(bind=engine)
# Use session
def get_user(user_id: int):
session = Session()
try:
user = session.query(User).filter(User.id == user_id).first()
return user
finally:
session.close()
# Context manager for automatic cleanup
from contextlib import contextmanager
@contextmanager
def get_db_session():
session = Session()
try:
yield session
session.commit()
except Exception:
session.rollback()
raise
finally:
session.close()
# Usage
with get_db_session() as session:
user = session.query(User).filter(User.id == 1).first()
user.name = "Updated Name"
Database Migration Patterns
Migrations with TypeORM (Node.js)
// migrations/1234567890-CreateUsers.ts
import { MigrationInterface, QueryRunner, Table, TableIndex } from 'typeorm';
export class CreateUsers1234567890 implements MigrationInterface {
public async up(queryRunner: QueryRunner): Promise<void> {
await queryRunner.createTable(
new Table({
name: 'users',
columns: [
{
name: 'id',
type: 'int',
isPrimary: true,
isGenerated: true,
generationStrategy: 'increment',
},
{
name: 'email',
type: 'varchar',
length: '255',
isUnique: true,
},
{
name: 'created_at',
type: 'timestamp',
default: 'CURRENT_TIMESTAMP',
},
],
})
);
await queryRunner.createIndex(
'users',
new TableIndex({
name: 'idx_users_email',
columnNames: ['email'],
})
);
}
public async down(queryRunner: QueryRunner): Promise<void> {
await queryRunner.dropTable('users');
}
}
Alembic Migrations (Python)
# alembic/versions/001_create_users.py
from alembic import op
import sqlalchemy as sa
def upgrade():
op.create_table(
'users',
sa.Column('id', sa.Integer(), primary_key=True),
sa.Column('email', sa.String(255), unique=True, nullable=False),
sa.Column('username', sa.String(50), unique=True, nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
)
op.create_index('idx_users_email', 'users', ['email'])
def downgrade():
op.drop_index('idx_users_email', table_name='users')
op.drop_table('users')
Zero-Downtime Migration Strategies
-- Adding a NOT NULL column safely
-- Step 1: Add column as nullable
ALTER TABLE users ADD COLUMN phone VARCHAR(20);
-- Step 2: Backfill data in batches
UPDATE users SET phone = '000-000-0000' WHERE phone IS NULL;
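-- For large tables, backfill in limited batches to keep row locks short
-- (the batch size here is illustrative); repeat until 0 rows are updated:
UPDATE users
SET phone = '000-000-0000'
WHERE id IN (SELECT id FROM users WHERE phone IS NULL LIMIT 10000);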
-- Step 3: Add NOT NULL constraint
ALTER TABLE users ALTER COLUMN phone SET NOT NULL;
-- Renaming a column safely
-- Step 1: Add new column
ALTER TABLE users ADD COLUMN full_name VARCHAR(200);
-- Step 2: Dual-write to both columns in application code (or use a temporary sync trigger; see the sketch after this example)
-- Step 3: Backfill data
UPDATE users SET full_name = name WHERE full_name IS NULL;
-- Step 4: Switch reads to new column in application
-- Step 5: Drop old column
ALTER TABLE users DROP COLUMN name;
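As an alternative to dual-writing in application code (step 2), a temporary trigger can mirror writes from the old column into the new one while the old code path is still deployed; a minimal PostgreSQL sketch (function and trigger names are illustrative):
CREATE OR REPLACE FUNCTION sync_full_name() RETURNS trigger AS $$
BEGIN
NEW.full_name := NEW.name;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER trg_sync_full_name
BEFORE INSERT OR UPDATE ON users
FOR EACH ROW EXECUTE FUNCTION sync_full_name();
-- Drop the trigger before switching writes to full_name (and before dropping the old column)
DROP TRIGGER trg_sync_full_name ON users;
DROP FUNCTION sync_full_name();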
NoSQL Database Patterns
MongoDB Schema Design
Embedding vs Referencing
// Embedding (One-to-Few)
{
_id: ObjectId("..."),
username: "johndoe",
email: "john@example.com",
addresses: [
{
type: "home",
street: "123 Main St",
city: "New York",
zip: "10001"
},
{
type: "work",
street: "456 Office Blvd",
city: "New York",
zip: "10002"
}
]
}
// Referencing (One-to-Many or Many-to-Many)
// Users collection
{
_id: ObjectId("user1"),
username: "johndoe",
email: "john@example.com"
}
// Posts collection
{
_id: ObjectId("post1"),
user_id: ObjectId("user1"),
title: "My Post",
content: "...",
created_at: ISODate("2024-01-01")
}
// Extended Reference Pattern (Denormalization)
{
_id: ObjectId("post1"),
user: {
_id: ObjectId("user1"),
username: "johndoe",
avatar: "https://..."
},
title: "My Post",
content: "..."
}
Compound Indexes
// Create compound index
db.posts.createIndex({ user_id: 1, created_at: -1 });
// Index with unique constraint
db.users.createIndex({ email: 1 }, { unique: true });
// Partial index
db.orders.createIndex(
{ status: 1, created_at: -1 },
{ partialFilterExpression: { status: { $in: ["pending", "processing"] } } }
);
// Text index for full-text search
db.articles.createIndex({ title: "text", content: "text" });
// Geospatial index
db.locations.createIndex({ coordinates: "2dsphere" });
Aggregation Pipeline
// Complex aggregation example
db.orders.aggregate([
// Stage 1: Match recent orders
{
$match: {
created_at: { $gte: new Date("2024-01-01") },
status: "completed"
}
},
// Stage 2: Lookup user data
{
$lookup: {
from: "users",
localField: "user_id",
foreignField: "_id",
as: "user"
}
},
// Stage 3: Unwind user array
{ $unwind: "$user" },
// Stage 4: Group by user and calculate totals
{
$group: {
_id: "$user._id",
username: { $first: "$user.username" },
total_orders: { $sum: 1 },
total_revenue: { $sum: "$total_amount" },
avg_order_value: { $avg: "$total_amount" }
}
},
// Stage 5: Sort by revenue
{ $sort: { total_revenue: -1 } },
// Stage 6: Limit results
{ $limit: 10 }
]);
// Use $facet for multiple aggregations in one query
db.products.aggregate([
{
$facet: {
categoryCounts: [
{ $group: { _id: "$category", count: { $sum: 1 } } }
],
priceRanges: [
{
$bucket: {
groupBy: "$price",
boundaries: [0, 25, 50, 100, 500],
default: "500+",
output: { count: { $sum: 1 } }
}
}
],
topRated: [
{ $sort: { rating: -1 } },
{ $limit: 5 }
]
}
}
]);
Redis Patterns
Caching Strategy
import Redis from 'ioredis';
import { randomUUID } from 'node:crypto';
const redis = new Redis({
host: 'localhost',
port: 6379,
retryStrategy: (times) => Math.min(times * 50, 2000),
});
// Cache-aside pattern
async function getUser(userId: string) {
const cacheKey = `user:${userId}`;
// Try cache first
const cached = await redis.get(cacheKey);
if (cached) {
return JSON.parse(cached);
}
// Cache miss - fetch from database
const user = await db.users.findById(userId);
// Store in cache with TTL
await redis.setex(cacheKey, 3600, JSON.stringify(user));
return user;
}
// Invalidate cache on update
async function updateUser(userId: string, data: UserData) {
const user = await db.users.update(userId, data);
// Invalidate cache
await redis.del(`user:${userId}`);
return user;
}
// Rate limiting with Redis
async function checkRateLimit(userId: string, limit: number, window: number) {
const key = `ratelimit:${userId}`;
const current = await redis.incr(key);
if (current === 1) {
await redis.expire(key, window);
}
return current <= limit;
}
// Usage
const allowed = await checkRateLimit('user123', 100, 60); // 100 requests per minute
if (!allowed) {
throw new Error('Rate limit exceeded');
}
// Distributed locking
async function withLock<T>(
lockKey: string,
ttl: number,
fn: () => Promise<T>
): Promise<T> {
const lockValue = randomUUID();
const acquired = await redis.set(lockKey, lockValue, 'EX', ttl, 'NX');
if (!acquired) {
throw new Error('Could not acquire lock');
}
try {
return await fn();
} finally {
// Release lock only if we still own it
const script = `
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
`;
await redis.eval(script, 1, lockKey, lockValue);
}
}
// Pub/Sub pattern
const publisher = new Redis();
const subscriber = new Redis();
subscriber.subscribe('notifications', (err, count) => {
console.log(`Subscribed to ${count} channels`);
});
subscriber.on('message', (channel, message) => {
console.log(`Received ${message} from ${channel}`);
});
publisher.publish('notifications', JSON.stringify({
type: 'new_message',
userId: '123',
content: 'Hello!'
}));
Database Performance Best Practices
1. Use Connection Pooling
Always use connection pools to avoid per-request connection setup overhead and to cap the number of open connections.
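A quick way to see how many connections the application is actually holding (assuming PostgreSQL):
SELECT state, COUNT(*)
FROM pg_stat_activity
WHERE datname = current_database()
GROUP BY state;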
2. Index Strategically
- Index foreign keys and columns used in WHERE, JOIN, ORDER BY
- Avoid over-indexing (every index adds write overhead; the size check after this list shows how much disk your indexes consume)
- Use composite indexes for multi-column queries
- Monitor index usage and remove unused ones
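A rough check of how much disk (and write amplification) indexes add per table, assuming PostgreSQL:
SELECT relname AS table_name,
pg_size_pretty(pg_indexes_size(relid)) AS total_index_size
FROM pg_stat_user_tables
ORDER BY pg_indexes_size(relid) DESC
LIMIT 10;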
3. Optimize Queries
- Use EXPLAIN to analyze query plans
- Avoid SELECT * - fetch only needed columns
- Use keyset pagination for large result sets (see the sketch after this list)
- Batch operations when possible
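OFFSET pagination rescans every skipped row, so deep pages get slower; keyset pagination stays fast by filtering on the last row seen. A sketch against the posts table above (cursor values are illustrative):
-- First page
SELECT id, title, created_at
FROM posts
ORDER BY created_at DESC, id DESC
LIMIT 20;
-- Next page: use the last (created_at, id) from the previous page as the cursor
SELECT id, title, created_at
FROM posts
WHERE (created_at, id) < ('2024-06-01 12:00:00', 1042)
ORDER BY created_at DESC, id DESC
LIMIT 20;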
4. Cache Frequently Accessed Data
- Use Redis or Memcached for hot data
- Implement cache invalidation strategy
- Consider read replicas for read-heavy workloads
5. Partition Large Tables
-- Range partitioning by date
CREATE TABLE events (
id SERIAL,
event_type VARCHAR(50),
data JSONB,
created_at TIMESTAMP
) PARTITION BY RANGE (created_at);
CREATE TABLE events_2024_q1 PARTITION OF events
FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');
CREATE TABLE events_2024_q2 PARTITION OF events
FOR VALUES FROM ('2024-04-01') TO ('2024-07-01');
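A default partition catches rows that fall outside the declared ranges, and an index created on the parent table cascades to every partition (PostgreSQL 11+):
CREATE TABLE events_default PARTITION OF events DEFAULT;
CREATE INDEX idx_events_created_at ON events (created_at);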
6. Monitor and Analyze
- Track slow queries (e.g. with pg_stat_statements; see the sketch after this list)
- Monitor connection pool usage
- Analyze query performance trends
- Set up alerts for anomalies
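With the pg_stat_statements extension enabled (an assumption; it must be listed in shared_preload_libraries), the slowest queries can be pulled directly:
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- PostgreSQL 13+ column names; older versions use total_time / mean_time
SELECT query,
calls,
ROUND(total_exec_time::numeric, 1) AS total_ms,
ROUND(mean_exec_time::numeric, 1) AS mean_ms
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 10;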
7. Use Appropriate Data Types
-- Good
CREATE TABLE products (
id SERIAL PRIMARY KEY,
price NUMERIC(10,2), -- Exact decimal for money
created_at TIMESTAMP WITH TIME ZONE
);
-- Bad
CREATE TABLE products (
id VARCHAR(255) PRIMARY KEY, -- Wasteful for numeric IDs
price FLOAT, -- Floating point for money (precision issues)
created_at VARCHAR(50) -- String for dates
);
8. Implement Proper Backup Strategy
- Regular automated backups
- Test restore procedures
- Use point-in-time recovery when possible
- Replicate to multiple regions for disaster recovery