Database Design & Optimization Patterns
Comprehensive database design, optimization, and performance patterns for SQL and NoSQL databases.
SQL Database Design Patterns
Schema Design Best Practices
Normalization (3NF)
-- Properly normalized schema
CREATE TABLE users (
id SERIAL PRIMARY KEY,
email VARCHAR(255) UNIQUE NOT NULL,
username VARCHAR(50) UNIQUE NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE user_profiles (
user_id INTEGER PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
first_name VARCHAR(100),
last_name VARCHAR(100),
bio TEXT,
avatar_url VARCHAR(500)
);
CREATE TABLE posts (
id SERIAL PRIMARY KEY,
user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
title VARCHAR(255) NOT NULL,
content TEXT,
published_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE tags (
id SERIAL PRIMARY KEY,
name VARCHAR(50) UNIQUE NOT NULL,
slug VARCHAR(50) UNIQUE NOT NULL
);
CREATE TABLE post_tags (
post_id INTEGER REFERENCES posts(id) ON DELETE CASCADE,
tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
PRIMARY KEY (post_id, tag_id)
);
Denormalization for Performance
-- Denormalized for read performance
CREATE TABLE post_view (
id SERIAL PRIMARY KEY,
user_id INTEGER NOT NULL,
username VARCHAR(50) NOT NULL,
user_avatar VARCHAR(500),
post_id INTEGER NOT NULL,
post_title VARCHAR(255) NOT NULL,
post_content TEXT,
post_published_at TIMESTAMP,
tags TEXT[], -- Array of tag names
comment_count INTEGER DEFAULT 0,
like_count INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Materialized view for complex aggregations
CREATE MATERIALIZED VIEW user_statistics AS
SELECT
u.id,
u.username,
COUNT(DISTINCT p.id) as post_count,
COUNT(DISTINCT c.id) as comment_count,
COUNT(DISTINCT l.id) as like_count,
MAX(p.created_at) as last_post_at
FROM users u
LEFT JOIN posts p ON u.id = p.user_id
LEFT JOIN comments c ON u.id = c.user_id
LEFT JOIN likes l ON u.id = l.user_id
GROUP BY u.id, u.username;
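-- REFRESH ... CONCURRENTLY (below) requires at least one unique index on the view; for example:
CREATE UNIQUE INDEX idx_user_statistics_id ON user_statistics(id);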
-- Refresh materialized view
REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics;
Indexing Strategies
B-Tree Indexes (Default)
-- Single column index
CREATE INDEX idx_posts_user_id ON posts(user_id);
-- Composite index (order matters!)
CREATE INDEX idx_posts_user_published ON posts(user_id, published_at DESC);
-- Partial index (for specific conditions)
CREATE INDEX idx_posts_published ON posts(published_at)
WHERE published_at IS NOT NULL;
-- Unique index
CREATE UNIQUE INDEX idx_users_email_lower ON users(LOWER(email));
Specialized Indexes
-- GIN index for full-text search
CREATE INDEX idx_posts_content_fts ON posts
USING GIN(to_tsvector('english', content));
-- Search using full-text index
SELECT * FROM posts
WHERE to_tsvector('english', content) @@ to_tsquery('english', 'database & optimization');
-- JSONB GIN index
CREATE TABLE settings (
user_id INTEGER PRIMARY KEY,
preferences JSONB NOT NULL DEFAULT '{}'
);
CREATE INDEX idx_settings_preferences ON settings USING GIN(preferences);
-- Query JSONB efficiently
SELECT * FROM settings
WHERE preferences @> '{"theme": "dark"}';
-- GiST index for geometric and range types
CREATE INDEX idx_events_date_range ON events USING GIST(date_range);
-- Hash index (PostgreSQL 10+, for equality only)
CREATE INDEX idx_users_uuid ON users USING HASH(uuid);
Index Maintenance
-- Analyze index usage
SELECT
schemaname,
relname AS table_name,
indexrelname AS index_name,
idx_scan,
idx_tup_read,
idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan ASC;
-- Find unused indexes (verify they don't back a PRIMARY KEY or UNIQUE constraint before dropping)
SELECT
schemaname,
relname AS table_name,
indexrelname AS index_name
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND indexrelname NOT LIKE 'pg_toast%';
-- Rebuild bloated indexes without blocking writes (CONCURRENTLY requires PostgreSQL 12+)
REINDEX INDEX CONCURRENTLY idx_posts_user_id;
REINDEX TABLE CONCURRENTLY posts;
Query Optimization
EXPLAIN ANALYZE
-- Analyze query execution plan
EXPLAIN ANALYZE
SELECT
u.username,
p.title,
COUNT(c.id) as comment_count
FROM users u
JOIN posts p ON u.id = p.user_id
LEFT JOIN comments c ON p.id = c.post_id
WHERE p.published_at > NOW() - INTERVAL '30 days'
GROUP BY u.username, p.title
ORDER BY comment_count DESC
LIMIT 10;
-- Key metrics to look for:
-- - Seq Scan vs Index Scan
-- - Nested Loop vs Hash Join vs Merge Join
-- - Actual vs estimated row counts (large mismatches suggest stale statistics)
-- - Buffers (shared hit ratio)
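-- To include buffer statistics (shared hit vs read) in the plan, enable the BUFFERS option, e.g.:
EXPLAIN (ANALYZE, BUFFERS)
SELECT p.title
FROM posts p
WHERE p.published_at > NOW() - INTERVAL '30 days';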
Avoiding N+1 Queries
-- Bad: N+1 query problem
-- Query 1: Get all posts
SELECT * FROM posts LIMIT 10;
-- Query 2-11: Get author for each post (N queries)
SELECT * FROM users WHERE id = ?;
-- Good: Use JOIN to fetch in one query
SELECT
p.*,
u.username,
u.email
FROM posts p
JOIN users u ON p.user_id = u.id
LIMIT 10;
-- Good: Use a CTE to batch-fetch the authors of one page of posts
WITH recent_posts AS (
SELECT id, user_id FROM posts LIMIT 10
)
SELECT u.* FROM users u
WHERE u.id IN (SELECT user_id FROM recent_posts);
Query Optimization Techniques
-- Use EXISTS instead of COUNT when checking existence
-- Bad
SELECT * FROM users u
WHERE (SELECT COUNT(*) FROM posts WHERE user_id = u.id) > 0;
-- Good
SELECT * FROM users u
WHERE EXISTS (SELECT 1 FROM posts WHERE user_id = u.id);
-- Use DISTINCT ON for getting first row per group (PostgreSQL)
SELECT DISTINCT ON (user_id)
user_id,
created_at,
content
FROM posts
ORDER BY user_id, created_at DESC;
-- Use window functions instead of correlated subqueries
-- Get each user's latest post (a window alias can't be filtered in WHERE, so wrap it in a subquery)
SELECT user_id, title, created_at
FROM (
SELECT
user_id,
title,
created_at,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) as rn
FROM posts
) ranked
WHERE rn = 1;
-- Batch updates instead of row-by-row
-- Bad
UPDATE posts SET view_count = view_count + 1 WHERE id = ?; -- Called N times
-- Good
UPDATE posts
SET view_count = view_count + v.increment
FROM (VALUES (1, 5), (2, 3), (3, 10)) AS v(id, increment)
WHERE posts.id = v.id;
Connection Pooling
Node.js (pg pool)
import { Pool } from 'pg';
const pool = new Pool({
host: 'localhost',
port: 5432,
database: 'myapp',
user: 'postgres',
password: 'password',
max: 20, // Maximum pool size
idleTimeoutMillis: 30000,
connectionTimeoutMillis: 2000,
});
// Use pool for queries
async function getUserById(id: number) {
const client = await pool.connect();
try {
const result = await client.query('SELECT * FROM users WHERE id = $1', [id]);
return result.rows[0];
} finally {
client.release(); // Always release back to pool
}
}
// Or use pool.query directly (handles acquire/release)
async function getUsers() {
const result = await pool.query('SELECT * FROM users LIMIT 100');
return result.rows;
}
// Transaction with pool
async function transferFunds(fromId: number, toId: number, amount: number) {
const client = await pool.connect();
try {
await client.query('BEGIN');
await client.query(
'UPDATE accounts SET balance = balance - $1 WHERE user_id = $2',
[amount, fromId]
);
await client.query(
'UPDATE accounts SET balance = balance + $1 WHERE user_id = $2',
[amount, toId]
);
await client.query('COMMIT');
} catch (error) {
await client.query('ROLLBACK');
throw error;
} finally {
client.release();
}
}
Python (SQLAlchemy)
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import QueuePool
# Create engine with connection pool
engine = create_engine(
'postgresql://user:password@localhost/dbname',
poolclass=QueuePool,
pool_size=10,
max_overflow=20,
pool_pre_ping=True, # Verify connections before using
pool_recycle=3600, # Recycle connections after 1 hour
)
Session = sessionmaker(bind=engine)
# Use session
def get_user(user_id: int):
session = Session()
try:
user = session.query(User).filter(User.id == user_id).first()
return user
finally:
session.close()
# Context manager for automatic cleanup
from contextlib import contextmanager
@contextmanager
def get_db_session():
session = Session()
try:
yield session
session.commit()
except Exception:
session.rollback()
raise
finally:
session.close()
# Usage
with get_db_session() as session:
user = session.query(User).filter(User.id == 1).first()
user.name = "Updated Name"
Database Migration Patterns
Migrations with TypeORM (Node.js)
// migrations/1234567890-CreateUsers.ts
import { MigrationInterface, QueryRunner, Table, TableIndex } from 'typeorm';
export class CreateUsers1234567890 implements MigrationInterface {
public async up(queryRunner: QueryRunner): Promise<void> {
await queryRunner.createTable(
new Table({
name: 'users',
columns: [
{
name: 'id',
type: 'int',
isPrimary: true,
isGenerated: true,
generationStrategy: 'increment',
},
{
name: 'email',
type: 'varchar',
length: '255',
isUnique: true,
},
{
name: 'created_at',
type: 'timestamp',
default: 'CURRENT_TIMESTAMP',
},
],
})
);
await queryRunner.createIndex(
'users',
new TableIndex({
name: 'idx_users_email',
columnNames: ['email'],
})
);
}
public async down(queryRunner: QueryRunner): Promise<void> {
await queryRunner.dropTable('users');
}
}
Alembic Migrations (Python)
# alembic/versions/001_create_users.py
from alembic import op
import sqlalchemy as sa
def upgrade():
op.create_table(
'users',
sa.Column('id', sa.Integer(), primary_key=True),
sa.Column('email', sa.String(255), unique=True, nullable=False),
sa.Column('username', sa.String(50), unique=True, nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
)
op.create_index('idx_users_email', 'users', ['email'])
def downgrade():
op.drop_index('idx_users_email', table_name='users')
op.drop_table('users')
Zero-Downtime Migration Strategies
-- Adding a NOT NULL column safely
-- Step 1: Add column as nullable
ALTER TABLE users ADD COLUMN phone VARCHAR(20);
-- Step 2: Backfill data in batches
UPDATE users SET phone = '000-000-0000' WHERE phone IS NULL;
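-- For large tables, backfill in limited batches to keep row locks short
-- (the batch size here is illustrative); repeat until 0 rows are updated:
UPDATE users
SET phone = '000-000-0000'
WHERE id IN (SELECT id FROM users WHERE phone IS NULL LIMIT 10000);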
-- Step 3: Add NOT NULL constraint
ALTER TABLE users ALTER COLUMN phone SET NOT NULL;
-- Renaming a column safely
-- Step 1: Add new column
ALTER TABLE users ADD COLUMN full_name VARCHAR(200);
-- Step 2: Dual-write to both columns in application code (or use a temporary sync trigger; see the sketch after this example)
-- Step 3: Backfill data
UPDATE users SET full_name = name WHERE full_name IS NULL;
-- Step 4: Switch reads to new column in application
-- Step 5: Drop old column
ALTER TABLE users DROP COLUMN name;
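As an alternative to dual-writing in application code (step 2), a temporary trigger can mirror writes from the old column into the new one while the old code path is still deployed; a minimal PostgreSQL sketch (function and trigger names are illustrative):
CREATE OR REPLACE FUNCTION sync_full_name() RETURNS trigger AS $$
BEGIN
NEW.full_name := NEW.name;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER trg_sync_full_name
BEFORE INSERT OR UPDATE ON users
FOR EACH ROW EXECUTE FUNCTION sync_full_name();
-- Drop the trigger before switching writes to full_name (and before dropping the old column)
DROP TRIGGER trg_sync_full_name ON users;
DROP FUNCTION sync_full_name();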
NoSQL Database Patterns
MongoDB Schema Design
Embedding vs Referencing
// Embedding (One-to-Few)
{
_id: ObjectId("..."),
username: "johndoe",
email: "john@example.com",
addresses: [
{
type: "home",
street: "123 Main St",
city: "New York",
zip: "10001"
},
{
type: "work",
street: "456 Office Blvd",
city: "New York",
zip: "10002"
}
]
}
// Referencing (One-to-Many or Many-to-Many)
// Users collection
{
_id: ObjectId("user1"),
username: "johndoe",
email: "john@example.com"
}
// Posts collection
{
_id: ObjectId("post1"),
user_id: ObjectId("user1"),
title: "My Post",
content: "...",
created_at: ISODate("2024-01-01")
}
// Extended Reference Pattern (Denormalization)
{
_id: ObjectId("post1"),
user: {
_id: ObjectId("user1"),
username: "johndoe",
avatar: "https://..."
},
title: "My Post",
content: "..."
}
Compound Indexes
// Create compound index
db.posts.createIndex({ user_id: 1, created_at: -1 });
// Index with unique constraint
db.users.createIndex({ email: 1 }, { unique: true });
// Partial index
db.orders.createIndex(
{ status: 1, created_at: -1 },
{ partialFilterExpression: { status: { $in: ["pending", "processing"] } } }
);
// Text index for full-text search
db.articles.createIndex({ title: "text", content: "text" });
// Geospatial index
db.locations.createIndex({ coordinates: "2dsphere" });
Aggregation Pipeline
// Complex aggregation example
db.orders.aggregate([
// Stage 1: Match recent orders
{
$match: {
created_at: { $gte: new Date("2024-01-01") },
status: "completed"
}
},
// Stage 2: Lookup user data
{
$lookup: {
from: "users",
localField: "user_id",
foreignField: "_id",
as: "user"
}
},
// Stage 3: Unwind user array
{ $unwind: "$user" },
// Stage 4: Group by user and calculate totals
{
$group: {
_id: "$user._id",
username: { $first: "$user.username" },
total_orders: { $sum: 1 },
total_revenue: { $sum: "$total_amount" },
avg_order_value: { $avg: "$total_amount" }
}
},
// Stage 5: Sort by revenue
{ $sort: { total_revenue: -1 } },
// Stage 6: Limit results
{ $limit: 10 }
]);
// Use $facet for multiple aggregations in one query
db.products.aggregate([
{
$facet: {
categoryCounts: [
{ $group: { _id: "$category", count: { $sum: 1 } } }
],
priceRanges: [
{
$bucket: {
groupBy: "$price",
boundaries: [0, 25, 50, 100, 500],
default: "500+",
output: { count: { $sum: 1 } }
}
}
],
topRated: [
{ $sort: { rating: -1 } },
{ $limit: 5 }
]
}
}
]);
Redis Patterns
Caching Strategy
import Redis from 'ioredis';
import { randomUUID } from 'node:crypto';
const redis = new Redis({
host: 'localhost',
port: 6379,
retryStrategy: (times) => Math.min(times * 50, 2000),
});
// Cache-aside pattern
async function getUser(userId: string) {
const cacheKey = `user:${userId}`;
// Try cache first
const cached = await redis.get(cacheKey);
if (cached) {
return JSON.parse(cached);
}
// Cache miss - fetch from database
const user = await db.users.findById(userId);
// Store in cache with TTL
await redis.setex(cacheKey, 3600, JSON.stringify(user));
return user;
}
// Invalidate cache on update
async function updateUser(userId: string, data: UserData) {
const user = await db.users.update(userId, data);
// Invalidate cache
await redis.del(`user:${userId}`);
return user;
}
// Rate limiting with Redis
async function checkRateLimit(userId: string, limit: number, window: number) {
const key = `ratelimit:${userId}`;
const current = await redis.incr(key);
if (current === 1) {
await redis.expire(key, window);
}
return current <= limit;
}
// Usage
const allowed = await checkRateLimit('user123', 100, 60); // 100 requests per minute
if (!allowed) {
throw new Error('Rate limit exceeded');
}
// Distributed locking
async function withLock<T>(
lockKey: string,
ttl: number,
fn: () => Promise<T>
): Promise<T> {
const lockValue = randomUUID();
const acquired = await redis.set(lockKey, lockValue, 'EX', ttl, 'NX');
if (!acquired) {
throw new Error('Could not acquire lock');
}
try {
return await fn();
} finally {
// Release lock only if we still own it
const script = `
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
`;
await redis.eval(script, 1, lockKey, lockValue);
}
}
// Pub/Sub pattern
const publisher = new Redis();
const subscriber = new Redis();
subscriber.subscribe('notifications', (err, count) => {
console.log(`Subscribed to ${count} channels`);
});
subscriber.on('message', (channel, message) => {
console.log(`Received ${message} from ${channel}`);
});
publisher.publish('notifications', JSON.stringify({
type: 'new_message',
userId: '123',
content: 'Hello!'
}));
Database Performance Best Practices
1. Use Connection Pooling
Always use connection pools to avoid per-request connection setup overhead and to cap the number of open connections.
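A quick way to see how many connections the application is actually holding (assuming PostgreSQL):
SELECT state, COUNT(*)
FROM pg_stat_activity
WHERE datname = current_database()
GROUP BY state;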
2. Index Strategically
- Index foreign keys and columns used in WHERE, JOIN, ORDER BY
- Avoid over-indexing (every index adds write overhead; the size check after this list shows how much disk your indexes consume)
- Use composite indexes for multi-column queries
- Monitor index usage and remove unused ones
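A rough check of how much disk (and write amplification) indexes add per table, assuming PostgreSQL:
SELECT relname AS table_name,
pg_size_pretty(pg_indexes_size(relid)) AS total_index_size
FROM pg_stat_user_tables
ORDER BY pg_indexes_size(relid) DESC
LIMIT 10;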
3. Optimize Queries
- Use EXPLAIN to analyze query plans
- Avoid SELECT * - fetch only needed columns
- Use keyset pagination for large result sets (see the sketch after this list)
- Batch operations when possible
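OFFSET pagination rescans every skipped row, so deep pages get slower; keyset pagination stays fast by filtering on the last row seen. A sketch against the posts table above (cursor values are illustrative):
-- First page
SELECT id, title, created_at
FROM posts
ORDER BY created_at DESC, id DESC
LIMIT 20;
-- Next page: use the last (created_at, id) from the previous page as the cursor
SELECT id, title, created_at
FROM posts
WHERE (created_at, id) < ('2024-06-01 12:00:00', 1042)
ORDER BY created_at DESC, id DESC
LIMIT 20;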
4. Cache Frequently Accessed Data
- Use Redis or Memcached for hot data
- Implement cache invalidation strategy
- Consider read replicas for read-heavy workloads
5. Partition Large Tables
-- Range partitioning by date
CREATE TABLE events (
id SERIAL,
event_type VARCHAR(50),
data JSONB,
created_at TIMESTAMP
) PARTITION BY RANGE (created_at);
CREATE TABLE events_2024_q1 PARTITION OF events
FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');
CREATE TABLE events_2024_q2 PARTITION OF events
FOR VALUES FROM ('2024-04-01') TO ('2024-07-01');
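A default partition catches rows that fall outside the declared ranges, and an index created on the parent table cascades to every partition (PostgreSQL 11+):
CREATE TABLE events_default PARTITION OF events DEFAULT;
CREATE INDEX idx_events_created_at ON events (created_at);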
6. Monitor and Analyze
- Track slow queries (e.g. with pg_stat_statements; see the sketch after this list)
- Monitor connection pool usage
- Analyze query performance trends
- Set up alerts for anomalies
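With the pg_stat_statements extension enabled (an assumption; it must be listed in shared_preload_libraries), the slowest queries can be pulled directly:
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- PostgreSQL 13+ column names; older versions use total_time / mean_time
SELECT query,
calls,
ROUND(total_exec_time::numeric, 1) AS total_ms,
ROUND(mean_exec_time::numeric, 1) AS mean_ms
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 10;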
7. Use Appropriate Data Types
-- Good
CREATE TABLE products (
id SERIAL PRIMARY KEY,
price NUMERIC(10,2), -- Exact decimal for money
created_at TIMESTAMP WITH TIME ZONE
);
-- Bad
CREATE TABLE products (
id VARCHAR(255) PRIMARY KEY, -- Wasteful for numeric IDs
price FLOAT, -- Floating point for money (precision issues)
created_at VARCHAR(50) -- String for dates
);
8. Implement Proper Backup Strategy
- Regular automated backups
- Test restore procedures
- Use point-in-time recovery when possible
- Replicate to multiple regions for disaster recovery