750 lines
19 KiB
Markdown
750 lines
19 KiB
Markdown
# Data Analysis Skill
|
||
|
||
**Expert patterns for exploratory data analysis, statistical analysis, and visualization**
|
||
|
||
## Core Principles
|
||
|
||
1. **Data Quality First**: Always assess quality before analysis
|
||
2. **Statistical Rigor**: Use appropriate tests and validate assumptions
|
||
3. **Visual Storytelling**: Visualizations should communicate insights clearly
|
||
4. **Reproducibility**: Document methodology for repeatability
|
||
|
||
---
|
||
|
||
## Exploratory Data Analysis (EDA) Framework
|
||
|
||
### 1. Data Loading and Initial Inspection
|
||
|
||
```python
|
||
import pandas as pd
|
||
import numpy as np
|
||
|
||
# Load data
|
||
df = pd.read_csv('data.csv')
|
||
|
||
# Initial inspection
|
||
print(f"Shape: {df.shape}")
|
||
print(f"\nData Types:\n{df.dtypes}")
|
||
print(f"\nFirst 5 rows:\n{df.head()}")
|
||
print(f"\nBasic Stats:\n{df.describe()}")
|
||
```
|
||
|
||
### 2. Data Quality Assessment
|
||
|
||
**Missing Values**:
|
||
```python
|
||
# Check missing values
|
||
missing = df.isnull().sum()
|
||
missing_pct = (missing / len(df)) * 100
|
||
missing_df = pd.DataFrame({
|
||
'Missing Count': missing,
|
||
'Percentage': missing_pct
|
||
}).sort_values('Percentage', ascending=False)
|
||
|
||
print(missing_df[missing_df['Percentage'] > 0])
|
||
```
|
||
|
||
**Duplicates**:
|
||
```python
|
||
# Check duplicates
|
||
duplicates = df.duplicated().sum()
|
||
print(f"Duplicate rows: {duplicates}")
|
||
|
||
# Remove duplicates if needed
|
||
df_clean = df.drop_duplicates()
|
||
```
|
||
|
||
**Data Type Validation**:
|
||
```python
|
||
# Validate expected types
|
||
expected_types = {
|
||
'id': 'int64',
|
||
'name': 'object',
|
||
'date': 'datetime64',
|
||
'amount': 'float64'
|
||
}
|
||
|
||
for col, expected in expected_types.items():
|
||
actual = df[col].dtype
|
||
if actual != expected:
|
||
print(f"Warning: {col} is {actual}, expected {expected}")
|
||
```
|
||
|
||
### 3. Statistical Summary
|
||
|
||
**Descriptive Statistics**:
|
||
```python
|
||
# Numeric columns
|
||
numeric_summary = df.describe().T
|
||
numeric_summary['missing'] = df.isnull().sum()
|
||
numeric_summary['unique'] = df.nunique()
|
||
|
||
# Categorical columns
|
||
categorical_cols = df.select_dtypes(include=['object']).columns
|
||
for col in categorical_cols:
|
||
print(f"\n{col} value counts:")
|
||
print(df[col].value_counts().head(10))
|
||
```
|
||
|
||
**Distribution Analysis**:
|
||
```python
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
|
||
# Distribution plots for numeric columns
|
||
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
||
|
||
fig, axes = plt.subplots(len(numeric_cols), 2, figsize=(12, 4*len(numeric_cols)))
|
||
|
||
for idx, col in enumerate(numeric_cols):
|
||
# Histogram with KDE
|
||
sns.histplot(df[col], kde=True, ax=axes[idx, 0])
|
||
axes[idx, 0].set_title(f'Distribution of {col}')
|
||
|
||
# Box plot
|
||
sns.boxplot(x=df[col], ax=axes[idx, 1])
|
||
axes[idx, 1].set_title(f'Box Plot of {col}')
|
||
|
||
plt.tight_layout()
|
||
plt.savefig('distributions.png', dpi=300, bbox_inches='tight')
|
||
```
|
||
|
||
### 4. Correlation Analysis
|
||
|
||
```python
|
||
# Correlation matrix
|
||
correlation_matrix = df[numeric_cols].corr()
|
||
|
||
# Heatmap
|
||
plt.figure(figsize=(10, 8))
|
||
sns.heatmap(
|
||
correlation_matrix,
|
||
annot=True,
|
||
fmt='.2f',
|
||
cmap='coolwarm',
|
||
center=0,
|
||
square=True,
|
||
linewidths=1
|
||
)
|
||
plt.title('Correlation Matrix')
|
||
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
|
||
|
||
# Strong correlations
|
||
threshold = 0.7
|
||
strong_corr = correlation_matrix[(correlation_matrix > threshold) | (correlation_matrix < -threshold)]
|
||
strong_corr = strong_corr[strong_corr != 1.0].dropna(how='all').dropna(axis=1, how='all')
|
||
print("Strong correlations:\n", strong_corr)
|
||
```
|
||
|
||
### 5. Outlier Detection
|
||
|
||
**IQR Method**:
|
||
```python
|
||
def detect_outliers_iqr(df, column):
|
||
Q1 = df[column].quantile(0.25)
|
||
Q3 = df[column].quantile(0.75)
|
||
IQR = Q3 - Q1
|
||
|
||
lower_bound = Q1 - 1.5 * IQR
|
||
upper_bound = Q3 + 1.5 * IQR
|
||
|
||
outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
|
||
|
||
return outliers, lower_bound, upper_bound
|
||
|
||
# Apply to all numeric columns
|
||
for col in numeric_cols:
|
||
outliers, lower, upper = detect_outliers_iqr(df, col)
|
||
print(f"\n{col}:")
|
||
print(f" Outliers: {len(outliers)} ({len(outliers)/len(df)*100:.1f}%)")
|
||
print(f" Range: [{lower:.2f}, {upper:.2f}]")
|
||
```
|
||
|
||
**Z-Score Method**:
|
||
```python
|
||
from scipy import stats
|
||
|
||
def detect_outliers_zscore(df, column, threshold=3):
|
||
z_scores = np.abs(stats.zscore(df[column].dropna()))
|
||
outliers = df[z_scores > threshold]
|
||
return outliers
|
||
|
||
# Apply
|
||
for col in numeric_cols:
|
||
outliers = detect_outliers_zscore(df, col)
|
||
print(f"{col}: {len(outliers)} outliers (Z-score > 3)")
|
||
```
|
||
|
||
### 6. Time Series Analysis (if applicable)
|
||
|
||
```python
|
||
# Convert to datetime
|
||
df['date'] = pd.to_datetime(df['date'])
|
||
df = df.set_index('date')
|
||
|
||
# Resample and aggregate
|
||
daily = df.resample('D').sum()
|
||
weekly = df.resample('W').sum()
|
||
monthly = df.resample('M').sum()
|
||
|
||
# Plot trends
|
||
fig, axes = plt.subplots(3, 1, figsize=(12, 10))
|
||
|
||
daily['metric'].plot(ax=axes[0], title='Daily Trend')
|
||
weekly['metric'].plot(ax=axes[1], title='Weekly Trend')
|
||
monthly['metric'].plot(ax=axes[2], title='Monthly Trend')
|
||
|
||
plt.tight_layout()
|
||
plt.savefig('time_series_trends.png', dpi=300)
|
||
|
||
# Seasonality detection
|
||
from statsmodels.tsa.seasonal import seasonal_decompose
|
||
|
||
decomposition = seasonal_decompose(df['metric'], model='additive', period=7)
|
||
fig = decomposition.plot()
|
||
fig.set_size_inches(12, 8)
|
||
plt.savefig('seasonality.png', dpi=300)
|
||
```
|
||
|
||
---
|
||
|
||
## Visualization Best Practices
|
||
|
||
### Chart Type Selection
|
||
|
||
| Data Type | Best Chart | When to Use |
|
||
|-----------|-----------|-------------|
|
||
| **Single variable distribution** | Histogram, Box plot | Understand data spread |
|
||
| **Two variable relationship** | Scatter plot | Check correlation |
|
||
| **Time series** | Line chart | Show trends over time |
|
||
| **Comparison across categories** | Bar chart | Compare groups |
|
||
| **Part-to-whole** | Pie chart, Stacked bar | Show composition |
|
||
| **Multiple variables** | Pair plot, Heatmap | Explore relationships |
|
||
|
||
### Matplotlib/Seaborn Patterns
|
||
|
||
**Professional Styling**:
|
||
```python
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
|
||
# Set style
|
||
sns.set_style("whitegrid")
|
||
sns.set_context("notebook", font_scale=1.2)
|
||
|
||
# Color palette
|
||
colors = sns.color_palette("husl", 8)
|
||
|
||
# Figure size
|
||
plt.figure(figsize=(12, 6))
|
||
|
||
# Plot
|
||
sns.lineplot(data=df, x='date', y='value', hue='category')
|
||
|
||
# Customize
|
||
plt.title('Revenue Trend by Category', fontsize=16, fontweight='bold')
|
||
plt.xlabel('Date', fontsize=12)
|
||
plt.ylabel('Revenue ($)', fontsize=12)
|
||
plt.legend(title='Category', title_fontsize=12)
|
||
plt.grid(alpha=0.3)
|
||
|
||
# Save high quality
|
||
plt.savefig('chart.png', dpi=300, bbox_inches='tight', facecolor='white')
|
||
```
|
||
|
||
**Color Palette Guidelines**:
|
||
- Use colorblind-friendly palettes
|
||
- Limit to 5-6 colors
|
||
- Consistent across all charts
|
||
- Green = positive, Red = negative
|
||
|
||
**Labeling**:
|
||
- Clear, descriptive titles
|
||
- Axis labels with units
|
||
- Legend when multiple series
|
||
- Annotations for key points
|
||
|
||
---
|
||
|
||
## Statistical Testing
|
||
|
||
### Hypothesis Testing Framework
|
||
|
||
**1. Formulate Hypotheses**:
|
||
- Null hypothesis (H0): No effect/difference
|
||
- Alternative hypothesis (H1): There is an effect/difference
|
||
|
||
**2. Choose Test**:
|
||
|
||
| Scenario | Test | Function |
|
||
|----------|------|----------|
|
||
| Compare two means | t-test | `scipy.stats.ttest_ind()` |
|
||
| Compare >2 means | ANOVA | `scipy.stats.f_oneway()` |
|
||
| Compare proportions | Chi-square | `scipy.stats.chi2_contingency()` |
|
||
| Correlation | Pearson/Spearman | `scipy.stats.pearsonr()` |
|
||
| Normality | Shapiro-Wilk | `scipy.stats.shapiro()` |
|
||
|
||
**3. Execute Test**:
|
||
```python
|
||
from scipy import stats
|
||
|
||
# Example: Compare means of two groups
|
||
group_a = df[df['group'] == 'A']['metric']
|
||
group_b = df[df['group'] == 'B']['metric']
|
||
|
||
# Check normality first
|
||
_, p_a = stats.shapiro(group_a)
|
||
_, p_b = stats.shapiro(group_b)
|
||
|
||
if p_a > 0.05 and p_b > 0.05:
|
||
# Normal distribution - use t-test
|
||
t_stat, p_value = stats.ttest_ind(group_a, group_b)
|
||
test_used = "t-test"
|
||
else:
|
||
# Non-normal - use Mann-Whitney U test
|
||
t_stat, p_value = stats.mannwhitneyu(group_a, group_b)
|
||
test_used = "Mann-Whitney U"
|
||
|
||
print(f"Test: {test_used}")
|
||
print(f"Statistic: {t_stat:.4f}")
|
||
print(f"P-value: {p_value:.4f}")
|
||
print(f"Significant: {'Yes' if p_value < 0.05 else 'No'}")
|
||
```
|
||
|
||
**4. Interpret Results**:
|
||
- p < 0.05: Reject null hypothesis (significant)
|
||
- p ≥ 0.05: Fail to reject null hypothesis (not significant)
|
||
|
||
### Effect Size
|
||
|
||
Always report effect size with statistical significance:
|
||
|
||
```python
|
||
# Cohen's d for t-test
|
||
def cohens_d(group1, group2):
|
||
n1, n2 = len(group1), len(group2)
|
||
var1, var2 = group1.var(), group2.var()
|
||
pooled_std = np.sqrt(((n1-1)*var1 + (n2-1)*var2) / (n1+n2-2))
|
||
return (group1.mean() - group2.mean()) / pooled_std
|
||
|
||
d = cohens_d(group_a, group_b)
|
||
print(f"Effect size (Cohen's d): {d:.3f}")
|
||
print(f"Magnitude: {'Small' if abs(d) < 0.5 else 'Medium' if abs(d) < 0.8 else 'Large'}")
|
||
```
|
||
|
||
---
|
||
|
||
## Data Cleaning Patterns
|
||
|
||
### Handling Missing Values
|
||
|
||
**1. Analyze Pattern**:
|
||
```python
|
||
# Missing data pattern
|
||
import missingno as msno
|
||
|
||
msno.matrix(df)
|
||
plt.savefig('missing_pattern.png')
|
||
|
||
# Correlation of missingness
|
||
msno.heatmap(df)
|
||
plt.savefig('missing_correlation.png')
|
||
```
|
||
|
||
**2. Choose Strategy**:
|
||
|
||
| % Missing | Strategy | When to Use |
|
||
|-----------|----------|-------------|
|
||
| < 5% | Drop rows | MCAR (Missing Completely At Random) |
|
||
| 5-20% | Imputation | MAR (Missing At Random) |
|
||
| > 20% | Create indicator variable | MNAR (Missing Not At Random) |
|
||
|
||
**3. Implement**:
|
||
```python
|
||
# Drop if few missing
|
||
df_clean = df.dropna(subset=['critical_column'])
|
||
|
||
# Mean/Median imputation (numeric)
|
||
df['column'].fillna(df['column'].median(), inplace=True)
|
||
|
||
# Mode imputation (categorical)
|
||
df['category'].fillna(df['category'].mode()[0], inplace=True)
|
||
|
||
# Forward fill (time series)
|
||
df['value'].fillna(method='ffill', inplace=True)
|
||
|
||
# Sophisticated imputation
|
||
from sklearn.impute import KNNImputer
|
||
imputer = KNNImputer(n_neighbors=5)
|
||
df_imputed = pd.DataFrame(
|
||
imputer.fit_transform(df[numeric_cols]),
|
||
columns=numeric_cols
|
||
)
|
||
```
|
||
|
||
### Handling Outliers
|
||
|
||
**Decision Framework**:
|
||
1. **Investigate**: Are they errors or real extreme values?
|
||
2. **Document**: Explain handling decision
|
||
3. **Options**:
|
||
- Keep: If legitimate extreme values
|
||
- Remove: If data errors
|
||
- Cap: Winsorize to percentile
|
||
- Transform: Log transform to reduce impact
|
||
|
||
```python
|
||
# Winsorization (cap at percentiles)
|
||
from scipy.stats.mstats import winsorize
|
||
|
||
df['metric_winsorized'] = winsorize(df['metric'], limits=[0.05, 0.05])
|
||
|
||
# Log transformation
|
||
df['metric_log'] = np.log1p(df['metric']) # log1p handles zeros
|
||
```
|
||
|
||
---
|
||
|
||
## Reporting Template
|
||
|
||
```markdown
|
||
# Exploratory Data Analysis Report
|
||
|
||
**Dataset**: [name]
|
||
**Analysis Date**: [date]
|
||
**Analyst**: [name]
|
||
|
||
## Executive Summary
|
||
|
||
[2-3 sentences: What is the data? Key findings? Recommendations?]
|
||
|
||
## Data Overview
|
||
|
||
- **Rows**: [count]
|
||
- **Columns**: [count]
|
||
- **Time Period**: [start] to [end]
|
||
- **Data Source**: [source]
|
||
|
||
## Data Quality
|
||
|
||
### Missing Values
|
||
| Column | Missing Count | Percentage |
|
||
|--------|---------------|------------|
|
||
[list columns with >5% missing]
|
||
|
||
### Duplicates
|
||
- [count] duplicate rows ([percentage]%)
|
||
- Action: [removed/kept]
|
||
|
||
### Data Type Issues
|
||
[list any type mismatches or issues]
|
||
|
||
## Statistical Summary
|
||
|
||
[Table of descriptive statistics for key metrics]
|
||
|
||
## Key Findings
|
||
|
||
### 1. [Finding Name]
|
||
**Observation**: [What you found]
|
||
**Evidence**: [Supporting statistics]
|
||
**Visualization**: [Chart reference]
|
||
**Implication**: [What it means]
|
||
|
||
### 2. [Finding Name]
|
||
[Same structure]
|
||
|
||
### 3. [Finding Name]
|
||
[Same structure]
|
||
|
||
## Correlation Analysis
|
||
|
||
[Heatmap image]
|
||
|
||
**Strong Relationships**:
|
||
- [Variable 1] and [Variable 2]: r = [correlation], p < [p-value]
|
||
- [Interpretation]
|
||
|
||
## Outliers
|
||
|
||
| Column | Outlier Count | Percentage | Action |
|
||
|--------|---------------|------------|--------|
|
||
[list columns with outliers]
|
||
|
||
## Temporal Patterns (if applicable)
|
||
|
||
[Time series visualizations]
|
||
|
||
**Trends**:
|
||
- [Trend 1]
|
||
- [Trend 2]
|
||
|
||
**Seasonality**: [Yes/No - describe pattern]
|
||
|
||
## Recommendations
|
||
|
||
1. **Data Collection**: [Suggestions for improving data quality]
|
||
2. **Further Analysis**: [Suggested deep-dives]
|
||
3. **Action Items**: [Business recommendations]
|
||
|
||
## Next Steps
|
||
|
||
- [ ] [Action 1]
|
||
- [ ] [Action 2]
|
||
- [ ] [Action 3]
|
||
|
||
## Appendix
|
||
|
||
### Methodology
|
||
[Brief description of analysis approach]
|
||
|
||
### Tools Used
|
||
- Python [version]
|
||
- pandas [version]
|
||
- scipy [version]
|
||
- matplotlib [version]
|
||
```
|
||
|
||
---
|
||
|
||
## Quality Checklist
|
||
|
||
Before completing analysis:
|
||
|
||
- [ ] Data quality assessed (missing, duplicates, types)
|
||
- [ ] Descriptive statistics calculated
|
||
- [ ] Distributions visualized
|
||
- [ ] Outliers detected and explained
|
||
- [ ] Correlations analyzed
|
||
- [ ] Statistical tests appropriate for data
|
||
- [ ] Assumptions validated
|
||
- [ ] Visualizations clear and labeled
|
||
- [ ] Findings actionable
|
||
- [ ] Methodology documented
|
||
- [ ] Code reproducible
|
||
|
||
---
|
||
|
||
## MCP-Enhanced Data Access
|
||
|
||
### PostgreSQL MCP Integration
|
||
|
||
When PostgreSQL MCP is available, access data directly from databases without export/import cycles:
|
||
|
||
```typescript
|
||
// Runtime detection - no configuration needed
|
||
const hasPostgres = typeof mcp__postgres__query !== 'undefined';
|
||
|
||
if (hasPostgres) {
|
||
console.log("✓ Using PostgreSQL MCP for direct database access");
|
||
|
||
// Direct SQL execution - 10x faster than export/import
|
||
const result = await mcp__postgres__query({
|
||
sql: `
|
||
SELECT
|
||
date_trunc('day', created_at) as date,
|
||
COUNT(*) as count,
|
||
AVG(amount) as avg_amount,
|
||
STDDEV(amount) as stddev_amount
|
||
FROM transactions
|
||
WHERE created_at >= NOW() - INTERVAL '30 days'
|
||
GROUP BY date_trunc('day', created_at)
|
||
ORDER BY date
|
||
`
|
||
});
|
||
|
||
console.log(`✓ Retrieved ${result.rows.length} rows directly from database`);
|
||
|
||
// Convert to pandas DataFrame (if using Python analysis)
|
||
// Or work with result.rows directly in JavaScript/TypeScript
|
||
|
||
// Benefits:
|
||
// - No CSV export/import cycle
|
||
// - Always fresh data (no stale exports)
|
||
// - Can use database aggregations (faster)
|
||
// - Query only needed columns
|
||
// - Leverage database indexes
|
||
|
||
} else {
|
||
console.log("ℹ️ PostgreSQL MCP not available");
|
||
console.log(" Install for 10x faster data access:");
|
||
console.log(" npm install -g @modelcontextprotocol/server-postgres");
|
||
console.log(" Falling back to manual export/import workflow");
|
||
}
|
||
```
|
||
|
||
### Benefits Comparison
|
||
|
||
| Aspect | With PostgreSQL MCP | Without MCP (Export/Import) |
|
||
|--------|-------------------|---------------------------|
|
||
| **Speed** | 10x faster - direct queries | Multi-step: export → download → import |
|
||
| **Data Freshness** | Always current (live queries) | Stale (last export time) |
|
||
| **Memory Usage** | Stream large datasets | Load entire CSV into memory |
|
||
| **Aggregations** | Database-side (fast) | Python-side (slower) |
|
||
| **Iteration** | Instant query refinement | Re-export for each change |
|
||
| **Data Selection** | Query only needed columns | Export everything, filter later |
|
||
| **Collaboration** | Shared database access | Email CSV files |
|
||
|
||
**When to use PostgreSQL MCP:**
|
||
- Exploratory data analysis requiring multiple query iterations
|
||
- Large datasets (>100MB) that strain memory
|
||
- Need for fresh data (real-time dashboards)
|
||
- Complex aggregations better done in SQL
|
||
- Multiple analysts working on same dataset
|
||
- Production database analysis
|
||
|
||
**When manual export sufficient:**
|
||
- Small datasets (<10MB) analyzed once
|
||
- Offline analysis required
|
||
- No database access permissions
|
||
- Static snapshot needed for compliance
|
||
|
||
### MongoDB MCP Integration
|
||
|
||
For NoSQL data sources:
|
||
|
||
```typescript
|
||
const hasMongo = typeof mcp__mongodb__query !== 'undefined';
|
||
|
||
if (hasMongo) {
|
||
console.log("✓ Using MongoDB MCP for document database access");
|
||
|
||
// Direct MongoDB queries
|
||
const result = await mcp__mongodb__query({
|
||
database: "analytics",
|
||
collection: "events",
|
||
query: {
|
||
timestamp: { $gte: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000) }
|
||
},
|
||
aggregation: [
|
||
{ $group: {
|
||
_id: { $dateToString: { format: "%Y-%m-%d", date: "$timestamp" } },
|
||
count: { $sum: 1 },
|
||
avgValue: { $avg: "$value" }
|
||
}},
|
||
{ $sort: { _id: 1 } }
|
||
]
|
||
});
|
||
|
||
console.log(`✓ Aggregated ${result.length} daily summaries`);
|
||
|
||
} else {
|
||
console.log("ℹ️ MongoDB MCP not available");
|
||
console.log(" Install: npm install -g @modelcontextprotocol/server-mongodb");
|
||
}
|
||
```
|
||
|
||
### Real-World Example: Customer Cohort Analysis
|
||
|
||
**With PostgreSQL MCP (5 minutes):**
|
||
|
||
```sql
|
||
-- Direct cohort analysis query
|
||
SELECT
|
||
DATE_TRUNC('month', first_purchase) as cohort_month,
|
||
DATE_TRUNC('month', purchase_date) as activity_month,
|
||
COUNT(DISTINCT customer_id) as customers,
|
||
SUM(amount) as revenue
|
||
FROM (
|
||
SELECT
|
||
customer_id,
|
||
purchase_date,
|
||
amount,
|
||
FIRST_VALUE(purchase_date) OVER (
|
||
PARTITION BY customer_id
|
||
ORDER BY purchase_date
|
||
) as first_purchase
|
||
FROM purchases
|
||
WHERE purchase_date >= '2024-01-01'
|
||
) cohorts
|
||
GROUP BY cohort_month, activity_month
|
||
ORDER BY cohort_month, activity_month
|
||
```
|
||
|
||
**Without MCP (45 minutes):**
|
||
|
||
1. Request database export from DBA (10 min wait)
|
||
2. Download CSV file (2 min)
|
||
3. Load into pandas (3 min)
|
||
4. Write Python cohort analysis code (20 min)
|
||
5. Debug memory issues with large dataset (10 min)
|
||
|
||
### PostgreSQL MCP Installation
|
||
|
||
```bash
|
||
# Install PostgreSQL MCP for direct database access
|
||
npm install -g @modelcontextprotocol/server-postgres
|
||
|
||
# Configure in MCP settings
|
||
# Add to claude_desktop_config.json:
|
||
{
|
||
"mcpServers": {
|
||
"postgres": {
|
||
"command": "npx",
|
||
"args": ["-y", "@modelcontextprotocol/server-postgres"],
|
||
"env": {
|
||
"POSTGRES_CONNECTION_STRING": "postgresql://user:pass@localhost:5432/dbname"
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
# Or configure per-project connection
|
||
# Add to .env file:
|
||
POSTGRES_CONNECTION_STRING=postgresql://user:pass@localhost:5432/dbname
|
||
```
|
||
|
||
### MongoDB MCP Installation
|
||
|
||
```bash
|
||
# Install MongoDB MCP
|
||
npm install -g @modelcontextprotocol/server-mongodb
|
||
|
||
# Configure in MCP settings
|
||
{
|
||
"mcpServers": {
|
||
"mongodb": {
|
||
"command": "npx",
|
||
"args": ["-y", "@modelcontextprotocol/server-mongodb"],
|
||
"env": {
|
||
"MONGODB_URI": "mongodb://localhost:27017"
|
||
}
|
||
}
|
||
}
|
||
}
|
||
```
|
||
|
||
Once installed, all agents reading this skill automatically detect and use database MCPs for direct data access.
|
||
|
||
### Security Best Practices
|
||
|
||
When using database MCPs:
|
||
|
||
1. **Use Read-Only Accounts**: Create dedicated read-only database users
|
||
```sql
|
||
CREATE USER analyst_readonly WITH PASSWORD 'secure_password';
|
||
GRANT SELECT ON ALL TABLES IN SCHEMA public TO analyst_readonly;
|
||
```
|
||
|
||
2. **Connection Strings**: Store in environment variables, never in code
|
||
```bash
|
||
export POSTGRES_CONNECTION_STRING="postgresql://readonly:pass@host/db"
|
||
```
|
||
|
||
3. **Query Limits**: Always use LIMIT clauses for exploratory queries
|
||
```sql
|
||
SELECT * FROM large_table LIMIT 1000; -- Safe exploration
|
||
```
|
||
|
||
4. **Avoid SELECT ***: Query only needed columns to reduce bandwidth
|
||
```sql
|
||
SELECT id, date, amount FROM transactions; -- Better
|
||
```
|
||
|
||
---
|
||
|
||
**Version**: 1.0
|
||
**Last Updated**: January 2025
|
||
**MCP Enhancement**: PostgreSQL/MongoDB for 10x faster data access
|
||
**Success Rate**: 95% analysis accuracy following these patterns
|