# Data Analyst

**Description**: Perform systematic data analysis using Python, pandas, and visualization libraries.

## Core Workflow

1. **Load & Inspect Data**
2. **Clean & Transform Data**
3. **Explore & Visualize**
4. **Analyze & Model**
5. **Report Findings**

## Data Loading & Inspection

```python
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_sql(query, connection)
# df = pd.read_json('data.json')

# Initial inspection
print(df.shape)             # (rows, columns)
print(df.info())            # Data types, null counts
print(df.describe())        # Statistical summary
print(df.head())            # First 5 rows
print(df.columns.tolist())  # Column names
print(df.dtypes)            # Data types

# Check for missing data
print(df.isnull().sum())
print(df.duplicated().sum())
```

## Data Cleaning

```python
# Handle missing values
df = df.dropna()                                # Drop rows with any null
df = df.dropna(subset=['important_col'])        # Drop if specific column null
df['col'] = df['col'].fillna(0)                 # Fill with value
df['col'] = df['col'].fillna(df['col'].mean())  # Fill with mean

# Remove duplicates
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['id'], keep='first')

# Fix data types
df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['category'] = df['category'].astype('category')

# Handle outliers
from scipy import stats
df = df[np.abs(stats.zscore(df['value'])) < 3]  # Remove outliers
# Rename columns
df = df.rename(columns={'old_name': 'new_name'})
df.columns = df.columns.str.lower().str.replace(' ', '_')
```
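The z-score filter above assumes a roughly symmetric distribution; for skewed data, an interquartile-range (IQR) filter is a common alternative. A minimal sketch, reusing the same `value` column:

```python
# IQR-based outlier filter (keeps rows within 1.5 * IQR of the quartiles)
q1, q3 = df['value'].quantile(0.25), df['value'].quantile(0.75)
iqr = q3 - q1
df = df[df['value'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
```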
## Data Transformation

```python
# Create new columns
df['total'] = df['quantity'] * df['price']
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Apply functions
df['normalized'] = df['value'] / df['value'].max()
df['category_encoded'] = df['category'].map({'A': 1, 'B': 2, 'C': 3})

# Groupby aggregations
summary = df.groupby('category').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
}).round(2)

# Pivot tables
pivot = df.pivot_table(
    values='sales',
    index='category',
    columns='region',
    aggfunc='sum',
    fill_value=0
)

# Merge datasets
merged = pd.merge(df1, df2, on='id', how='left')
```
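Another frequent transformation is binning a continuous column into labeled ranges with `pd.cut`; a short sketch, where the `price` bin edges and labels are purely illustrative:

```python
# Bin a numeric column into labeled ranges (edges and labels are illustrative)
df['price_band'] = pd.cut(
    df['price'],
    bins=[0, 10, 50, 100, float('inf')],
    labels=['budget', 'mid', 'premium', 'luxury']
)
```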
## Exploratory Data Analysis

```python
# Distribution analysis
print(df['value'].value_counts())
print(df['value'].value_counts(normalize=True))  # Proportions (sum to 1)

# Correlation
correlation = df[['col1', 'col2', 'col3']].corr()

# Cross-tabulation
pd.crosstab(df['category'], df['region'], normalize='index')
```
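A grouped `describe()` extends the checks above to a per-category profile, and skewness hints at whether mean-based summaries are appropriate. A minimal sketch, assuming the same `category` and `value` columns:

```python
# Per-group summary statistics
print(df.groupby('category')['value'].describe())

# Distribution shape: large skew suggests reporting medians rather than means
print(f"Skewness: {df['value'].skew():.2f}, Kurtosis: {df['value'].kurt():.2f}")
```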
## Visualization

```python
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Histogram
plt.hist(df['value'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of Values')
plt.show()

# Boxplot
sns.boxplot(data=df, x='category', y='value')
plt.title('Value Distribution by Category')
plt.show()

# Time series
df.set_index('date')['sales'].plot()
plt.title('Sales Over Time')
plt.show()

# Heatmap
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Scatter plot
sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
plt.title('X vs Y by Category')
plt.show()
```
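For written reports it usually helps to save figures to files rather than only calling `plt.show()`; a small sketch, where the filename and dpi are arbitrary choices:

```python
# Save a figure for a report instead of only displaying it
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='value', bins=30, ax=ax)
ax.set_title('Distribution of Values')
fig.savefig('value_distribution.png', dpi=150, bbox_inches='tight')
plt.close(fig)
```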
## Statistical Analysis

```python
from scipy import stats

# T-test
group1 = df[df['category'] == 'A']['value']
group2 = df[df['category'] == 'B']['value']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

# Chi-square test
contingency_table = pd.crosstab(df['category'], df['outcome'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.4f}, P-value: {p_value:.4f}")

# Linear regression
from sklearn.linear_model import LinearRegression
X = df[['feature1', 'feature2']]
y = df['target']
model = LinearRegression()
model.fit(X, y)
print(f"R² Score: {model.score(X, y):.4f}")
print(f"Coefficients: {model.coef_}")
```
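Note that the R² above is computed on the same rows the model was fitted on, so it can overstate how well the model generalizes. A quick hold-out check is one way to hedge against that; a sketch, assuming scikit-learn is installed and continuing from the `X` and `y` defined above:

```python
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows to estimate out-of-sample fit
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression().fit(X_train, y_train)
print(f"Train R²: {model.score(X_train, y_train):.4f}")
print(f"Test R²: {model.score(X_test, y_test):.4f}")
```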
## Best Practices

### 1. **Reproducibility**
```python
# Set random seed
np.random.seed(42)

# Save processed data
df.to_csv('cleaned_data.csv', index=False)

# Document transformations
# Use Jupyter notebooks with markdown cells
```

### 2. **Data Quality Checks**
```python
def validate_data(df):
    """Validate data quality"""
    checks = {
        'null_values': df.isnull().sum().sum() == 0,
        'duplicates': df.duplicated().sum() == 0,
        'date_range_valid': df['date'].min() > pd.Timestamp('2000-01-01'),
        'no_negative_prices': (df['price'] >= 0).all()
    }
    return all(checks.values()), checks

is_valid, results = validate_data(df)
print(f"Data valid: {is_valid}")
print(results)
```

### 3. **Performance**
```python
# Use vectorized operations
df['result'] = df['a'] * df['b']  # ✅ Fast

# Avoid loops
for i in range(len(df)):  # ❌ Slow
    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']

# Use appropriate data types
df['category'] = df['category'].astype('category')  # Saves memory

# Chunk large files
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk)
```
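The `process(chunk)` call above is a placeholder; one concrete pattern is to aggregate each chunk and combine the partial results at the end. A sketch, assuming `large_file.csv` has `category` and `sales` columns:

```python
# Aggregate per chunk, then combine the partial results
partials = []
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    partials.append(chunk.groupby('category')['sales'].sum())
total_sales = pd.concat(partials).groupby(level=0).sum()
print(total_sales)
```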
## Analysis Workflow Template

```python
# 1. LOAD DATA
df = pd.read_csv('data.csv')

# 2. INITIAL INSPECTION
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Nulls:\n{df.isnull().sum()}")

# 3. CLEAN DATA
df = df.drop_duplicates()
df = df.dropna(subset=['critical_column'])
df['date'] = pd.to_datetime(df['date'])

# 4. TRANSFORM
df['month'] = df['date'].dt.month
df['total'] = df['quantity'] * df['price']

# 5. EXPLORE
print(df['category'].value_counts())
print(df[['sales', 'profit']].describe())

# 6. VISUALIZE
df.groupby('month')['sales'].sum().plot(kind='bar')
plt.title('Monthly Sales')
plt.show()

# 7. ANALYZE
monthly_avg = df.groupby('month')['sales'].mean()
print(f"Average monthly sales: ${monthly_avg.mean():.2f}")

# 8. EXPORT RESULTS
results = df.groupby('category').agg({
    'sales': 'sum',
    'profit': 'sum'
})
results.to_csv('results.csv')
```

## When to Use This Skill

- Analyzing CSV/Excel datasets
- Creating data visualizations
- Performing statistical analysis
- Cleaning and transforming data
- Generating data reports
- Exploratory data analysis

---

**Remember**: Good data analysis is methodical: inspect, clean, explore, analyze, visualize, report!