# Data Analyst
**Description**: Perform systematic data analysis using Python, pandas, and visualization libraries
## Core Workflow
1. **Load & Inspect Data**
2. **Clean & Transform Data**
3. **Explore & Visualize**
4. **Analyze & Model**
5. **Report Findings**
## Data Loading & Inspection
```python
import pandas as pd
import numpy as np
# Load data
df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_sql(query, connection)
# df = pd.read_json('data.json')
# Initial inspection
print(df.shape) # (rows, columns)
print(df.info()) # Data types, null counts
print(df.describe()) # Statistical summary
print(df.head()) # First 5 rows
print(df.columns.tolist()) # Column names
print(df.dtypes) # Data types
# Check for missing data
print(df.isnull().sum())
print(df.duplicated().sum())
```
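If the file is large or the column types are known up front, `read_csv` can do part of this inspection work at load time. A small sketch with illustrative column names (`id`, `date`, `price`):
```python
# Hedged example: selective, typed loading (column names are placeholders)
df = pd.read_csv(
    'data.csv',
    usecols=['id', 'date', 'price'],  # read only the columns you need
    parse_dates=['date'],             # parse dates while reading
    dtype={'id': 'int64'},            # declare known dtypes up front
    nrows=1000                        # preview a sample before loading everything
)
```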
## Data Cleaning
```python
# Handle missing values
df = df.dropna() # Drop rows with any null
df = df.dropna(subset=['important_col']) # Drop if specific column null
df['col'] = df['col'].fillna(0) # Fill with value
df['col'] = df['col'].fillna(df['col'].mean()) # Fill with mean
# Remove duplicates
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['id'], keep='first')
# Fix data types
df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['category'] = df['category'].astype('category')
# Handle outliers
from scipy import stats
df = df[(np.abs(stats.zscore(df['value'])) < 3)] # Remove outliers
# Rename columns
df = df.rename(columns={'old_name': 'new_name'})
df.columns = df.columns.str.lower().str.replace(' ', '_')
```
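The z-score filter above works best for roughly normal data; for skewed distributions an IQR rule is a common alternative. A sketch, assuming the same `value` column:
```python
# IQR-based outlier filter: keep rows within 1.5*IQR of the quartiles
q1, q3 = df['value'].quantile([0.25, 0.75])
iqr = q3 - q1
df = df[df['value'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
```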
## Data Transformation
```python
# Create new columns
df['total'] = df['quantity'] * df['price']
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
# Apply functions
df['normalized'] = df['value'] / df['value'].max()
df['category_encoded'] = df['category'].map({'A': 1, 'B': 2, 'C': 3})
# Groupby aggregations
summary = df.groupby('category').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
}).round(2)
# Pivot tables
pivot = df.pivot_table(
    values='sales',
    index='category',
    columns='region',
    aggfunc='sum',
    fill_value=0
)
# Merge datasets
merged = pd.merge(df1, df2, on='id', how='left')
```
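Aggregating with several functions per column, as in `summary` above, leaves a MultiIndex on the columns, which is awkward to filter or export. One way to flatten it:
```python
# Flatten MultiIndex columns from the multi-function agg above,
# e.g. ('sales', 'sum') -> 'sales_sum'
summary.columns = ['_'.join(col) for col in summary.columns]
summary = summary.reset_index()
```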
## Exploratory Data Analysis
```python
# Distribution analysis
print(df['value'].value_counts())
print(df['value'].value_counts(normalize=True)) # Percentages
# Correlation
correlation = df[['col1', 'col2', 'col3']].corr()
# Cross-tabulation
pd.crosstab(df['category'], df['region'], normalize='index')
```
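Beyond overall counts and correlations, it often helps to compare distributions within groups. A short sketch, reusing the `category` and `value` columns:
```python
# Per-group summary statistics
print(df.groupby('category')['value'].describe())
# Distribution shape
print(f"Skewness: {df['value'].skew():.2f}")
print(df['value'].quantile([0.05, 0.25, 0.5, 0.75, 0.95]))
```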
## Visualization
```python
import matplotlib.pyplot as plt
import seaborn as sns
# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Histogram
plt.hist(df['value'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of Values')
plt.show()
# Boxplot
sns.boxplot(data=df, x='category', y='value')
plt.title('Value Distribution by Category')
plt.show()
# Time series
df.set_index('date')['sales'].plot()
plt.title('Sales Over Time')
plt.show()
# Heatmap
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()
# Scatter plot
sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
plt.title('X vs Y by Category')
plt.show()
```
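When figures feed into a report rather than a live notebook, save them to disk instead of (or in addition to) calling `plt.show()`. A sketch with an illustrative filename (`histplot` assumes a recent seaborn):
```python
# Save a figure for a report
fig, ax = plt.subplots()
sns.histplot(data=df, x='value', bins=30, ax=ax)
ax.set_title('Distribution of Values')
fig.savefig('value_distribution.png', dpi=150, bbox_inches='tight')
plt.close(fig)
```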
## Statistical Analysis
```python
from scipy import stats
# T-test
group1 = df[df['category'] == 'A']['value']
group2 = df[df['category'] == 'B']['value']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
# Chi-square test
contingency_table = pd.crosstab(df['category'], df['outcome'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.4f}, P-value: {p_value:.4f}")
# Linear regression
from sklearn.linear_model import LinearRegression
X = df[['feature1', 'feature2']]
y = df['target']
model = LinearRegression()
model.fit(X, y)
print(f"R² Score: {model.score(X, y):.4f}")
print(f"Coefficients: {model.coef_}")
```
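scikit-learn's `LinearRegression` reports fit quality but not coefficient p-values or confidence intervals. If those matter for the analysis, statsmodels is one option; a sketch assuming the same `X` and `y`:
```python
import statsmodels.api as sm
# OLS with an explicit intercept; summary() reports coefficients,
# standard errors, p-values, and confidence intervals
X_const = sm.add_constant(X)
ols_model = sm.OLS(y, X_const).fit()
print(ols_model.summary())
```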
## Best Practices
### 1. **Reproducibility**
```python
# Set random seed
np.random.seed(42)
# Save processed data
df.to_csv('cleaned_data.csv', index=False)
# Document transformations
# Use Jupyter notebooks with markdown cells
```
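One lightweight way to document transformations is to wrap them in a single function so the cleaned dataset can be regenerated from the raw file at any time. A sketch using steps from the sections above:
```python
def build_dataset(path: str) -> pd.DataFrame:
    """Reproducible pipeline: raw CSV in, cleaned frame out."""
    df = pd.read_csv(path)
    df = df.drop_duplicates()
    df['date'] = pd.to_datetime(df['date'])
    return df
df = build_dataset('data.csv')
df.to_csv('cleaned_data.csv', index=False)
```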
### 2. **Data Quality Checks**
```python
def validate_data(df):
    """Validate data quality"""
    checks = {
        'null_values': df.isnull().sum().sum() == 0,
        'duplicates': df.duplicated().sum() == 0,
        'date_range_valid': df['date'].min() > pd.Timestamp('2000-01-01'),
        'no_negative_prices': (df['price'] >= 0).all()
    }
    return all(checks.values()), checks
is_valid, results = validate_data(df)
print(f"Data valid: {is_valid}")
print(results)
```
### 3. **Performance**
```python
# Use vectorized operations
df['result'] = df['a'] * df['b'] # ✅ Fast
# Avoid loops
for i in range(len(df)):  # ❌ Slow
    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']
# Use appropriate data types
df['category'] = df['category'].astype('category') # Saves memory
# Chunk large files
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk)  # process() stands in for your own per-chunk logic
```
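Conditional logic is also worth vectorizing: `np.where` and `np.select` replace row-by-row if/else loops. A sketch with illustrative column names and thresholds:
```python
# Vectorized conditionals instead of looping with if/else
df['flag'] = np.where(df['value'] > 100, 'high', 'normal')
conditions = [df['value'] < 10, df['value'] < 100]
choices = ['low', 'medium']
df['band'] = np.select(conditions, choices, default='high')
```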
## Analysis Workflow Template
```python
# 1. LOAD DATA
df = pd.read_csv('data.csv')
# 2. INITIAL INSPECTION
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Nulls: \n{df.isnull().sum()}")
# 3. CLEAN DATA
df = df.drop_duplicates()
df = df.dropna(subset=['critical_column'])
df['date'] = pd.to_datetime(df['date'])
# 4. TRANSFORM
df['month'] = df['date'].dt.month
df['total'] = df['quantity'] * df['price']
# 5. EXPLORE
print(df['category'].value_counts())
print(df[['sales', 'profit']].describe())
# 6. VISUALIZE
df.groupby('month')['sales'].sum().plot(kind='bar')
plt.title('Monthly Sales')
plt.show()
# 7. ANALYZE
monthly_avg = df.groupby('month')['sales'].mean()
print(f"Average monthly sales: ${monthly_avg.mean():.2f}")
# 8. EXPORT RESULTS
results = df.groupby('category').agg({
    'sales': 'sum',
    'profit': 'sum'
})
results.to_csv('results.csv')
```
## When to Use This Skill
- Analyzing CSV/Excel datasets
- Creating data visualizations
- Performing statistical analysis
- Cleaning and transforming data
- Generating data reports
- Exploratory data analysis
---
**Remember**: Good data analysis is methodical. Inspect, clean, explore, analyze, visualize, report!