Data Analyst

Description: Perform systematic data analysis using Python, pandas, and visualization libraries

Core Workflow

  1. Load & Inspect Data
  2. Clean & Transform Data
  3. Explore & Visualize
  4. Analyze & Model
  5. Report Findings
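
The sections below expand each of these steps. As a rough orientation, they chain together like this minimal sketch (the file name data.csv, the output name findings.csv, and the bare-bones cleaning are illustrative placeholders, not a prescribed structure):

import pandas as pd

def load_data(path):
    # Step 1: load the raw file and print its shape as a quick sanity check
    df = pd.read_csv(path)
    print(df.shape)
    return df

def clean_data(df):
    # Step 2: minimal cleaning; real projects add type fixes and outlier handling
    return df.drop_duplicates().dropna()

def explore(df):
    # Steps 3-4: explore, visualize, and model (expanded in the sections below)
    return df.describe()

def report(results, out_path='findings.csv'):
    # Step 5: persist a summary table that supports the written findings
    results.to_csv(out_path)

df = clean_data(load_data('data.csv'))
report(explore(df))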

Data Loading & Inspection

import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_sql(query, connection)
# df = pd.read_json('data.json')

# Initial inspection
print(df.shape)              # (rows, columns)
print(df.info())             # Data types, null counts
print(df.describe())         # Statistical summary
print(df.head())             # First 5 rows
print(df.columns.tolist())   # Column names
print(df.dtypes)             # Data types

# Check for missing data and duplicates
print(df.isnull().sum())
print(df.duplicated().sum())

Data Cleaning

# Handle missing values
df = df.dropna()                           # Drop rows with any null
df = df.dropna(subset=['important_col'])   # Drop if specific column null
df['col'] = df['col'].fillna(0)            # Fill with value
df['col'] = df['col'].fillna(df['col'].mean())  # Fill with mean

# Remove duplicates
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['id'], keep='first')

# Fix data types
df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['category'] = df['category'].astype('category')

# Handle outliers
from scipy import stats
df = df[(np.abs(stats.zscore(df['value'])) < 3)]  # Remove outliers

# Rename columns
df = df.rename(columns={'old_name': 'new_name'})
df.columns = df.columns.str.lower().str.replace(' ', '_')

Data Transformation

# Create new columns
df['total'] = df['quantity'] * df['price']
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Apply functions
df['normalized'] = df['value'] / df['value'].max()
df['category_encoded'] = df['category'].map({'A': 1, 'B': 2, 'C': 3})

# Groupby aggregations
summary = df.groupby('category').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
}).round(2)

# Pivot tables
pivot = df.pivot_table(
    values='sales',
    index='category',
    columns='region',
    aggfunc='sum',
    fill_value=0
)

# Merge datasets
merged = pd.merge(df1, df2, on='id', how='left')

Exploratory Data Analysis

# Distribution analysis
print(df['value'].value_counts())
print(df['value'].value_counts(normalize=True))  # Proportions (0-1)

# Correlation
correlation = df[['col1', 'col2', 'col3']].corr()

# Cross-tabulation
print(pd.crosstab(df['category'], df['region'], normalize='index'))  # Row-wise proportions

Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Histogram
plt.hist(df['value'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of Values')
plt.show()

# Boxplot
sns.boxplot(data=df, x='category', y='value')
plt.title('Value Distribution by Category')
plt.show()

# Time series
df.set_index('date')['sales'].plot()
plt.title('Sales Over Time')
plt.show()

# Heatmap
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Scatter plot
sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
plt.title('X vs Y by Category')
plt.show()

Statistical Analysis

from scipy import stats

# T-test
group1 = df[df['category'] == 'A']['value']
group2 = df[df['category'] == 'B']['value']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

# Chi-square test
contingency_table = pd.crosstab(df['category'], df['outcome'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.4f}, P-value: {p_value:.4f}")

# Linear regression
from sklearn.linear_model import LinearRegression
X = df[['feature1', 'feature2']]
y = df['target']
model = LinearRegression()
model.fit(X, y)
print(f"R² Score: {model.score(X, y):.4f}")
print(f"Coefficients: {model.coef_}")

Best Practices

1. Reproducibility

# Set random seed
np.random.seed(42)

# Save processed data
df.to_csv('cleaned_data.csv', index=False)

# Document transformations
# Use Jupyter notebooks with markdown cells
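
When the work is not in a notebook, one lightweight alternative is to keep a running log of each transformation next to the cleaning code. This is a minimal sketch; the log format, file names, and the 'price' column are assumptions for illustration:

import json
import pandas as pd

transform_log = []  # human-readable record of each cleaning step

df = pd.read_csv('data.csv')

rows_before = len(df)
df = df.drop_duplicates()
transform_log.append(f"drop_duplicates: removed {rows_before - len(df)} rows")

rows_before = len(df)
df = df.dropna(subset=['price'])  # 'price' is an assumed column for illustration
transform_log.append(f"dropna(subset=['price']): removed {rows_before - len(df)} rows")

# Persist the log next to the cleaned data so the run can be reconstructed later
with open('transform_log.json', 'w') as f:
    json.dump(transform_log, f, indent=2)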

2. Data Quality Checks

def validate_data(df):
    """Validate data quality"""
    checks = {
        'null_values': df.isnull().sum().sum() == 0,
        'duplicates': df.duplicated().sum() == 0,
        'date_range_valid': df['date'].min() > pd.Timestamp('2000-01-01'),
        'no_negative_prices': (df['price'] >= 0).all()
    }
    return all(checks.values()), checks

is_valid, results = validate_data(df)
print(f"Data valid: {is_valid}")
print(results)

3. Performance

# Use vectorized operations
df['result'] = df['a'] * df['b']  # ✅ Fast

# Avoid loops
for i in range(len(df)):  # ❌ Slow
    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']

# Use appropriate data types
df['category'] = df['category'].astype('category')  # Saves memory

# Chunk large files
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk)

Analysis Workflow Template

# 1. LOAD DATA
df = pd.read_csv('data.csv')

# 2. INITIAL INSPECTION
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Nulls: \n{df.isnull().sum()}")

# 3. CLEAN DATA
df = df.drop_duplicates()
df = df.dropna(subset=['critical_column'])
df['date'] = pd.to_datetime(df['date'])

# 4. TRANSFORM
df['month'] = df['date'].dt.month
df['total'] = df['quantity'] * df['price']

# 5. EXPLORE
print(df['category'].value_counts())
print(df[['sales', 'profit']].describe())

# 6. VISUALIZE
df.groupby('month')['sales'].sum().plot(kind='bar')
plt.title('Monthly Sales')
plt.show()

# 7. ANALYZE
monthly_avg = df.groupby('month')['sales'].mean()
print(f"Average monthly sales: ${monthly_avg.mean():.2f}")

# 8. EXPORT RESULTS
results = df.groupby('category').agg({
    'sales': 'sum',
    'profit': 'sum'
})
results.to_csv('results.csv')

When to Use This Skill

  • Analyzing CSV/Excel datasets
  • Creating data visualizations
  • Performing statistical analysis
  • Cleaning and transforming data
  • Generating data reports
  • Exploratory data analysis

Remember: Good data analysis is methodical. Inspect, clean, explore, analyze, visualize, and report.