Files
gh-k-dense-ai-claude-scient…/skills/exploratory-data-analysis/scripts/eda_analyzer.py
2025-11-30 08:30:10 +08:00

548 lines
21 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Exploratory Data Analysis Analyzer
Analyzes scientific data files and generates comprehensive markdown reports
"""
import os
import sys
from pathlib import Path
from datetime import datetime
import json
def detect_file_type(filepath):
    """
    Detect the file type based on its extension.

    Args:
        filepath: Path to the file (string or Path).

    Returns:
        tuple: (extension_without_dot, file_category, description).
               ``file_category`` is 'unknown' and ``description`` is
               'Unknown Format' for unrecognized extensions.
    """
    extension = Path(filepath).suffix.lower()
    # Map extensions to (category, human-readable description)
    extension_map = {
        # Chemistry/Molecular
        'pdb': ('chemistry_molecular', 'Protein Data Bank'),
        'cif': ('chemistry_molecular', 'Crystallographic Information File'),
        'mol': ('chemistry_molecular', 'MDL Molfile'),
        'mol2': ('chemistry_molecular', 'Tripos Mol2'),
        'sdf': ('chemistry_molecular', 'Structure Data File'),
        'xyz': ('chemistry_molecular', 'XYZ Coordinates'),
        'smi': ('chemistry_molecular', 'SMILES String'),
        'smiles': ('chemistry_molecular', 'SMILES String'),
        'pdbqt': ('chemistry_molecular', 'AutoDock PDBQT'),
        'mae': ('chemistry_molecular', 'Maestro Format'),
        'gro': ('chemistry_molecular', 'GROMACS Coordinate File'),
        'log': ('chemistry_molecular', 'Gaussian Log File'),
        'out': ('chemistry_molecular', 'Quantum Chemistry Output'),
        'wfn': ('chemistry_molecular', 'Wavefunction Files'),
        'wfx': ('chemistry_molecular', 'Wavefunction Files'),
        'fchk': ('chemistry_molecular', 'Gaussian Formatted Checkpoint'),
        'cube': ('chemistry_molecular', 'Gaussian Cube File'),
        'dcd': ('chemistry_molecular', 'Binary Trajectory'),
        'xtc': ('chemistry_molecular', 'Compressed Trajectory'),
        'trr': ('chemistry_molecular', 'GROMACS Trajectory'),
        'nc': ('chemistry_molecular', 'Amber NetCDF Trajectory'),
        'netcdf': ('chemistry_molecular', 'Amber NetCDF Trajectory'),
        # Bioinformatics/Genomics
        'fasta': ('bioinformatics_genomics', 'FASTA Format'),
        'fa': ('bioinformatics_genomics', 'FASTA Format'),
        'fna': ('bioinformatics_genomics', 'FASTA Format'),
        'fastq': ('bioinformatics_genomics', 'FASTQ Format'),
        'fq': ('bioinformatics_genomics', 'FASTQ Format'),
        'sam': ('bioinformatics_genomics', 'Sequence Alignment/Map'),
        'bam': ('bioinformatics_genomics', 'Binary Alignment/Map'),
        'cram': ('bioinformatics_genomics', 'CRAM Format'),
        'bed': ('bioinformatics_genomics', 'Browser Extensible Data'),
        'bedgraph': ('bioinformatics_genomics', 'BED with Graph Data'),
        'bigwig': ('bioinformatics_genomics', 'Binary BigWig'),
        'bw': ('bioinformatics_genomics', 'Binary BigWig'),
        'bigbed': ('bioinformatics_genomics', 'Binary BigBed'),
        'bb': ('bioinformatics_genomics', 'Binary BigBed'),
        'gff': ('bioinformatics_genomics', 'General Feature Format'),
        'gff3': ('bioinformatics_genomics', 'General Feature Format'),
        'gtf': ('bioinformatics_genomics', 'Gene Transfer Format'),
        'vcf': ('bioinformatics_genomics', 'Variant Call Format'),
        'bcf': ('bioinformatics_genomics', 'Binary VCF'),
        'gvcf': ('bioinformatics_genomics', 'Genomic VCF'),
        # Microscopy/Imaging
        'tif': ('microscopy_imaging', 'Tagged Image File Format'),
        'tiff': ('microscopy_imaging', 'Tagged Image File Format'),
        'nd2': ('microscopy_imaging', 'Nikon NIS-Elements'),
        'lif': ('microscopy_imaging', 'Leica Image Format'),
        'czi': ('microscopy_imaging', 'Carl Zeiss Image'),
        'oib': ('microscopy_imaging', 'Olympus Image Format'),
        'oif': ('microscopy_imaging', 'Olympus Image Format'),
        'vsi': ('microscopy_imaging', 'Olympus VSI'),
        'ims': ('microscopy_imaging', 'Imaris Format'),
        'lsm': ('microscopy_imaging', 'Zeiss LSM'),
        'stk': ('microscopy_imaging', 'MetaMorph Stack'),
        'dv': ('microscopy_imaging', 'DeltaVision'),
        'mrc': ('microscopy_imaging', 'Medical Research Council'),
        'dm3': ('microscopy_imaging', 'Gatan Digital Micrograph'),
        'dm4': ('microscopy_imaging', 'Gatan Digital Micrograph'),
        'dcm': ('microscopy_imaging', 'DICOM'),
        'nii': ('microscopy_imaging', 'NIfTI'),
        'nrrd': ('microscopy_imaging', 'Nearly Raw Raster Data'),
        # Spectroscopy/Analytical
        'fid': ('spectroscopy_analytical', 'NMR Free Induction Decay'),
        'mzml': ('spectroscopy_analytical', 'Mass Spectrometry Markup Language'),
        'mzxml': ('spectroscopy_analytical', 'Mass Spectrometry XML'),
        'raw': ('spectroscopy_analytical', 'Vendor Raw Files'),
        'd': ('spectroscopy_analytical', 'Agilent Data Directory'),
        'mgf': ('spectroscopy_analytical', 'Mascot Generic Format'),
        'spc': ('spectroscopy_analytical', 'Galactic SPC'),
        'jdx': ('spectroscopy_analytical', 'JCAMP-DX'),
        'jcamp': ('spectroscopy_analytical', 'JCAMP-DX'),
        # Proteomics/Metabolomics
        'pepxml': ('proteomics_metabolomics', 'Trans-Proteomic Pipeline Peptide XML'),
        'protxml': ('proteomics_metabolomics', 'Protein Inference Results'),
        'mzid': ('proteomics_metabolomics', 'Peptide Identification Format'),
        'mztab': ('proteomics_metabolomics', 'Proteomics/Metabolomics Tabular Format'),
        # General Scientific
        'npy': ('general_scientific', 'NumPy Array'),
        'npz': ('general_scientific', 'Compressed NumPy Archive'),
        'csv': ('general_scientific', 'Comma-Separated Values'),
        'tsv': ('general_scientific', 'Tab-Separated Values'),
        'xlsx': ('general_scientific', 'Excel Spreadsheets'),
        'xls': ('general_scientific', 'Excel Spreadsheets'),
        'json': ('general_scientific', 'JavaScript Object Notation'),
        'xml': ('general_scientific', 'Extensible Markup Language'),
        'hdf5': ('general_scientific', 'Hierarchical Data Format 5'),
        'h5': ('general_scientific', 'Hierarchical Data Format 5'),
        'h5ad': ('bioinformatics_genomics', 'Anndata Format'),
        'zarr': ('general_scientific', 'Chunked Array Storage'),
        'parquet': ('general_scientific', 'Apache Parquet'),
        'mat': ('general_scientific', 'MATLAB Data'),
        'fits': ('general_scientific', 'Flexible Image Transport System'),
    }
    ext_clean = extension.lstrip('.')
    # Single lookup with a default instead of a membership test + lookup.
    category, description = extension_map.get(ext_clean, ('unknown', 'Unknown Format'))
    return ext_clean, category, description
def get_file_basic_info(filepath):
    """Return a dict of filesystem-level metadata for *filepath*."""
    path = Path(filepath)
    st = path.stat()
    size = st.st_size
    return {
        'filename': path.name,
        'path': str(path.absolute()),
        'size_bytes': size,
        'size_human': format_bytes(size),
        'modified': datetime.fromtimestamp(st.st_mtime).isoformat(),
        'extension': path.suffix.lower(),
    }
def format_bytes(size):
    """Render a byte count as a human-readable string (B through PB)."""
    value = float(size)
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if value < 1024.0:
            return f"{value:.2f} {unit}"
        value /= 1024.0
    # Anything >= 1024 TB is reported in petabytes.
    return f"{value:.2f} PB"
def load_reference_info(category, extension):
    """
    Load reference documentation for a file type from the skill's bundled
    markdown reference files.

    Args:
        category: File category (e.g., 'chemistry_molecular')
        extension: File extension without the leading dot

    Returns:
        dict with 'raw_section' and 'reference_file' keys, or None when the
        category is unknown, the reference file is missing, or no section
        for this extension exists.
    """
    import re

    # Map categories to reference files
    category_files = {
        'chemistry_molecular': 'chemistry_molecular_formats.md',
        'bioinformatics_genomics': 'bioinformatics_genomics_formats.md',
        'microscopy_imaging': 'microscopy_imaging_formats.md',
        'spectroscopy_analytical': 'spectroscopy_analytical_formats.md',
        'proteomics_metabolomics': 'proteomics_metabolomics_formats.md',
        'general_scientific': 'general_scientific_formats.md',
    }
    ref_name = category_files.get(category)
    if ref_name is None:
        return None
    # Reference files live in <skill root>/references/, next to scripts/.
    ref_file = Path(__file__).parent.parent / 'references' / ref_name
    if not ref_file.exists():
        return None
    try:
        content = ref_file.read_text(encoding='utf-8')
        # Grab the "### .<ext>" section up to the next heading (or EOF).
        # re.escape guards against extensions containing regex metacharacters.
        pattern = rf'### \.{re.escape(extension)}[^#]*?(?=###|\Z)'
        match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
        if match:
            return {
                'raw_section': match.group(0),
                'reference_file': ref_name
            }
    except Exception as e:
        # Best-effort: report the problem and fall through to None.
        print(f"Error loading reference: {e}", file=sys.stderr)
    return None
def analyze_file(filepath):
    """
    Top-level analysis entry point.

    Gathers basic file metadata, detects the file type, loads any bundled
    reference documentation, and dispatches to a category-specific analyzer.

    Returns:
        dict: Combined analysis results.
    """
    extension, category, description = detect_file_type(filepath)
    analysis = {
        'basic_info': get_file_basic_info(filepath),
        'file_type': {
            'extension': extension,
            'category': category,
            'description': description
        },
        'reference_info': load_reference_info(category, extension),
        'data_analysis': {}
    }
    # Category -> analyzer dispatch table; extend here for new categories.
    analyzers = {
        'general_scientific': analyze_general_scientific,
        'bioinformatics_genomics': analyze_bioinformatics,
        'microscopy_imaging': analyze_imaging,
    }
    analyzer = analyzers.get(category)
    if analyzer is not None:
        try:
            analysis['data_analysis'] = analyzer(filepath, extension)
        except Exception as e:
            # Record the failure instead of aborting the whole report.
            analysis['data_analysis']['error'] = str(e)
    return analysis
def analyze_general_scientific(filepath, extension):
    """Analyze general scientific data formats (npy/npz/csv/tsv/json/hdf5).

    Args:
        filepath: Path to the data file.
        extension: Lowercase extension without the dot.

    Returns:
        dict of format-specific summary statistics; contains an 'error'
        key (instead of raising) when a library is missing or parsing fails.
    """
    results = {}
    try:
        if extension in ['npy']:
            import numpy as np
            data = np.load(filepath)
            # Hoisted: the numeric-dtype check was previously evaluated
            # once per statistic.
            is_numeric = np.issubdtype(data.dtype, np.number)
            results = {
                'shape': data.shape,
                'dtype': str(data.dtype),
                'size': data.size,
                'ndim': data.ndim,
                'statistics': {
                    'min': float(np.min(data)) if is_numeric else None,
                    'max': float(np.max(data)) if is_numeric else None,
                    'mean': float(np.mean(data)) if is_numeric else None,
                    'std': float(np.std(data)) if is_numeric else None,
                }
            }
        elif extension in ['npz']:
            import numpy as np
            # NpzFile keeps the archive open; the context manager closes it.
            with np.load(filepath) as data:
                results = {
                    'arrays': list(data.files),
                    'array_count': len(data.files),
                    'array_shapes': {name: data[name].shape for name in data.files}
                }
        elif extension in ['csv', 'tsv']:
            import pandas as pd
            sep = '\t' if extension == 'tsv' else ','
            df = pd.read_csv(filepath, sep=sep, nrows=10000)  # Sample first 10k rows
            has_numeric = len(df.select_dtypes(include='number').columns) > 0
            results = {
                'shape': df.shape,
                'columns': list(df.columns),
                'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()},
                'missing_values': df.isnull().sum().to_dict(),
                'summary_statistics': df.describe().to_dict() if has_numeric else {}
            }
        elif extension in ['json']:
            with open(filepath, 'r') as f:
                data = json.load(f)
            results = {
                'type': type(data).__name__,
                'keys': list(data.keys()) if isinstance(data, dict) else None,
                'length': len(data) if isinstance(data, (list, dict)) else None
            }
        elif extension in ['h5', 'hdf5']:
            import h5py
            with h5py.File(filepath, 'r') as f:
                def get_structure(group, prefix=''):
                    # Recursively map "/path" -> dataset/group descriptors.
                    items = {}
                    for key in group.keys():
                        path = f"{prefix}/{key}"
                        node = group[key]
                        if isinstance(node, h5py.Dataset):
                            items[path] = {
                                'type': 'dataset',
                                'shape': node.shape,
                                'dtype': str(node.dtype)
                            }
                        elif isinstance(node, h5py.Group):
                            items[path] = {'type': 'group'}
                            items.update(get_structure(node, path))
                    return items
                results = {
                    'structure': get_structure(f),
                    'attributes': dict(f.attrs)
                }
    except ImportError as e:
        results['error'] = f"Required library not installed: {e}"
    except Exception as e:
        results['error'] = f"Analysis error: {e}"
    return results
def analyze_bioinformatics(filepath, extension):
    """Analyze bioinformatics/genomics formats (FASTA and FASTQ).

    Requires biopython. Returns a dict of summary statistics; contains an
    'error' key (instead of raising) when the library is missing or
    parsing fails.
    """
    results = {}
    try:
        if extension in ['fasta', 'fa', 'fna']:
            from Bio import SeqIO
            sequences = list(SeqIO.parse(filepath, 'fasta'))
            lengths = [len(seq) for seq in sequences]
            results = {
                'sequence_count': len(sequences),
                'total_length': sum(lengths),
                'mean_length': sum(lengths) / len(lengths) if lengths else 0,
                'min_length': min(lengths) if lengths else 0,
                'max_length': max(lengths) if lengths else 0,
                'sequence_ids': [seq.id for seq in sequences[:10]]  # First 10
            }
        elif extension in ['fastq', 'fq']:
            from Bio import SeqIO
            sequences = []
            for i, seq in enumerate(SeqIO.parse(filepath, 'fastq')):
                sequences.append(seq)
                if i >= 9999:  # Sample first 10k
                    break
            lengths = [len(seq) for seq in sequences]
            # Skip zero-length reads: dividing by len(seq) would raise
            # ZeroDivisionError and abort the whole analysis.
            qualities = [
                sum(seq.letter_annotations['phred_quality']) / len(seq)
                for seq in sequences
                if len(seq) > 0
            ]
            results = {
                'read_count_sampled': len(sequences),
                'mean_length': sum(lengths) / len(lengths) if lengths else 0,
                'mean_quality': sum(qualities) / len(qualities) if qualities else 0,
                'min_length': min(lengths) if lengths else 0,
                'max_length': max(lengths) if lengths else 0,
            }
    except ImportError as e:
        results['error'] = f"Required library not installed (try: pip install biopython): {e}"
    except Exception as e:
        results['error'] = f"Analysis error: {e}"
    return results
def analyze_imaging(filepath, extension):
    """Analyze microscopy/imaging formats readable by Pillow.

    Requires pillow. Returns a dict of image statistics (plus 'page_count'
    for multi-page TIFFs); contains an 'error' key (instead of raising)
    when the library is missing or the file cannot be read.
    """
    results = {}
    try:
        if extension in ['tif', 'tiff', 'png', 'jpg', 'jpeg']:
            from PIL import Image
            import numpy as np
            # Context manager fixes a file-handle leak: the image was
            # previously opened but never closed.
            with Image.open(filepath) as img:
                img_array = np.array(img)
                results = {
                    'size': img.size,
                    'mode': img.mode,
                    'format': img.format,
                    'shape': img_array.shape,
                    'dtype': str(img_array.dtype),
                    'value_range': [int(img_array.min()), int(img_array.max())],
                    'mean_intensity': float(img_array.mean()),
                }
                # Check for multi-page TIFF by seeking until EOFError.
                if extension in ['tif', 'tiff']:
                    try:
                        frame_count = 0
                        while True:
                            img.seek(frame_count)
                            frame_count += 1
                    except EOFError:
                        results['page_count'] = frame_count
    except ImportError as e:
        results['error'] = f"Required library not installed (try: pip install pillow): {e}"
    except Exception as e:
        results['error'] = f"Analysis error: {e}"
    return results
def generate_markdown_report(analysis, output_path=None):
    """
    Generate a comprehensive markdown report from analysis results.

    Args:
        analysis: Analysis results dictionary (as produced by analyze_file)
        output_path: Path to save the report (if None, prints to stdout)

    Returns:
        str: The rendered markdown report.
    """
    lines = []
    # Title -- bug fix: the title previously hard-coded "(unknown)" instead
    # of interpolating the extracted filename.
    filename = analysis['basic_info']['filename']
    lines.append(f"# Exploratory Data Analysis Report: {filename}\n")
    lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    lines.append("---\n")
    # Basic Information
    lines.append("## Basic Information\n")
    basic = analysis['basic_info']
    lines.append(f"- **Filename:** `{basic['filename']}`")
    lines.append(f"- **Full Path:** `{basic['path']}`")
    lines.append(f"- **File Size:** {basic['size_human']} ({basic['size_bytes']:,} bytes)")
    lines.append(f"- **Last Modified:** {basic['modified']}")
    lines.append(f"- **Extension:** `.{analysis['file_type']['extension']}`\n")
    # File Type Information
    lines.append("## File Type\n")
    ft = analysis['file_type']
    lines.append(f"- **Category:** {ft['category'].replace('_', ' ').title()}")
    lines.append(f"- **Description:** {ft['description']}\n")
    # Reference Information (only when a reference section was found)
    if analysis.get('reference_info'):
        lines.append("## Format Reference\n")
        ref = analysis['reference_info']
        if 'raw_section' in ref:
            lines.append(ref['raw_section'])
        lines.append(f"\n*Reference: {ref['reference_file']}*\n")
    # Data Analysis
    if analysis.get('data_analysis'):
        lines.append("## Data Analysis\n")
        data = analysis['data_analysis']
        if 'error' in data:
            lines.append(f"⚠️ **Analysis Error:** {data['error']}\n")
        else:
            # Dump raw stats as JSON; default=str handles tuples/dtypes.
            lines.append("### Summary Statistics\n")
            lines.append("```json")
            lines.append(json.dumps(data, indent=2, default=str))
            lines.append("```\n")
    # Recommendations
    lines.append("## Recommendations for Further Analysis\n")
    lines.append(f"Based on the file type (`.{analysis['file_type']['extension']}`), consider the following analyses:\n")
    # Add specific recommendations based on category
    category = analysis['file_type']['category']
    if category == 'general_scientific':
        lines.append("- Statistical distribution analysis")
        lines.append("- Missing value imputation strategies")
        lines.append("- Correlation analysis between variables")
        lines.append("- Outlier detection and handling")
        lines.append("- Dimensionality reduction (PCA, t-SNE)")
    elif category == 'bioinformatics_genomics':
        lines.append("- Sequence quality control and filtering")
        lines.append("- GC content analysis")
        lines.append("- Read alignment and mapping statistics")
        lines.append("- Variant calling and annotation")
        lines.append("- Differential expression analysis")
    elif category == 'microscopy_imaging':
        lines.append("- Image quality assessment")
        lines.append("- Background correction and normalization")
        lines.append("- Segmentation and object detection")
        lines.append("- Colocalization analysis")
        lines.append("- Intensity measurements and quantification")
    lines.append("")
    # Footer
    lines.append("---")
    lines.append("*This report was generated by the exploratory-data-analysis skill.*")
    report = '\n'.join(lines)
    if output_path:
        # Explicit UTF-8: the report contains non-ASCII characters and
        # platform-default encodings (e.g. cp1252) would fail to write them.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"Report saved to: {output_path}")
    else:
        print(report)
    return report
def main():
    """Command-line entry point: parse args, run analysis, write report."""
    args = sys.argv[1:]
    if not args:
        print("Usage: python eda_analyzer.py <filepath> [output.md]")
        print(" filepath: Path to the data file to analyze")
        print(" output.md: Optional output path for markdown report")
        sys.exit(1)
    filepath = args[0]
    output_path = args[1] if len(args) > 1 else None
    if not os.path.exists(filepath):
        print(f"Error: File not found: {filepath}")
        sys.exit(1)
    # Default the report path to "<input stem>_eda_report.md" next to the input.
    if output_path is None:
        input_path = Path(filepath)
        output_path = input_path.parent / f"{input_path.stem}_eda_report.md"
    print(f"Analyzing: {filepath}")
    analysis = analyze_file(filepath)
    print(f"\nGenerating report...")
    generate_markdown_report(analysis, output_path)
    print(f"\n✓ Analysis complete!")
if __name__ == '__main__':
    main()