#!/usr/bin/env python3
"""
Check clinical reports for HIPAA identifiers that need removal.

Scans text for 18 HIPAA identifiers and flags potential privacy violations.

Usage:
    python check_deidentification.py <input_file>
    python check_deidentification.py <input_file> --output violations.json
    python check_deidentification.py <input_file> --json
"""

import argparse
import json
import re
from pathlib import Path
from typing import Dict, List, Optional


# Regex patterns for the 18 HIPAA identifier categories.
#
# Each entry maps an identifier key to:
#   "description": human-readable name of the category,
#   "patterns":    list of regular expressions that flag a potential match,
#   "severity":    one of "CRITICAL", "HIGH" or "MEDIUM".
#
# All groups are non-capturing on purpose: with capturing groups,
# ``re.findall`` returns the group contents (e.g. "Dr.") instead of the
# full matched text, which makes the reported examples useless.
HIPAA_IDENTIFIERS = {
    "1_names": {
        "description": "Names (patient, family, providers)",
        "patterns": [
            r"\b(?:Dr\.|Mr\.|Mrs\.|Ms\.)\s+[A-Z][a-z]+",
            r"\b[A-Z][a-z]+,\s+[A-Z][a-z]+\b",  # Last, First
        ],
        "severity": "HIGH",
    },
    "2_geographic": {
        "description": "Geographic subdivisions smaller than state",
        "patterns": [
            r"\b\d+\s+[A-Z][a-z]+\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr)\b",
            r"\b[A-Z][a-z]+,\s+[A-Z]{2}\s+\d{5}\b",  # City, ST ZIP
        ],
        "severity": "HIGH",
    },
    "3_dates": {
        "description": "Dates (except year)",
        "patterns": [
            r"\b(?:0?[1-9]|1[0-2])/(?:0?[1-9]|[12][0-9]|3[01])/\d{4}\b",
            r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},\s+\d{4}\b",
            r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b",
        ],
        "severity": "HIGH",
    },
    "4_telephone": {
        "description": "Telephone numbers",
        "patterns": [
            r"\b\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",
            r"\b1-\d{3}-\d{3}-\d{4}\b",
        ],
        "severity": "HIGH",
    },
    "5_fax": {
        "description": "Fax numbers",
        "patterns": [
            r"(?i)fax[:]\s*\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
        ],
        "severity": "HIGH",
    },
    "6_email": {
        "description": "Email addresses",
        "patterns": [
            # The TLD class is [A-Za-z]; a "|" inside a character class is a
            # literal pipe, not alternation, so it must not appear here.
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        ],
        "severity": "HIGH",
    },
    "7_ssn": {
        "description": "Social Security numbers",
        "patterns": [
            r"\b\d{3}-\d{2}-\d{4}\b",
            r"\b\d{9}\b",
        ],
        "severity": "CRITICAL",
    },
    "8_mrn": {
        "description": "Medical record numbers",
        "patterns": [
            r"(?i)(?:mrn|medical\s+record\s+(?:number|#))[:]\s*\d+",
            r"(?i)patient\s+id[:]\s*\d+",
        ],
        "severity": "HIGH",
    },
    "9_health_plan": {
        "description": "Health plan beneficiary numbers",
        "patterns": [
            r"(?i)(?:insurance|policy)\s+(?:number|#|id)[:]\s*[A-Z0-9]+",
        ],
        "severity": "HIGH",
    },
    "10_account": {
        "description": "Account numbers",
        "patterns": [
            r"(?i)account\s+(?:number|#)[:]\s*\d+",
        ],
        "severity": "MEDIUM",
    },
    "11_license": {
        "description": "Certificate/license numbers",
        "patterns": [
            r"(?i)(?:driver[']?s\s+license|DL)[:]\s*[A-Z0-9]+",
        ],
        "severity": "MEDIUM",
    },
    "12_vehicle": {
        "description": "Vehicle identifiers",
        "patterns": [
            r"(?i)(?:license\s+plate|VIN)[:]\s*[A-Z0-9]+",
        ],
        "severity": "MEDIUM",
    },
    "13_device": {
        "description": "Device identifiers and serial numbers",
        "patterns": [
            r"(?i)(?:serial|device)\s+(?:number|#)[:]\s*[A-Z0-9-]+",
        ],
        "severity": "MEDIUM",
    },
    "14_url": {
        "description": "Web URLs",
        "patterns": [
            r"https?://[^\s]+",
            r"www\.[^\s]+",
        ],
        "severity": "MEDIUM",
    },
    "15_ip": {
        "description": "IP addresses",
        "patterns": [
            r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
        ],
        "severity": "HIGH",
    },
    "16_biometric": {
        "description": "Biometric identifiers",
        "patterns": [
            r"(?i)(?:fingerprint|voiceprint|retinal\s+scan)",
        ],
        "severity": "CRITICAL",
    },
    "17_photos": {
        "description": "Full-face photographs",
        "patterns": [
            r"(?i)(?:photograph|photo|image).*face",
            r"\.(?:jpg|jpeg|png|gif)\b",
        ],
        "severity": "HIGH",
    },
    "18_unique": {
        "description": "Any other unique identifying characteristic",
        "patterns": [
            r"(?i)(?:tattoo|birthmark|scar).*unique",
        ],
        "severity": "MEDIUM",
    },
}
def check_identifiers(text: str, identifiers: Optional[Dict] = None) -> Dict:
    """Scan *text* for HIPAA identifier patterns.

    Args:
        text: Document text to scan.
        identifiers: Optional mapping with the same shape as
            ``HIPAA_IDENTIFIERS`` (key -> dict with "description",
            "patterns" and "severity"). Defaults to ``HIPAA_IDENTIFIERS``.

    Returns:
        A dict with:
          "total_violations": number of identifier categories that matched,
          "total_instances":  total number of matches over all categories,
          "violations":       per-category dict holding "description",
                              "severity", "count" and up to five distinct
                              "examples" in order of first appearance.
    """
    if identifiers is None:
        identifiers = HIPAA_IDENTIFIERS

    violations = {}
    total_instances = 0

    for identifier_id, config in identifiers.items():
        matches = []
        for pattern in config["patterns"]:
            # finditer + group(0) always yields the full matched text;
            # findall would return only the captured groups (or tuples of
            # them) whenever a pattern contains capturing groups.
            # NOTE(review): re.IGNORECASE is applied to every pattern, so
            # capitalisation hints such as [A-Z][a-z]+ are not enforced.
            # Kept as-is so detection counts do not change.
            matches.extend(
                found.group(0)
                for found in re.finditer(pattern, text, re.IGNORECASE)
            )

        if not matches:
            continue

        # dict.fromkeys de-duplicates while preserving first-seen order, so
        # the reported examples are deterministic (a set is not ordered).
        examples = list(dict.fromkeys(matches))[:5]
        violations[identifier_id] = {
            "description": config["description"],
            "severity": config["severity"],
            "count": len(matches),
            "examples": examples,
        }
        total_instances += len(matches)

    return {
        "total_violations": len(violations),
        "total_instances": total_instances,
        "violations": violations,
    }
def check_age_compliance(text: str) -> Dict:
    """Check whether *text* states any explicit age above 89.

    Recognised forms include "92 years old", "92-year-old", "92 yrs old",
    "92 y/o" and "92 yo" (case-insensitive).

    Args:
        text: Document text to scan.

    Returns:
        A dict with:
          "ages_over_89": number of stated ages greater than 89,
          "examples":     up to five of those ages, in order of appearance,
          "compliant":    True when no age above 89 was found.
    """
    # [\s-]* between the number and the unit is required so the hyphenated
    # form "92-year-old" is detected; the trailing (?!\w) keeps words such as
    # "older" or "york" from matching.
    age_pattern = (
        r"\b(\d{2,3})[\s-]*"
        r"(?:(?:year|yr)s?[\s-]*old|y\.?/?o\.?)(?!\w)"
    )
    ages = [int(age) for age in re.findall(age_pattern, text, re.IGNORECASE)]
    over_89 = [age for age in ages if age > 89]

    return {
        "ages_over_89": len(over_89),
        "examples": over_89[:5],
        "compliant": not over_89,
    }
def generate_report(filename: str) -> Dict:
    """Generate a de-identification compliance report for one file.

    The file is read as UTF-8, scanned with ``check_identifiers`` and
    ``check_age_compliance``, and classified as:

      * "NON_COMPLIANT" - any CRITICAL category matched, or three or more
        HIGH categories matched;
      * "NEEDS_REVIEW"  - any other identifier category matched, or an age
        above 89 was found;
      * "COMPLIANT"     - nothing was detected.

    Args:
        filename: Path to the text file to check.

    Returns:
        A dict with "filename", "status", "identifier_violations",
        "age_compliance" and "recommendation".

    Raises:
        FileNotFoundError: If *filename* does not exist.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    filepath = Path(filename)

    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filename}")

    text = filepath.read_text(encoding="utf-8")

    identifier_check = check_identifiers(text)
    age_check = check_age_compliance(text)

    # Count matched categories (not individual matches) per severity.
    severities = [
        details["severity"]
        for details in identifier_check["violations"].values()
    ]
    critical_violations = severities.count("CRITICAL")
    high_violations = severities.count("HIGH")

    if critical_violations > 0 or high_violations >= 3:
        status = "NON_COMPLIANT"
    elif identifier_check["total_violations"] > 0 or not age_check["compliant"]:
        # Any detected identifier - including MEDIUM-only findings - must be
        # reviewed; reporting COMPLIANT alongside listed violations would be
        # contradictory and would yield a zero exit code.
        status = "NEEDS_REVIEW"
    else:
        status = "COMPLIANT"

    return {
        "filename": str(filename),
        "status": status,
        "identifier_violations": identifier_check,
        "age_compliance": age_check,
        "recommendation": get_recommendation(status, identifier_check, age_check),
    }
def get_recommendation(status: str, identifiers: Dict, ages: Dict) -> str:
    """Build a human-readable recommendation from the check results.

    Args:
        status: Overall status; "COMPLIANT" short-circuits to a fixed message.
        identifiers: Result of the identifier check; reads
            "total_violations" and "total_instances".
        ages: Result of the age check; reads "compliant" and "ages_over_89".

    Returns:
        A single string with one sentence per applicable recommendation.
        Never empty.
    """
    if status == "COMPLIANT":
        return "Document appears compliant. Perform final manual review before publication."

    recommendations = []

    if identifiers["total_violations"] > 0:
        recommendations.append(
            f"Remove or redact {identifiers['total_instances']} identified HIPAA identifiers."
        )

    if not ages["compliant"]:
        recommendations.append(
            f"Aggregate {ages['ages_over_89']} age(s) >89 years to '90 or older' or '>89 years'."
        )

    if not recommendations:
        # A non-compliant status with no counted findings would otherwise
        # produce an empty recommendation.
        return "Manual review required: no specific findings were recorded for this status."

    return " ".join(recommendations)
def print_report(report: Dict):
    """Print a human-readable version of *report* to stdout.

    Args:
        report: Dict with "filename", "status", "identifier_violations"
            (holding "total_violations", "total_instances" and
            "violations"), "age_compliance" (holding "compliant",
            "ages_over_89" and "examples") and "recommendation".

    Violations are listed by severity: CRITICAL, then HIGH, then MEDIUM.
    Any other severity value sorts last instead of raising ``KeyError``.
    """
    severity_rank = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2}
    severity_symbols = {"CRITICAL": "⚠⚠⚠", "HIGH": "⚠⚠"}
    banner = "=" * 70

    identifiers = report["identifier_violations"]
    age_check = report["age_compliance"]

    print(banner)
    print("HIPAA DE-IDENTIFICATION CHECK")
    print(f"File: {report['filename']}")
    print(banner)
    print()

    print(f"Overall Status: {report['status']}")
    print()

    if identifiers["total_violations"] == 0:
        print("✓ No HIPAA identifiers detected")
    else:
        print(f"⚠ Found {identifiers['total_violations']} types of violations")
        print(f" Total instances: {identifiers['total_instances']}")
        print()

        print("Violations by type:")
        print("-" * 70)

        # sorted() is stable, so categories of equal severity keep their
        # original order; unknown severities rank after MEDIUM.
        ordered = sorted(
            identifiers["violations"].values(),
            key=lambda details: severity_rank.get(
                details["severity"], len(severity_rank)
            ),
        )
        for details in ordered:
            severity_symbol = severity_symbols.get(details["severity"], "⚠")
            print(f"{severity_symbol} [{details['severity']:8}] {details['description']}")
            print(f" Count: {details['count']}")
            print(" Examples:")
            for example in details["examples"]:
                print(f" - {example}")
            print()

    if age_check["compliant"]:
        print("✓ Age reporting compliant (no ages >89 or properly aggregated)")
    else:
        print(f"⚠ Age compliance issue: {age_check['ages_over_89']} age(s) >89 detected")
        print(" Ages must be aggregated to '90 or older' or '>89 years'")
        print(f" Ages found: {age_check['examples']}")

    print()
    print("Recommendation:")
    print(report["recommendation"])
    print(banner)
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        0 when the report status is "COMPLIANT"; 1 when violations were
        found or an error occurred while generating or writing the report.
    """
    import sys  # local import: only needed to address stderr/stdout here

    parser = argparse.ArgumentParser(
        description="Check clinical reports for HIPAA identifiers"
    )
    parser.add_argument("input_file", help="Path to clinical report file")
    parser.add_argument("--output", "-o", help="Output JSON report to file")
    parser.add_argument("--json", action="store_true", help="Output JSON to stdout")

    args = parser.parse_args(argv)

    try:
        report = generate_report(args.input_file)

        if args.json:
            print(json.dumps(report, indent=2))
        else:
            print_report(report)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(report, f, indent=2)
            # With --json, stdout must contain nothing but the JSON document,
            # so the confirmation goes to stderr in that mode.
            notice_stream = sys.stderr if args.json else sys.stdout
            print(f"\nJSON report saved to: {args.output}", file=notice_stream)
    except Exception as e:
        # Top-level boundary: report the failure on stderr (keeps stdout
        # clean for --json consumers) and signal it through the exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Exit with non-zero if violations found
    return 0 if report["status"] == "COMPLIANT" else 1
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    import sys

    sys.exit(main())