#!/usr/bin/env python3
"""
Data Analysis Script for Tercul Database Migration

This script analyzes the current SQLite database to understand data quality
and relationships, and to prepare for migration to the Go/GORM schema.
"""

import sqlite3
import json
import re
from datetime import datetime
from typing import Dict, List, Any
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TableStats:
    """Statistics for a database table"""
    name: str
    record_count: int
    sample_records: List[Dict[str, Any]]
    column_info: List[Dict[str, Any]]
    foreign_keys: List[Dict[str, Any]]
    data_quality_issues: List[str]


class DataAnalyzer:
    """Collects per-table statistics and data quality issues from a SQLite database."""

    def __init__(self, db_path: str):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row
        self.stats: Dict[str, TableStats] = {}

    def __del__(self):
        # Best-effort cleanup; __init__ may have failed before conn was set
        if hasattr(self, 'conn'):
            self.conn.close()

    def get_table_names(self) -> List[str]:
        """Get all table names from the database"""
        cursor = self.conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        return [row[0] for row in cursor.fetchall()]

    def get_table_info(self, table_name: str) -> List[Dict[str, Any]]:
        """Get column information for a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA table_info({table_name})")
        columns = []
        for row in cursor.fetchall():
            columns.append({
                'name': row[1],
                'type': row[2],
                'not_null': bool(row[3]),
                'default_value': row[4],
                'primary_key': bool(row[5])
            })
        return columns

    def get_foreign_keys(self, table_name: str) -> List[Dict[str, Any]]:
        """Get foreign key information for a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA foreign_key_list({table_name})")
        foreign_keys = []
        for row in cursor.fetchall():
            foreign_keys.append({
                'id': row[0],
                'seq': row[1],
                'table': row[2],
                'from': row[3],
                'to': row[4],
                'on_update': row[5],
                'on_delete': row[6],
                'match': row[7]
            })
        return foreign_keys

    def get_record_count(self, table_name: str) -> int:
        """Get the number of records in a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        return cursor.fetchone()[0]

    def get_sample_records(self, table_name: str, limit: int = 3) -> List[Dict[str, Any]]:
        """Get sample records from a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT * FROM {table_name} LIMIT {limit}")
        records = []
        for row in cursor.fetchall():
            record = {}
            for key in row.keys():
                value = row[key]
                # Convert to JSON-serializable format
                if isinstance(value, datetime):
                    value = value.isoformat()
                elif isinstance(value, bytes):
                    # BLOB values would otherwise make json.dump fail later
                    value = value.hex()
                record[key] = value
            records.append(record)
        return records

    def analyze_uuid_format(self, table_name: str, id_column: str = 'id') -> List[str]:
        """Analyze UUID format in ID columns"""
        # Skip tables without the expected ID column instead of raising,
        # which would abort the analysis of the whole table
        if id_column not in [col['name'] for col in self.get_table_info(table_name)]:
            return []
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT {id_column} FROM {table_name} WHERE {id_column} IS NOT NULL LIMIT 100")
        uuid_pattern = re.compile(
            r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
            re.IGNORECASE,
        )
        issues = []
        for row in cursor.fetchall():
            uuid_value = row[0]
            if uuid_value and not uuid_pattern.match(str(uuid_value)):
                issues.append(f"Invalid UUID format: {uuid_value}")
        return issues[:10]  # Limit to first 10 issues
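    # For reference: the pattern above accepts only canonical, dash-separated
    # 36-character UUIDs such as "550e8400-e29b-41d4-a716-446655440000"
    # (case-insensitive). Anything else, e.g. a plain integer ID like "42" or
    # an undashed hex string, is flagged so the migration can decide whether
    # those identifiers need to be regenerated before import.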
    def analyze_timestamp_format(self, table_name: str) -> List[str]:
        """Analyze timestamp format in datetime columns"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA table_info({table_name})")
        timestamp_columns = []
        for row in cursor.fetchall():
            name = row[1].lower()
            # Match created_at/updated_at-style columns explicitly; a bare
            # 'at' substring would also match names like 'category' or 'status'
            if 'time' in name or 'date' in name or name.endswith('_at'):
                timestamp_columns.append(row[1])
        issues = []
        for column in timestamp_columns:
            cursor.execute(f"SELECT {column} FROM {table_name} WHERE {column} IS NOT NULL LIMIT 50")
            for row in cursor.fetchall():
                timestamp_value = row[0]
                if not timestamp_value:
                    continue
                try:
                    # Try to parse common timestamp formats
                    if isinstance(timestamp_value, str):
                        formats = [
                            '%Y-%m-%d %H:%M:%S',
                            '%Y-%m-%d %H:%M:%S.%f',
                            '%Y-%m-%d',
                            '%Y-%m-%dT%H:%M:%S',
                            '%Y-%m-%dT%H:%M:%S.%fZ',
                        ]
                        parsed = False
                        for fmt in formats:
                            try:
                                datetime.strptime(timestamp_value, fmt)
                                parsed = True
                                break
                            except ValueError:
                                continue
                        if not parsed:
                            issues.append(f"Invalid timestamp format in {column}: {timestamp_value}")
                except Exception as e:
                    issues.append(f"Error parsing timestamp in {column}: {timestamp_value} - {e}")
        return issues[:10]  # Limit to first 10 issues

    def analyze_relationships(self, table_name: str) -> List[str]:
        """Analyze relationship integrity"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA foreign_key_list({table_name})")
        foreign_keys = cursor.fetchall()
        issues = []
        for fk in foreign_keys:
            to_table = fk[2]
            from_column = fk[3]
            to_column = fk[4]
            # Count rows whose foreign key points at a missing parent row
            cursor.execute(f"""
                SELECT COUNT(*) FROM {table_name} t1
                LEFT JOIN {to_table} t2 ON t1.{from_column} = t2.{to_column}
                WHERE t1.{from_column} IS NOT NULL AND t2.{to_column} IS NULL
            """)
            orphaned_count = cursor.fetchone()[0]
            if orphaned_count > 0:
                issues.append(f"Found {orphaned_count} orphaned references to {to_table}.{to_column}")
        return issues

    def analyze_content_format(self, table_name: str, content_columns: List[str]) -> List[str]:
        """Analyze content format in text columns"""
        cursor = self.conn.cursor()
        issues = []
        for column in content_columns:
            cursor.execute(f"SELECT {column} FROM {table_name} WHERE {column} IS NOT NULL LIMIT 20")
            for row in cursor.fetchall():
                content = row[0]
                if content and isinstance(content, str):
                    # Check for YAML/Ruby format (serialized Rails hashes)
                    if '--- !ruby/hash:ActiveSupport::HashWithIndifferentAccess' in content:
                        issues.append(f"YAML/Ruby format detected in {column}")
                    # Check for HTML content (common tags as representative markers)
                    if '<p>' in content or '<div>' in content:
                        issues.append(f"HTML content detected in {column}")
                    # Check for very long content
                    if len(content) > 10000:
                        issues.append(f"Very long content in {column}: {len(content)} characters")
        return list(set(issues))  # Remove duplicates

    def analyze_table(self, table_name: str) -> TableStats:
        """Comprehensive analysis of a single table"""
        print(f"Analyzing table: {table_name}")

        # Basic information
        record_count = self.get_record_count(table_name)
        column_info = self.get_table_info(table_name)
        foreign_keys = self.get_foreign_keys(table_name)
        sample_records = self.get_sample_records(table_name)

        # Data quality analysis
        data_quality_issues = []

        # UUID analysis
        data_quality_issues.extend(self.analyze_uuid_format(table_name))

        # Timestamp analysis
        data_quality_issues.extend(self.analyze_timestamp_format(table_name))

        # Relationship analysis
        data_quality_issues.extend(self.analyze_relationships(table_name))

        # Content analysis for text columns
        text_columns = [
            col['name'] for col in column_info
            if 'text' in col['type'].lower() or col['name'] in ['body', 'content', 'description']
        ]
        data_quality_issues.extend(self.analyze_content_format(table_name, text_columns))

        stats = TableStats(
            name=table_name,
            record_count=record_count,
            sample_records=sample_records,
            column_info=column_info,
            foreign_keys=foreign_keys,
            data_quality_issues=data_quality_issues,
        )
        self.stats[table_name] = stats
        return stats

    def analyze_all_tables(self) -> Dict[str, TableStats]:
        """Analyze all tables in the database"""
        for table_name in self.get_table_names():
            try:
                self.analyze_table(table_name)
            except Exception as e:
                print(f"Error analyzing table {table_name}: {e}")
        return self.stats

    def generate_report(self, output_file: str = "data_analysis_report.json"):
        """Generate a comprehensive analysis report"""
        report = {
            'database_info': {
                'path': self.db_path,
                'analyzed_at': datetime.now().isoformat(),
                'total_tables': len(self.stats),
            },
            'summary': {
                'total_records': sum(s.record_count for s in self.stats.values()),
                'tables_with_issues': len([s for s in self.stats.values() if s.data_quality_issues]),
                'total_issues': sum(len(s.data_quality_issues) for s in self.stats.values()),
            },
            'tables': {},
        }

        for table_name, stats in self.stats.items():
            report['tables'][table_name] = {
                'record_count': stats.record_count,
                'columns': [col['name'] for col in stats.column_info],
                'foreign_keys': [fk['table'] for fk in stats.foreign_keys],
                'data_quality_issues': stats.data_quality_issues,
                'sample_records': stats.sample_records[:2],  # Limit sample records in report
            }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        print(f"Analysis report saved to: {output_file}")
        return report
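    # Illustrative report shape (table and column names below are made up;
    # the real keys come from the analyzed database):
    #
    #   {
    #     "database_info": {"path": "tercul_data.db", "analyzed_at": "...", "total_tables": 12},
    #     "summary": {"total_records": 3400, "tables_with_issues": 4, "total_issues": 17},
    #     "tables": {
    #       "works": {
    #         "record_count": 120,
    #         "columns": ["id", "title", "body", "created_at"],
    #         "foreign_keys": ["authors"],
    #         "data_quality_issues": ["Invalid UUID format: 42"],
    #         "sample_records": [{"id": "...", "title": "..."}]
    #       }
    #     }
    #   }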
    def print_summary(self):
        """Print a summary of the analysis"""
        print("\n" + "=" * 80)
        print("DATA ANALYSIS SUMMARY")
        print("=" * 80)

        total_records = sum(stats.record_count for stats in self.stats.values())
        total_issues = sum(len(stats.data_quality_issues) for stats in self.stats.values())

        print(f"Total tables analyzed: {len(self.stats)}")
        print(f"Total records: {total_records:,}")
        print(f"Total data quality issues: {total_issues}")
        print()

        print("Table Statistics:")
        print("-" * 60)
        for table_name, stats in sorted(self.stats.items()):
            print(f"{table_name:25} | {stats.record_count:8,} records | {len(stats.data_quality_issues):2} issues")

        print("\nTables with Issues:")
        print("-" * 60)
        for table_name, stats in self.stats.items():
            if stats.data_quality_issues:
                print(f"\n{table_name}:")
                for issue in stats.data_quality_issues[:3]:  # Show first 3 issues
                    print(f"  - {issue}")
                if len(stats.data_quality_issues) > 3:
                    print(f"  ... and {len(stats.data_quality_issues) - 3} more issues")


def main():
    """Main function to run the data analysis"""
    db_path = "tercul_data.db"

    if not Path(db_path).exists():
        print(f"Database file not found: {db_path}")
        return

    print("Starting data analysis...")
    analyzer = DataAnalyzer(db_path)

    # Analyze all tables
    analyzer.analyze_all_tables()

    # Generate report
    analyzer.generate_report()

    # Print summary
    analyzer.print_summary()


if __name__ == "__main__":
    main()
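# Example usage (a sketch; the script filename and table name below are
# placeholders, and main() expects tercul_data.db in the working directory):
#
#   $ python analyze_data.py
#
# The analyzer can also be driven programmatically against any SQLite file:
#
#   analyzer = DataAnalyzer("path/to/snapshot.db")
#   stats = analyzer.analyze_table("translations")  # hypothetical table name
#   print(stats.record_count, stats.data_quality_issues)
#   analyzer.generate_report("translations_report.json")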