tercul-backend/scripts/analyze_data.py
#!/usr/bin/env python3
"""
Data Analysis Script for Tercul Database Migration
This script analyzes the current SQLite database to understand data quality,
relationships, and prepare for migration to the Go/GORM schema.
"""
import sqlite3
import json
import re
from datetime import datetime
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TableStats:
    """Statistics for a database table"""
    name: str
    record_count: int
    sample_records: List[Dict[str, Any]]
    column_info: List[Dict[str, Any]]
    foreign_keys: List[Dict[str, Any]]
    data_quality_issues: List[str]


class DataAnalyzer:
    def __init__(self, db_path: str):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row
        self.stats: Dict[str, TableStats] = {}

    def __del__(self):
        if hasattr(self, 'conn'):
            self.conn.close()

    def get_table_names(self) -> List[str]:
        """Get all table names from the database"""
        cursor = self.conn.cursor()
        # Skip SQLite's internal bookkeeping tables (e.g. sqlite_sequence).
        cursor.execute(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name"
        )
        return [row[0] for row in cursor.fetchall()]
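
    # PRAGMA table_info reports one row per column as
    # (cid, name, type, notnull, dflt_value, pk); the indices below follow that order.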
    def get_table_info(self, table_name: str) -> List[Dict[str, Any]]:
        """Get column information for a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA table_info({table_name})")
        columns = []
        for row in cursor.fetchall():
            columns.append({
                'name': row[1],
                'type': row[2],
                'not_null': bool(row[3]),
                'default_value': row[4],
                'primary_key': bool(row[5])
            })
        return columns
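
    # PRAGMA foreign_key_list reports one row per FK column pair as
    # (id, seq, table, from, to, on_update, on_delete, match).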
    def get_foreign_keys(self, table_name: str) -> List[Dict[str, Any]]:
        """Get foreign key information for a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA foreign_key_list({table_name})")
        foreign_keys = []
        for row in cursor.fetchall():
            foreign_keys.append({
                'id': row[0],
                'seq': row[1],
                'table': row[2],
                'from': row[3],
                'to': row[4],
                'on_update': row[5],
                'on_delete': row[6],
                'match': row[7]
            })
        return foreign_keys

    def get_record_count(self, table_name: str) -> int:
        """Get the number of records in a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        return cursor.fetchone()[0]

    def get_sample_records(self, table_name: str, limit: int = 3) -> List[Dict[str, Any]]:
        """Get sample records from a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT * FROM {table_name} LIMIT {limit}")
        records = []
        for row in cursor.fetchall():
            record = {}
            for key in row.keys():
                value = row[key]
                # Convert to JSON-serializable values so the report can be dumped later
                if isinstance(value, datetime):
                    value = value.isoformat()
                elif isinstance(value, bytes):
                    value = value.decode('utf-8', errors='replace')
                record[key] = value
            records.append(record)
        return records
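
    # The canonical textual UUID form is five hex groups (8-4-4-4-12);
    # anything else is flagged so it can be normalized before the GORM import.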
    def analyze_uuid_format(self, table_name: str, id_column: str = 'id') -> List[str]:
        """Analyze UUID format in ID columns"""
        # Skip tables that do not have the expected ID column.
        column_names = {col['name'] for col in self.get_table_info(table_name)}
        if id_column not in column_names:
            return []
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT {id_column} FROM {table_name} WHERE {id_column} IS NOT NULL LIMIT 100")
        issues = []
        uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
        for row in cursor.fetchall():
            uuid_value = row[0]
            if uuid_value and not uuid_pattern.match(str(uuid_value)):
                issues.append(f"Invalid UUID format: {uuid_value}")
        return issues[:10]  # Limit to first 10 issues

    def analyze_timestamp_format(self, table_name: str) -> List[str]:
        """Analyze timestamp format in datetime columns"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA table_info({table_name})")
        timestamp_columns = []
        for row in cursor.fetchall():
            column_name = row[1].lower()
            # Match created_at/updated_at style names explicitly; a bare substring
            # check on 'at' would also catch columns like 'status' or 'rating'.
            if 'time' in column_name or 'date' in column_name or column_name.endswith('_at'):
                timestamp_columns.append(row[1])
        issues = []
        for column in timestamp_columns:
            cursor.execute(f"SELECT {column} FROM {table_name} WHERE {column} IS NOT NULL LIMIT 50")
            for row in cursor.fetchall():
                timestamp_value = row[0]
                if not timestamp_value:
                    continue
                try:
                    # Try to parse common timestamp formats
                    if isinstance(timestamp_value, str):
                        formats = [
                            '%Y-%m-%d %H:%M:%S',
                            '%Y-%m-%d %H:%M:%S.%f',
                            '%Y-%m-%d',
                            '%Y-%m-%dT%H:%M:%S',
                            '%Y-%m-%dT%H:%M:%S.%fZ'
                        ]
                        parsed = False
                        for fmt in formats:
                            try:
                                datetime.strptime(timestamp_value, fmt)
                                parsed = True
                                break
                            except ValueError:
                                continue
                        if not parsed:
                            issues.append(f"Invalid timestamp format in {column}: {timestamp_value}")
                except Exception as e:
                    issues.append(f"Error parsing timestamp in {column}: {timestamp_value} - {e}")
        return issues[:10]  # Limit to first 10 issues
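
    # Orphan detection: LEFT JOIN the referenced table and count child rows whose
    # FK value is set but has no matching parent row.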
    def analyze_relationships(self, table_name: str) -> List[str]:
        """Analyze relationship integrity"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA foreign_key_list({table_name})")
        foreign_keys = cursor.fetchall()
        issues = []
        for fk in foreign_keys:
            from_column = fk[3]
            to_table = fk[2]
            to_column = fk[4]
            # Check for orphaned references
            cursor.execute(f"""
                SELECT COUNT(*) FROM {table_name} t1
                LEFT JOIN {to_table} t2 ON t1.{from_column} = t2.{to_column}
                WHERE t1.{from_column} IS NOT NULL AND t2.{to_column} IS NULL
            """)
            orphaned_count = cursor.fetchone()[0]
            if orphaned_count > 0:
                issues.append(f"Found {orphaned_count} orphaned references to {to_table}.{to_column}")
        return issues
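
    # The '--- !ruby/hash:...' prefix is how Rails/ActiveSupport serializes hashes to
    # YAML; such columns will likely need re-encoding (e.g. to JSON) before migration.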
    def analyze_content_format(self, table_name: str, content_columns: List[str]) -> List[str]:
        """Analyze content format in text columns"""
        cursor = self.conn.cursor()
        issues = []
        for column in content_columns:
            cursor.execute(f"SELECT {column} FROM {table_name} WHERE {column} IS NOT NULL LIMIT 20")
            for row in cursor.fetchall():
                content = row[0]
                if content and isinstance(content, str):
                    # Check for YAML/Ruby format
                    if '--- !ruby/hash:ActiveSupport::HashWithIndifferentAccess' in content:
                        issues.append(f"YAML/Ruby format detected in {column}")
                    # Check for HTML content
                    if '<p>' in content or '<div>' in content:
                        issues.append(f"HTML content detected in {column}")
                    # Check for very long content
                    if len(content) > 10000:
                        issues.append(f"Very long content in {column}: {len(content)} characters")
        return list(set(issues))  # Remove duplicates

    def analyze_table(self, table_name: str) -> TableStats:
        """Comprehensive analysis of a single table"""
        print(f"Analyzing table: {table_name}")
        # Basic information
        record_count = self.get_record_count(table_name)
        column_info = self.get_table_info(table_name)
        foreign_keys = self.get_foreign_keys(table_name)
        sample_records = self.get_sample_records(table_name)
        # Data quality analysis
        data_quality_issues = []
        # UUID analysis
        uuid_issues = self.analyze_uuid_format(table_name)
        data_quality_issues.extend(uuid_issues)
        # Timestamp analysis
        timestamp_issues = self.analyze_timestamp_format(table_name)
        data_quality_issues.extend(timestamp_issues)
        # Relationship analysis
        relationship_issues = self.analyze_relationships(table_name)
        data_quality_issues.extend(relationship_issues)
        # Content analysis for text columns
        text_columns = [
            col['name'] for col in column_info
            if 'text' in col['type'].lower() or col['name'] in ['body', 'content', 'description']
        ]
        content_issues = self.analyze_content_format(table_name, text_columns)
        data_quality_issues.extend(content_issues)
        stats = TableStats(
            name=table_name,
            record_count=record_count,
            sample_records=sample_records,
            column_info=column_info,
            foreign_keys=foreign_keys,
            data_quality_issues=data_quality_issues
        )
        self.stats[table_name] = stats
        return stats

    def analyze_all_tables(self) -> Dict[str, TableStats]:
        """Analyze all tables in the database"""
        table_names = self.get_table_names()
        for table_name in table_names:
            try:
                self.analyze_table(table_name)
            except Exception as e:
                print(f"Error analyzing table {table_name}: {e}")
        return self.stats

    def generate_report(self, output_file: str = "data_analysis_report.json"):
        """Generate a comprehensive analysis report"""
        report = {
            'database_info': {
                'path': self.db_path,
                'analyzed_at': datetime.now().isoformat(),
                'total_tables': len(self.stats)
            },
            'summary': {
                'total_records': sum(stats.record_count for stats in self.stats.values()),
                'tables_with_issues': len([stats for stats in self.stats.values() if stats.data_quality_issues]),
                'total_issues': sum(len(stats.data_quality_issues) for stats in self.stats.values())
            },
            'tables': {}
        }
        for table_name, stats in self.stats.items():
            report['tables'][table_name] = {
                'record_count': stats.record_count,
                'columns': [col['name'] for col in stats.column_info],
                'foreign_keys': [fk['table'] for fk in stats.foreign_keys],
                'data_quality_issues': stats.data_quality_issues,
                'sample_records': stats.sample_records[:2]  # Limit sample records in report
            }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        print(f"Analysis report saved to: {output_file}")
        return report

    def print_summary(self):
        """Print a summary of the analysis"""
        print("\n" + "=" * 80)
        print("DATA ANALYSIS SUMMARY")
        print("=" * 80)
        total_records = sum(stats.record_count for stats in self.stats.values())
        total_issues = sum(len(stats.data_quality_issues) for stats in self.stats.values())
        print(f"Total tables analyzed: {len(self.stats)}")
        print(f"Total records: {total_records:,}")
        print(f"Total data quality issues: {total_issues}")
        print()
        print("Table Statistics:")
        print("-" * 60)
        for table_name, stats in sorted(self.stats.items()):
            print(f"{table_name:25} | {stats.record_count:8,} records | {len(stats.data_quality_issues):2} issues")
        print("\nTables with Issues:")
        print("-" * 60)
        for table_name, stats in self.stats.items():
            if stats.data_quality_issues:
                print(f"\n{table_name}:")
                for issue in stats.data_quality_issues[:3]:  # Show first 3 issues
                    print(f"  - {issue}")
                if len(stats.data_quality_issues) > 3:
                    print(f"  ... and {len(stats.data_quality_issues) - 3} more issues")


def main():
    """Main function to run the data analysis"""
    db_path = "tercul_data.db"
    if not Path(db_path).exists():
        print(f"Database file not found: {db_path}")
        return
    print("Starting data analysis...")
    analyzer = DataAnalyzer(db_path)
    # Analyze all tables
    analyzer.analyze_all_tables()
    # Generate report
    analyzer.generate_report()
    # Print summary
    analyzer.print_summary()


if __name__ == "__main__":
    main()