Mirror of https://github.com/SamyRai/tercul-backend.git (synced 2025-12-27 05:11:34 +00:00)
- Core Go application with GraphQL API using gqlgen
- Comprehensive data models for literary works, authors, translations
- Repository pattern with caching layer
- Authentication and authorization system
- Linguistics analysis capabilities with multiple adapters
- Vector search integration with Weaviate
- Docker containerization support
- Python data migration and analysis scripts
- Clean architecture with proper separation of concerns
- Production-ready configuration and middleware
- Proper .gitignore excluding vendor/, database files, and build artifacts
338 lines
13 KiB
Python
#!/usr/bin/env python3
"""
Data Analysis Script for Tercul Database Migration

This script analyzes the current SQLite database to understand data quality,
relationships, and prepare for migration to the Go/GORM schema.
"""
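
# Typical invocation (a sketch; per main() below, the script expects
# tercul_data.db in the current working directory):
#
#     python3 <this_script>.py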

import sqlite3
import json
import re
from datetime import datetime
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TableStats:
    """Statistics for a database table"""
    name: str
    record_count: int
    sample_records: List[Dict[str, Any]]
    column_info: List[Dict[str, Any]]
    foreign_keys: List[Dict[str, Any]]
    data_quality_issues: List[str]


class DataAnalyzer:
    def __init__(self, db_path: str):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row
        self.stats: Dict[str, TableStats] = {}

    def __del__(self):
        if hasattr(self, 'conn'):
            self.conn.close()

    def get_table_names(self) -> List[str]:
        """Get all table names from the database"""
        cursor = self.conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        return [row[0] for row in cursor.fetchall()]

    def get_table_info(self, table_name: str) -> List[Dict[str, Any]]:
        """Get column information for a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA table_info({table_name})")
        columns = []
        for row in cursor.fetchall():
            columns.append({
                'name': row[1],
                'type': row[2],
                'not_null': bool(row[3]),
                'default_value': row[4],
                'primary_key': bool(row[5])
            })
        return columns
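
    # For reference, PRAGMA table_info returns positional rows shaped like
    # (cid, name, type, notnull, dflt_value, pk), e.g. (0, 'id', 'TEXT', 1,
    # None, 1), which is what the index lookups above rely on.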

    def get_foreign_keys(self, table_name: str) -> List[Dict[str, Any]]:
        """Get foreign key information for a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA foreign_key_list({table_name})")
        foreign_keys = []
        for row in cursor.fetchall():
            foreign_keys.append({
                'id': row[0],
                'seq': row[1],
                'table': row[2],
                'from': row[3],
                'to': row[4],
                'on_update': row[5],
                'on_delete': row[6],
                'match': row[7]
            })
        return foreign_keys
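
    # PRAGMA foreign_key_list likewise returns positional rows:
    # (id, seq, table, from, to, on_update, on_delete, match). The 'to'
    # field is None when the FK references the parent's implicit primary key.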

    def get_record_count(self, table_name: str) -> int:
        """Get the number of records in a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        return cursor.fetchone()[0]

    def get_sample_records(self, table_name: str, limit: int = 3) -> List[Dict[str, Any]]:
        """Get sample records from a table"""
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT * FROM {table_name} LIMIT {limit}")
        records = []
        for row in cursor.fetchall():
            record = {}
            for key in row.keys():
                value = row[key]
                # Convert to JSON-serializable format
                if isinstance(value, datetime):
                    value = value.isoformat()
                record[key] = value
            records.append(record)
        return records
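
    # Note: sqlite3 returns TEXT columns as plain str unless the connection
    # is opened with detect_types (it is not here), so the datetime branch
    # above is effectively a defensive no-op.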

    def analyze_uuid_format(self, table_name: str, id_column: str = 'id') -> List[str]:
        """Analyze UUID format in ID columns"""
        # Guard: not every table has the requested ID column; skip those
        # instead of raising sqlite3.OperationalError mid-analysis.
        if id_column not in {col['name'] for col in self.get_table_info(table_name)}:
            return []
        cursor = self.conn.cursor()
        cursor.execute(f"SELECT {id_column} FROM {table_name} WHERE {id_column} IS NOT NULL LIMIT 100")
        issues = []
        uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)

        for row in cursor.fetchall():
            uuid_value = row[0]
            if uuid_value and not uuid_pattern.match(str(uuid_value)):
                issues.append(f"Invalid UUID format: {uuid_value}")

        return issues[:10]  # Limit to first 10 issues
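
    # Example: '123e4567-e89b-12d3-a456-426614174000' passes the pattern;
    # legacy integer IDs or ULIDs would be reported as invalid.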

    def analyze_timestamp_format(self, table_name: str) -> List[str]:
        """Analyze timestamp format in datetime columns"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA table_info({table_name})")
        timestamp_columns = []
        for row in cursor.fetchall():
            col_name = row[1].lower()
            # Match Rails-style created_at/updated_at via the '_at' suffix;
            # a bare 'at' substring test would also catch columns such as
            # 'status' or 'category'.
            if 'time' in col_name or 'date' in col_name or col_name.endswith('_at'):
                timestamp_columns.append(row[1])

        issues = []
        for column in timestamp_columns:
            cursor.execute(f"SELECT {column} FROM {table_name} WHERE {column} IS NOT NULL LIMIT 50")
            for row in cursor.fetchall():
                timestamp_value = row[0]
                if timestamp_value:
                    try:
                        # Try to parse common timestamp formats
                        if isinstance(timestamp_value, str):
                            formats = [
                                '%Y-%m-%d %H:%M:%S',
                                '%Y-%m-%d %H:%M:%S.%f',
                                '%Y-%m-%d',
                                '%Y-%m-%dT%H:%M:%S',
                                '%Y-%m-%dT%H:%M:%S.%fZ'
                            ]
                            parsed = False
                            for fmt in formats:
                                try:
                                    datetime.strptime(timestamp_value, fmt)
                                    parsed = True
                                    break
                                except ValueError:
                                    continue
                            if not parsed:
                                issues.append(f"Invalid timestamp format in {column}: {timestamp_value}")
                    except Exception as e:
                        issues.append(f"Error parsing timestamp in {column}: {timestamp_value} - {e}")

        return issues[:10]  # Limit to first 10 issues

    def analyze_relationships(self, table_name: str) -> List[str]:
        """Analyze relationship integrity"""
        cursor = self.conn.cursor()
        cursor.execute(f"PRAGMA foreign_key_list({table_name})")
        foreign_keys = cursor.fetchall()

        issues = []
        for fk in foreign_keys:
            from_column = fk[3]
            to_table = fk[2]
            # SQLite reports None for the target column when the FK points at
            # the parent's implicit primary key; fall back to the conventional
            # 'id' column in that case (an assumption about this schema).
            to_column = fk[4] or 'id'

            # Check for orphaned references
            cursor.execute(f"""
                SELECT COUNT(*) FROM {table_name} t1
                LEFT JOIN {to_table} t2 ON t1.{from_column} = t2.{to_column}
                WHERE t1.{from_column} IS NOT NULL AND t2.{to_column} IS NULL
            """)
            orphaned_count = cursor.fetchone()[0]
            if orphaned_count > 0:
                issues.append(f"Found {orphaned_count} orphaned references to {to_table}.{to_column}")

        return issues
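
    # For a hypothetical FK works.author_id -> authors.id, the generated
    # orphan check reads:
    #   SELECT COUNT(*) FROM works t1
    #   LEFT JOIN authors t2 ON t1.author_id = t2.id
    #   WHERE t1.author_id IS NOT NULL AND t2.id IS NULL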

    def analyze_content_format(self, table_name: str, content_columns: List[str]) -> List[str]:
        """Analyze content format in text columns"""
        cursor = self.conn.cursor()
        issues = []

        for column in content_columns:
            cursor.execute(f"SELECT {column} FROM {table_name} WHERE {column} IS NOT NULL LIMIT 20")
            for row in cursor.fetchall():
                content = row[0]
                if content and isinstance(content, str):
                    # Check for YAML/Ruby format
                    if '--- !ruby/hash:ActiveSupport::HashWithIndifferentAccess' in content:
                        issues.append(f"YAML/Ruby format detected in {column}")

                    # Check for HTML content
                    if '<p>' in content or '<div>' in content:
                        issues.append(f"HTML content detected in {column}")

                    # Check for very long content
                    if len(content) > 10000:
                        issues.append(f"Very long content in {column}: {len(content)} characters")

        return list(set(issues))  # Remove duplicates
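
    # The '--- !ruby/hash:...' marker indicates ActiveSupport hashes,
    # presumably serialized by a Ruby on Rails predecessor of this backend;
    # such fields need explicit conversion before loading into the Go/GORM
    # schema.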

    def analyze_table(self, table_name: str) -> TableStats:
        """Comprehensive analysis of a single table"""
        print(f"Analyzing table: {table_name}")

        # Basic information
        record_count = self.get_record_count(table_name)
        column_info = self.get_table_info(table_name)
        foreign_keys = self.get_foreign_keys(table_name)
        sample_records = self.get_sample_records(table_name)

        # Data quality analysis
        data_quality_issues = []

        # UUID analysis
        uuid_issues = self.analyze_uuid_format(table_name)
        data_quality_issues.extend(uuid_issues)

        # Timestamp analysis
        timestamp_issues = self.analyze_timestamp_format(table_name)
        data_quality_issues.extend(timestamp_issues)

        # Relationship analysis
        relationship_issues = self.analyze_relationships(table_name)
        data_quality_issues.extend(relationship_issues)

        # Content analysis for text columns
        text_columns = [col['name'] for col in column_info
                        if 'text' in col['type'].lower() or col['name'] in ['body', 'content', 'description']]
        content_issues = self.analyze_content_format(table_name, text_columns)
        data_quality_issues.extend(content_issues)

        stats = TableStats(
            name=table_name,
            record_count=record_count,
            sample_records=sample_records,
            column_info=column_info,
            foreign_keys=foreign_keys,
            data_quality_issues=data_quality_issues
        )

        self.stats[table_name] = stats
        return stats

    def analyze_all_tables(self) -> Dict[str, TableStats]:
        """Analyze all tables in the database"""
        table_names = self.get_table_names()

        for table_name in table_names:
            try:
                self.analyze_table(table_name)
            except Exception as e:
                print(f"Error analyzing table {table_name}: {e}")

        return self.stats

    def generate_report(self, output_file: str = "data_analysis_report.json"):
        """Generate a comprehensive analysis report"""
        report = {
            'database_info': {
                'path': self.db_path,
                'analyzed_at': datetime.now().isoformat(),
                'total_tables': len(self.stats)
            },
            'summary': {
                'total_records': sum(stats.record_count for stats in self.stats.values()),
                'tables_with_issues': len([stats for stats in self.stats.values() if stats.data_quality_issues]),
                'total_issues': sum(len(stats.data_quality_issues) for stats in self.stats.values())
            },
            'tables': {}
        }

        for table_name, stats in self.stats.items():
            report['tables'][table_name] = {
                'record_count': stats.record_count,
                'columns': [col['name'] for col in stats.column_info],
                'foreign_keys': [fk['table'] for fk in stats.foreign_keys],
                'data_quality_issues': stats.data_quality_issues,
                'sample_records': stats.sample_records[:2]  # Limit sample records in report
            }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        print(f"Analysis report saved to: {output_file}")
        return report
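
    # The resulting JSON has three top-level keys: 'database_info',
    # 'summary', and 'tables' (one entry per table, trimmed to two sample
    # records each).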

    def print_summary(self):
        """Print a summary of the analysis"""
        print("\n" + "=" * 80)
        print("DATA ANALYSIS SUMMARY")
        print("=" * 80)

        total_records = sum(stats.record_count for stats in self.stats.values())
        total_issues = sum(len(stats.data_quality_issues) for stats in self.stats.values())

        print(f"Total tables analyzed: {len(self.stats)}")
        print(f"Total records: {total_records:,}")
        print(f"Total data quality issues: {total_issues}")
        print()

        print("Table Statistics:")
        print("-" * 60)
        for table_name, stats in sorted(self.stats.items()):
            print(f"{table_name:25} | {stats.record_count:8,} records | {len(stats.data_quality_issues):2} issues")

        print("\nTables with Issues:")
        print("-" * 60)
        for table_name, stats in self.stats.items():
            if stats.data_quality_issues:
                print(f"\n{table_name}:")
                for issue in stats.data_quality_issues[:3]:  # Show first 3 issues
                    print(f"  - {issue}")
                if len(stats.data_quality_issues) > 3:
                    print(f"  ... and {len(stats.data_quality_issues) - 3} more issues")


def main():
    """Main function to run the data analysis"""
    db_path = "tercul_data.db"

    if not Path(db_path).exists():
        print(f"Database file not found: {db_path}")
        return

    print("Starting data analysis...")
    analyzer = DataAnalyzer(db_path)

    # Analyze all tables
    analyzer.analyze_all_tables()

    # Generate report
    analyzer.generate_report()

    # Print summary
    analyzer.print_summary()


if __name__ == "__main__":
    main()
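
# The analyzer can also be driven programmatically, e.g. to re-check a single
# table after a cleanup pass (a sketch; 'works' is a hypothetical table name):
#
#     analyzer = DataAnalyzer("tercul_data.db")
#     stats = analyzer.analyze_table("works")
#     print(stats.record_count, stats.data_quality_issues)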