#!/usr/bin/env python3 """ Data Migration Preparation Script for Tercul This script prepares data from the current SQLite database for migration to the Go/GORM schema. It handles ID conversion, timestamp parsing, content extraction, and relationship mapping. """ import sqlite3 import json import re import uuid from datetime import datetime from typing import Dict, List, Any, Optional, Tuple from dataclasses import dataclass from pathlib import Path import yaml from bs4 import BeautifulSoup import re @dataclass class MigrationMapping: """Mapping information for migration""" old_id: str new_id: int table_name: str class DataMigrationPreparer: def __init__(self, source_db_path: str, output_dir: str = "migration_data"): self.source_db_path = source_db_path self.output_dir = Path(output_dir) self.output_dir.mkdir(exist_ok=True) self.conn = sqlite3.connect(source_db_path) self.conn.row_factory = sqlite3.Row # ID mapping storage self.id_mappings: Dict[str, Dict[str, int]] = {} self.next_ids: Dict[str, int] = {} # Initialize next IDs self.next_ids = { 'countries': 1, 'authors': 1, 'works': 1, 'translations': 1, 'books': 1, 'users': 1, 'tags': 1, 'categories': 1 } def __del__(self): if hasattr(self, 'conn'): self.conn.close() def get_next_id(self, table_name: str) -> int: """Get the next available ID for a table""" if table_name not in self.next_ids: self.next_ids[table_name] = 1 current_id = self.next_ids[table_name] self.next_ids[table_name] += 1 return current_id def parse_timestamp(self, timestamp_str: str) -> Optional[str]: """Parse timestamp string to ISO format""" if not timestamp_str: return None # Common timestamp formats formats = [ '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%S.%fZ' ] for fmt in formats: try: dt = datetime.strptime(timestamp_str, fmt) return dt.isoformat() except ValueError: continue return None def extract_yaml_content(self, content: str) -> Dict[str, Any]: """Extract content from Ruby object format""" if not content or not isinstance(content, str): return {} # Check if it's Ruby object format if '--- !ruby/hash:ActiveSupport::HashWithIndifferentAccess' in content: try: # Method 1: Try to extract text field directly using regex text_match = re.search(r'text:\s*"([^"]*(?:\\"[^"]*)*)"', content, re.DOTALL) if text_match: text_content = text_match.group(1) # Unescape the text content text_content = text_content.replace('\\n', '\n').replace('\\"', '"').replace('\\\\', '\\') return {'text': text_content} # Method 2: Try to parse as YAML after cleaning # First, unescape the content unescaped = content.replace('\\n', '\n').replace('\\"', '"').replace('\\\\', '\\') # Extract the YAML part yaml_start = unescaped.find('---') if yaml_start != -1: yaml_content = unescaped[yaml_start:] # Clean up the Ruby-specific parts yaml_content = yaml_content.replace('!ruby/hash:ActiveSupport::HashWithIndifferentAccess', '') # Parse YAML data = yaml.safe_load(yaml_content) return data or {} except Exception as e: # Method 3: Fallback - try to extract any text content try: # Look for any quoted content text_match = re.search(r'text:\s*"([^"]*(?:\\"[^"]*)*)"', content, re.DOTALL) if text_match: text_content = text_match.group(1) # Basic unescaping text_content = text_content.replace('\\n', '\n').replace('\\"', '"') return {'text': text_content} except Exception as e2: print(f"Error extracting text manually: {e2}") return {} # If not Ruby format, return as plain text return {'text': content} def html_to_markdown(self, content: str) -> str: """Convert HTML content to Markdown format""" if not content or not isinstance(content, str): return "" try: soup = BeautifulSoup(content, 'html.parser') # Convert common HTML elements to Markdown # Paragraphs for p in soup.find_all('p'): p.replace_with(f"\n\n{p.get_text()}\n\n") # Headers for i in range(1, 7): for h in soup.find_all(f'h{i}'): h.replace_with(f"\n\n{'#' * i} {h.get_text()}\n\n") # Bold for b in soup.find_all(['b', 'strong']): b.replace_with(f"**{b.get_text()}**") # Italic for i in soup.find_all(['i', 'em']): i.replace_with(f"*{i.get_text()}*") # Lists for ul in soup.find_all('ul'): items = [] for li in ul.find_all('li'): items.append(f"- {li.get_text()}") ul.replace_with(f"\n\n{chr(10).join(items)}\n\n") for ol in soup.find_all('ol'): items = [] for i, li in enumerate(ol.find_all('li'), 1): items.append(f"{i}. {li.get_text()}") ol.replace_with(f"\n\n{chr(10).join(items)}\n\n") # Blockquotes for blockquote in soup.find_all('blockquote'): lines = blockquote.get_text().split('\n') quoted_lines = [f"> {line}" for line in lines if line.strip()] blockquote.replace_with(f"\n\n{chr(10).join(quoted_lines)}\n\n") # Links for a in soup.find_all('a'): href = a.get('href', '') text = a.get_text() a.replace_with(f"[{text}]({href})") # Get the final text and clean up result = soup.get_text() # Clean up excessive whitespace result = re.sub(r'\n\s*\n\s*\n', '\n\n', result) result = result.strip() return result except Exception as e: print(f"Error converting HTML to Markdown: {e}") return content def migrate_countries(self) -> List[Dict[str, Any]]: """Migrate countries data""" print("Migrating countries...") cursor = self.conn.cursor() cursor.execute(""" SELECT c.id, c.created_at, c.updated_at, ct.name, ct.language_code FROM countries c LEFT JOIN country_translations ct ON c.id = ct.country_id ORDER BY c.id """) countries = {} for row in cursor.fetchall(): country_id = row['id'] if country_id not in countries: new_id = self.get_next_id('countries') self.id_mappings.setdefault('countries', {})[country_id] = new_id countries[country_id] = { 'id': new_id, 'name': '', 'code': '', # Will need to be generated or mapped 'phone_code': '', 'currency': '', 'continent': '', 'language': 'multi', 'created_at': self.parse_timestamp(row['created_at']), 'updated_at': self.parse_timestamp(row['updated_at']) } # Add translation data if row['name'] and row['language_code']: if row['language_code'] == 'en': countries[country_id]['name'] = row['name'] # Generate country code from name countries[country_id]['code'] = row['name'][:2].upper() return list(countries.values()) def migrate_authors(self) -> List[Dict[str, Any]]: """Migrate authors data""" print("Migrating authors...") cursor = self.conn.cursor() cursor.execute(""" SELECT a.id, a.date_of_birth, a.date_of_death, a.created_at, a.updated_at, a.is_top, a.date_of_death_precision, a.date_of_birth_precision, a.is_draft, a.custom_date_of_birth, a.custom_date_of_death, a.slug, at.first_name, at.last_name, at.full_name, at.place_of_birth, at.place_of_death, at.pen_names, at.language_code, atr.body as biography FROM authors a LEFT JOIN author_translations at ON a.id = at.author_id LEFT JOIN action_text_rich_texts atr ON at.id = atr.record_id AND atr.name = 'biography' ORDER BY a.id """) authors = {} for row in cursor.fetchall(): author_id = row['id'] if author_id not in authors: new_id = self.get_next_id('authors') self.id_mappings.setdefault('authors', {})[author_id] = new_id authors[author_id] = { 'id': new_id, 'name': '', 'biography': '', 'birth_date': None, 'death_date': None, 'language': 'multi', 'created_at': self.parse_timestamp(row['created_at']), 'updated_at': self.parse_timestamp(row['updated_at']) } # Add translation data if row['language_code'] == 'en': if row['full_name']: authors[author_id]['name'] = row['full_name'] elif row['first_name'] and row['last_name']: authors[author_id]['name'] = f"{row['first_name']} {row['last_name']}" # Parse dates if row['date_of_birth']: authors[author_id]['birth_date'] = self.parse_timestamp(row['date_of_birth']) if row['date_of_death']: authors[author_id]['death_date'] = self.parse_timestamp(row['date_of_death']) # Add biography if row['biography']: content = self.extract_yaml_content(row['biography']) if 'text' in content: authors[author_id]['biography'] = self.html_to_markdown(content['text']) return list(authors.values()) def migrate_works(self) -> List[Dict[str, Any]]: """Migrate works data""" print("Migrating works...") cursor = self.conn.cursor() cursor.execute(""" SELECT w.id, w.author_id, w.date_created, w.age_restrictions, w.literature_type, w.created_at, w.updated_at, w.is_top, w.date_created_precision, w.is_draft, w.genres, w.slug FROM works w ORDER BY w.id """) works = [] for row in cursor.fetchall(): new_id = self.get_next_id('works') self.id_mappings.setdefault('works', {})[row['id']] = new_id # Map author_id author_id = None if row['author_id'] and row['author_id'] in self.id_mappings.get('authors', {}): author_id = self.id_mappings['authors'][row['author_id']] work = { 'id': new_id, 'title': '', # Will be filled from translations 'content': '', # Will be filled from translations 'description': '', 'author_id': author_id, 'published_at': self.parse_timestamp(row['date_created']), 'status': 'draft' if row['is_draft'] else 'published', 'language': 'multi', 'created_at': self.parse_timestamp(row['created_at']), 'updated_at': self.parse_timestamp(row['updated_at']) } works.append(work) return works def migrate_translations(self) -> List[Dict[str, Any]]: """Migrate all translations data (polymorphic approach)""" print("Migrating translations...") translations = [] # Migrate work translations cursor = self.conn.cursor() cursor.execute(""" SELECT wt.id, wt.work_id, wt.title, wt.audio_url, wt.translator, wt.date_translated, wt.is_original_language, wt.created_at, wt.updated_at, wt.language_code, wt.body FROM work_translations wt ORDER BY wt.work_id, wt.language_code """) for row in cursor.fetchall(): new_id = self.get_next_id('translations') self.id_mappings.setdefault('translations', {})[row['id']] = new_id # Map work_id translatable_id = None if row['work_id'] and row['work_id'] in self.id_mappings.get('works', {}): translatable_id = self.id_mappings['works'][row['work_id']] # Extract content content = "" if row['body']: content_data = self.extract_yaml_content(row['body']) if 'text' in content_data: content = self.html_to_markdown(content_data['text']) translation = { 'id': new_id, 'title': row['title'] or '', 'content': content, 'description': '', 'language': row['language_code'] or 'en', 'status': 'published' if row['is_original_language'] else 'draft', 'published_at': self.parse_timestamp(row['date_translated']), 'translatable_id': translatable_id, 'translatable_type': 'Work', 'translator_id': None, # Will need user mapping 'is_original_language': bool(row['is_original_language']), 'audio_url': row['audio_url'] or '', 'date_translated': self.parse_timestamp(row['date_translated']), 'created_at': self.parse_timestamp(row['created_at']), 'updated_at': self.parse_timestamp(row['updated_at']) } translations.append(translation) # Migrate author translations cursor.execute(""" SELECT at.id, at.author_id, at.first_name, at.last_name, at.full_name, at.place_of_birth, at.place_of_death, at.pen_names, at.language_code, at.created_at, at.updated_at, atr.body as biography FROM author_translations at LEFT JOIN action_text_rich_texts atr ON at.id = atr.record_id AND atr.name = 'biography' ORDER BY at.author_id, at.language_code """) for row in cursor.fetchall(): new_id = self.get_next_id('translations') self.id_mappings.setdefault('translations', {})[row['id']] = new_id # Map author_id translatable_id = None if row['author_id'] and row['author_id'] in self.id_mappings.get('authors', {}): translatable_id = self.id_mappings['authors'][row['author_id']] # Extract biography content = "" if row['biography']: content_data = self.extract_yaml_content(row['biography']) if 'text' in content_data: content = self.html_to_markdown(content_data['text']) # Create title from name components title = "" if row['full_name']: title = row['full_name'] elif row['first_name'] and row['last_name']: title = f"{row['first_name']} {row['last_name']}" translation = { 'id': new_id, 'title': title, 'content': content, 'description': '', 'language': row['language_code'] or 'en', 'status': 'published', 'published_at': None, 'translatable_id': translatable_id, 'translatable_type': 'Author', 'translator_id': None, 'is_original_language': False, 'audio_url': '', 'date_translated': None, 'created_at': self.parse_timestamp(row['created_at']), 'updated_at': self.parse_timestamp(row['updated_at']) } translations.append(translation) # Migrate country translations cursor.execute(""" SELECT ct.id, ct.country_id, ct.name, ct.language_code, ct.created_at, ct.updated_at FROM country_translations ct ORDER BY ct.country_id, ct.language_code """) for row in cursor.fetchall(): new_id = self.get_next_id('translations') self.id_mappings.setdefault('translations', {})[row['id']] = new_id # Map country_id translatable_id = None if row['country_id'] and row['country_id'] in self.id_mappings.get('countries', {}): translatable_id = self.id_mappings['countries'][row['country_id']] translation = { 'id': new_id, 'title': row['name'] or '', 'content': '', 'description': '', 'language': row['language_code'] or 'en', 'status': 'published', 'published_at': None, 'translatable_id': translatable_id, 'translatable_type': 'Country', 'translator_id': None, 'is_original_language': False, 'audio_url': '', 'date_translated': None, 'created_at': self.parse_timestamp(row['created_at']), 'updated_at': self.parse_timestamp(row['updated_at']) } translations.append(translation) return translations def migrate_books(self) -> List[Dict[str, Any]]: """Migrate books data""" print("Migrating books...") cursor = self.conn.cursor() cursor.execute(""" SELECT b.id, b.created_at, b.updated_at, b.genre, b.types, b.is_draft, b.genres, bt.title, bt.annotation, bt.introduction, bt.isbn, bt.publisher, bt.language_code FROM books b LEFT JOIN book_translations bt ON b.id = bt.book_id ORDER BY b.id """) books = {} for row in cursor.fetchall(): book_id = row['id'] if book_id not in books: new_id = self.get_next_id('books') self.id_mappings.setdefault('books', {})[book_id] = new_id books[book_id] = { 'id': new_id, 'title': '', 'description': '', 'isbn': '', 'published_at': None, 'language': 'multi', 'created_at': self.parse_timestamp(row['created_at']), 'updated_at': self.parse_timestamp(row['updated_at']) } # Add translation data if row['language_code'] == 'en': if row['title']: books[book_id]['title'] = row['title'] if row['annotation']: books[book_id]['description'] = row['annotation'] elif row['introduction']: books[book_id]['description'] = row['introduction'] if row['isbn']: books[book_id]['isbn'] = row['isbn'] return list(books.values()) def migrate_users(self) -> List[Dict[str, Any]]: """Migrate users data""" print("Migrating users...") cursor = self.conn.cursor() cursor.execute("SELECT * FROM users") users = [] for row in cursor.fetchall(): new_id = self.get_next_id('users') self.id_mappings.setdefault('users', {})[row['id']] = new_id user = { 'id': new_id, 'username': row['name'] or f"user_{new_id}", 'email': row['email'] or f"user_{new_id}@example.com", 'password': row['password'] or '', # Will need to be rehashed 'first_name': '', 'last_name': '', 'display_name': row['name'] or '', 'bio': '', 'avatar_url': '', 'role': 'reader', 'verified': False, 'active': True, 'created_at': self.parse_timestamp(row['created_at']), 'updated_at': self.parse_timestamp(row['updated_at']) } users.append(user) return users def create_junction_tables(self) -> Dict[str, List[Dict[str, Any]]]: """Create junction table data""" print("Creating junction tables...") junction_data = { 'book_works': [], 'author_countries': [] } # Book-Works relationships cursor = self.conn.cursor() cursor.execute("SELECT book_id, work_id, `order` FROM book_works") for row in cursor.fetchall(): book_id = self.id_mappings.get('books', {}).get(row['book_id']) work_id = self.id_mappings.get('works', {}).get(row['work_id']) if book_id and work_id: junction_data['book_works'].append({ 'book_id': book_id, 'work_id': work_id, 'order': int(row['order']) if row['order'] else 0 }) # Author-Countries relationships cursor.execute("SELECT author_id, country_id FROM author_countries") for row in cursor.fetchall(): author_id = self.id_mappings.get('authors', {}).get(row['author_id']) country_id = self.id_mappings.get('countries', {}).get(row['country_id']) if author_id and country_id: junction_data['author_countries'].append({ 'author_id': author_id, 'country_id': country_id }) return junction_data def create_sqlite_database(self): """Create SQLite database with Go/GORM schema and import data""" print("Creating SQLite database...") # Migrate core entities countries = self.migrate_countries() authors = self.migrate_authors() works = self.migrate_works() translations = self.migrate_translations() books = self.migrate_books() users = self.migrate_users() # Create junction tables junction_data = self.create_junction_tables() # Create new SQLite database db_path = self.output_dir / "tercul_go.db" new_conn = sqlite3.connect(db_path) new_conn.row_factory = sqlite3.Row # Create tables with proper schema self.create_tables(new_conn) # Import data self.import_data(new_conn, countries, authors, works, translations, books, users, junction_data) new_conn.close() print(f"SQLite database created: {db_path}") return { 'countries': countries, 'authors': authors, 'works': works, 'translations': translations, 'books': books, 'users': users, 'junction_tables': junction_data, 'id_mappings': self.id_mappings, 'metadata': { 'total_records': len(countries) + len(authors) + len(works) + len(translations) + len(books) + len(users), 'migrated_at': datetime.now().isoformat(), 'source_database': self.source_db_path, 'target_database': str(db_path) } } def create_tables(self, conn): """Create tables with Go/GORM compatible schema""" cursor = conn.cursor() # Create countries table cursor.execute(""" CREATE TABLE countries ( id INTEGER PRIMARY KEY, name TEXT NOT NULL, code TEXT, phone_code TEXT, currency TEXT, continent TEXT, language TEXT DEFAULT 'multi', created_at TEXT, updated_at TEXT ) """) # Create authors table cursor.execute(""" CREATE TABLE authors ( id INTEGER PRIMARY KEY, name TEXT NOT NULL, status TEXT DEFAULT 'active', birth_date TEXT, death_date TEXT, language TEXT DEFAULT 'multi', created_at TEXT, updated_at TEXT ) """) # Create works table cursor.execute(""" CREATE TABLE works ( id INTEGER PRIMARY KEY, title TEXT NOT NULL, description TEXT, type TEXT DEFAULT 'other', status TEXT DEFAULT 'draft', published_at TEXT, language TEXT DEFAULT 'multi', created_at TEXT, updated_at TEXT ) """) # Create translations table (polymorphic) cursor.execute(""" CREATE TABLE translations ( id INTEGER PRIMARY KEY, title TEXT NOT NULL, content TEXT, description TEXT, language TEXT NOT NULL, status TEXT DEFAULT 'draft', published_at TEXT, translatable_id INTEGER NOT NULL, translatable_type TEXT NOT NULL, translator_id INTEGER, is_original_language BOOLEAN DEFAULT FALSE, audio_url TEXT, date_translated TEXT, created_at TEXT, updated_at TEXT, FOREIGN KEY (translator_id) REFERENCES users (id) ) """) # Create books table cursor.execute(""" CREATE TABLE books ( id INTEGER PRIMARY KEY, title TEXT NOT NULL, description TEXT, isbn TEXT, format TEXT DEFAULT 'paperback', status TEXT DEFAULT 'draft', published_at TEXT, language TEXT DEFAULT 'multi', created_at TEXT, updated_at TEXT ) """) # Create users table cursor.execute(""" CREATE TABLE users ( id INTEGER PRIMARY KEY, username TEXT NOT NULL UNIQUE, email TEXT NOT NULL UNIQUE, password TEXT NOT NULL, first_name TEXT, last_name TEXT, display_name TEXT, bio TEXT, avatar_url TEXT, role TEXT DEFAULT 'reader', verified BOOLEAN DEFAULT FALSE, active BOOLEAN DEFAULT TRUE, created_at TEXT, updated_at TEXT ) """) # Create junction tables cursor.execute(""" CREATE TABLE book_works ( id INTEGER PRIMARY KEY, book_id INTEGER NOT NULL, work_id INTEGER NOT NULL, "order" INTEGER DEFAULT 0, created_at TEXT, updated_at TEXT, FOREIGN KEY (book_id) REFERENCES books (id), FOREIGN KEY (work_id) REFERENCES works (id) ) """) cursor.execute(""" CREATE TABLE author_countries ( id INTEGER PRIMARY KEY, author_id INTEGER NOT NULL, country_id INTEGER NOT NULL, created_at TEXT, updated_at TEXT, FOREIGN KEY (author_id) REFERENCES authors (id), FOREIGN KEY (country_id) REFERENCES countries (id) ) """) # Create indexes cursor.execute("CREATE INDEX idx_translations_translatable ON translations (translatable_id, translatable_type)") cursor.execute("CREATE INDEX idx_translations_language ON translations (language)") cursor.execute("CREATE INDEX idx_works_author ON works (author_id)") cursor.execute("CREATE INDEX idx_book_works_book ON book_works (book_id)") cursor.execute("CREATE INDEX idx_book_works_work ON book_works (work_id)") cursor.execute("CREATE INDEX idx_author_countries_author ON author_countries (author_id)") cursor.execute("CREATE INDEX idx_author_countries_country ON author_countries (country_id)") conn.commit() print("Tables created successfully") def import_data(self, conn, countries, authors, works, translations, books, users, junction_data): """Import data into the new database""" cursor = conn.cursor() # Import countries print("Importing countries...") for country in countries: cursor.execute(""" INSERT INTO countries (id, name, code, phone_code, currency, continent, language, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( country['id'], country['name'], country['code'], country['phone_code'], country['currency'], country['continent'], country['language'], country['created_at'], country['updated_at'] )) # Import authors print("Importing authors...") for author in authors: cursor.execute(""" INSERT INTO authors (id, name, biography, birth_date, death_date, language, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?) """, ( author['id'], author['name'], author['biography'], author['birth_date'], author['death_date'], author['language'], author['created_at'], author['updated_at'] )) # Import works print("Importing works...") for work in works: cursor.execute(""" INSERT INTO works (id, title, content, description, author_id, published_at, status, language, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( work['id'], work['title'], work['content'], work['description'], work['author_id'], work['published_at'], work['status'], work['language'], work['created_at'], work['updated_at'] )) # Import translations print("Importing translations...") for translation in translations: cursor.execute(""" INSERT INTO translations (id, title, content, description, language, status, published_at, translatable_id, translatable_type, translator_id, is_original_language, audio_url, date_translated, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( translation['id'], translation['title'], translation['content'], translation['description'], translation['language'], translation['status'], translation['published_at'], translation['translatable_id'], translation['translatable_type'], translation['translator_id'], translation['is_original_language'], translation['audio_url'], translation['date_translated'], translation['created_at'], translation['updated_at'] )) # Import books print("Importing books...") for book in books: cursor.execute(""" INSERT INTO books (id, title, description, isbn, published_at, language, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?) """, ( book['id'], book['title'], book['description'], book['isbn'], book['published_at'], book['language'], book['created_at'], book['updated_at'] )) # Import users print("Importing users...") for user in users: cursor.execute(""" INSERT INTO users (id, username, email, password, first_name, last_name, display_name, bio, avatar_url, role, verified, active, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( user['id'], user['username'], user['email'], user['password'], user['first_name'], user['last_name'], user['display_name'], user['bio'], user['avatar_url'], user['role'], user['verified'], user['active'], user['created_at'], user['updated_at'] )) # Import junction tables print("Importing junction tables...") for book_work in junction_data['book_works']: cursor.execute(""" INSERT INTO book_works (book_id, work_id, "order") VALUES (?, ?, ?) """, (book_work['book_id'], book_work['work_id'], book_work['order'])) for author_country in junction_data['author_countries']: cursor.execute(""" INSERT INTO author_countries (author_id, country_id) VALUES (?, ?) """, (author_country['author_id'], author_country['country_id'])) conn.commit() print("Data import completed") def save_migration_data(self): """Save all migration data to JSON files""" print("Saving migration data...") # Migrate core entities countries = self.migrate_countries() authors = self.migrate_authors() works = self.migrate_works() translations = self.migrate_translations() books = self.migrate_books() users = self.migrate_users() # Create junction tables junction_data = self.create_junction_tables() # Save data to files migration_data = { 'countries': countries, 'authors': authors, 'works': works, 'translations': translations, 'books': books, 'users': users, 'junction_tables': junction_data, 'id_mappings': self.id_mappings, 'metadata': { 'total_records': len(countries) + len(authors) + len(works) + len(translations) + len(books) + len(users), 'migrated_at': datetime.now().isoformat(), 'source_database': self.source_db_path } } # Save main migration data with open(self.output_dir / 'migration_data.json', 'w', encoding='utf-8') as f: json.dump(migration_data, f, indent=2, ensure_ascii=False) # Save ID mappings separately for reference with open(self.output_dir / 'id_mappings.json', 'w', encoding='utf-8') as f: json.dump(self.id_mappings, f, indent=2, ensure_ascii=False) # Save individual entity files for entity_name, data in [ ('countries', countries), ('authors', authors), ('works', works), ('translations', translations), ('books', books), ('users', users) ]: with open(self.output_dir / f'{entity_name}.json', 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) print(f"Migration data saved to: {self.output_dir}") return migration_data def generate_migration_summary(self) -> Dict[str, Any]: """Generate a summary of the migration""" summary = { 'source_database': self.source_db_path, 'migrated_at': datetime.now().isoformat(), 'record_counts': { 'countries': len(self.id_mappings.get('countries', {})), 'authors': len(self.id_mappings.get('authors', {})), 'works': len(self.id_mappings.get('works', {})), 'translations': len(self.id_mappings.get('translations', {})), 'books': len(self.id_mappings.get('books', {})), 'users': len(self.id_mappings.get('users', {})) }, 'total_records': sum(len(mapping) for mapping in self.id_mappings.values()), 'next_ids': self.next_ids } with open(self.output_dir / 'migration_summary.json', 'w', encoding='utf-8') as f: json.dump(summary, f, indent=2, ensure_ascii=False) return summary def main(): """Main function to run the data migration preparation""" source_db = "tercul_data.db" if not Path(source_db).exists(): print(f"Source database not found: {source_db}") return print("Starting data migration and database creation...") preparer = DataMigrationPreparer(source_db) # Create SQLite database with migrated data migration_data = preparer.create_sqlite_database() # Generate summary summary = preparer.generate_migration_summary() print("\nMigration and Database Creation Summary:") print("=" * 50) for entity, count in summary['record_counts'].items(): print(f"{entity:15}: {count:6,} records") print(f"{'Total':15}: {summary['total_records']:6,} records") print(f"\nSQLite database created: {preparer.output_dir}/tercul_go.db") print(f"Migration data saved to: {preparer.output_dir}") if __name__ == "__main__": main()