tercul-backend/scripts/test_markdown_conversion.py
Damir Mukimov 4957117cb6 Initial commit: Tercul Go project with comprehensive architecture
- Core Go application with GraphQL API using gqlgen
- Comprehensive data models for literary works, authors, translations
- Repository pattern with caching layer
- Authentication and authorization system
- Linguistics analysis capabilities with multiple adapters
- Vector search integration with Weaviate
- Docker containerization support
- Python data migration and analysis scripts
- Clean architecture with proper separation of concerns
- Production-ready configuration and middleware
- Proper .gitignore excluding vendor/, database files, and build artifacts
2025-08-13 07:42:32 +02:00

108 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Test script to demonstrate HTML to Markdown conversion
"""
import re
from bs4 import BeautifulSoup
# Tags that html_to_markdown rewrites; everything else is reduced to its text.
_MARKDOWN_TAGS = [
    'a', 'b', 'strong', 'i', 'em',
    'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'ul', 'ol', 'blockquote',
]


def _render_tag(tag) -> str:
    """Return the Markdown string that should replace one parsed HTML tag.

    The caller converts descendants before ancestors, so ``tag.get_text()``
    already contains the Markdown produced for any nested handled tags.
    """
    name = tag.name
    if name == 'a':
        return f"[{tag.get_text()}]({tag.get('href', '')})"
    if name in ('b', 'strong'):
        return f"**{tag.get_text()}**"
    if name in ('i', 'em'):
        return f"*{tag.get_text()}*"
    if name == 'p':
        return f"\n\n{tag.get_text()}\n\n"
    if name == 'ul':
        bullets = [f"- {li.get_text()}" for li in tag.find_all('li')]
        return "\n\n" + "\n".join(bullets) + "\n\n"
    if name == 'ol':
        numbered = [
            f"{number}. {li.get_text()}"
            for number, li in enumerate(tag.find_all('li'), 1)
        ]
        return "\n\n" + "\n".join(numbered) + "\n\n"
    if name == 'blockquote':
        quoted = [
            f"> {line}"
            for line in tag.get_text().split('\n')
            if line.strip()
        ]
        return "\n\n" + "\n".join(quoted) + "\n\n"
    # Remaining handled names are h1..h6; the digit is the heading level.
    level = int(name[1])
    return f"\n\n{'#' * level} {tag.get_text()}\n\n"


def html_to_markdown(content: str) -> str:
    """Convert HTML content to Markdown format.

    Handles paragraphs, h1-h6 headings, bold, italic, links, ordered and
    unordered lists, and blockquotes. Any other tag is reduced to its text.

    Args:
        content: HTML source to convert.

    Returns:
        The Markdown text with runs of blank lines collapsed and surrounding
        whitespace stripped. Returns ``""`` when ``content`` is empty or not
        a string, and returns ``content`` unchanged if conversion fails.
    """
    if not content or not isinstance(content, str):
        return ""
    try:
        soup = BeautifulSoup(content, 'html.parser')
        # find_all() yields tags in document order (ancestors before their
        # descendants). Walking it backwards converts the innermost tags
        # first, so inline formatting inside a paragraph, heading, list item
        # or blockquote is already Markdown when the enclosing block is
        # flattened with get_text(), and nested lists are rendered before
        # the list that contains them.
        for tag in reversed(soup.find_all(_MARKDOWN_TAGS)):
            tag.replace_with(_render_tag(tag))
        result = soup.get_text()
        # Collapse three or more consecutive line breaks (possibly separated
        # by whitespace) into a single blank line.
        result = re.sub(r'\n\s*\n\s*\n', '\n\n', result)
        return result.strip()
    except Exception as e:
        # Best-effort conversion: report the problem and hand back the input.
        print(f"Error converting HTML to Markdown: {e}")
        return content
def test_conversion():
    """Print a sample HTML fragment followed by its Markdown conversion."""
    rule = "=" * 50
    # Sample fragment (described in the original as coming from the database).
    sample_html = """
<p>Жил в одном ауле старик с двумя сыновьями. Пришла старику пора помирать.
Позвал он сыновей и говорит:</p><p>— Мои дорогие дети, я оставляю вам наследство.
Но не наследством вы будете богаты. Дороже денег, дороже добра три совета. Будете
их помнить — проживёте в достатке всю жизнь. Вот мои советы, запоминайте. Первыми
никому не кланяйтесь — пусть другие вам кланяются. Всякую еду ешьте с мёдом. Спите
всегда на пуховиках.</p><p>Старик умер.</p><p>Сыновья забыли о его советах и давай
жить в своё удовольствие — пить да гулять, много есть и долго спать.</p>
"""
    benefit_lines = (
        "- Preserves paragraph structure",
        "- Maintains dialogue formatting",
        "- Easy to read and edit",
        "- Can be rendered to HTML when needed",
        "- Smaller file size than HTML",
        "- Better for version control",
    )

    # Show the input first.
    print("Original HTML:")
    print(rule)
    print(sample_html)
    print()

    # Then the converted output under its own heading.
    print("Converted to Markdown:")
    print(rule)
    print(html_to_markdown(sample_html))
    print()

    # Finish with the fixed list of advantages.
    print("Benefits of Markdown format:")
    for benefit in benefit_lines:
        print(benefit)
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    test_conversion()