mirror of
https://github.com/SamyRai/tercul-backend.git
synced 2025-12-27 05:11:34 +00:00
- Core Go application with GraphQL API using gqlgen - Comprehensive data models for literary works, authors, translations - Repository pattern with caching layer - Authentication and authorization system - Linguistics analysis capabilities with multiple adapters - Vector search integration with Weaviate - Docker containerization support - Python data migration and analysis scripts - Clean architecture with proper separation of concerns - Production-ready configuration and middleware - Proper .gitignore excluding vendor/, database files, and build artifacts
108 lines
4.0 KiB
Python
108 lines
4.0 KiB
Python
#!/usr/bin/env python3
|
||
"""
Test script to demonstrate HTML to Markdown conversion
"""
|
||
|
||
import re
|
||
from bs4 import BeautifulSoup
|
||
|
||
def html_to_markdown(content: str) -> str:
    """Convert an HTML fragment to Markdown-style text.

    Supported elements: links, bold (``b``/``strong``), italic
    (``i``/``em``), paragraphs, headers ``h1``-``h6``, unordered and
    ordered lists, and blockquotes. Any other tag is dropped and only its
    text is kept.

    Inline elements are converted before block elements. Block elements are
    flattened with ``get_text()``, so converting them first would discard
    any bold/italic/link markup nested inside them.

    Args:
        content: HTML source text.

    Returns:
        The Markdown text with surrounding whitespace stripped. Returns
        ``""`` when ``content`` is empty or not a ``str``. If conversion
        raises, the error is printed and ``content`` is returned unchanged.
    """
    if not content or not isinstance(content, str):
        return ""

    try:
        soup = BeautifulSoup(content, 'html.parser')

        # Inline elements first. Reverse document order visits descendants
        # before their ancestors, so nested markup such as
        # <b><a href="...">x</a></b> keeps both the link and the emphasis.
        inline_tags = soup.find_all(['a', 'b', 'strong', 'i', 'em'])
        for inline in reversed(inline_tags):
            text = inline.get_text()
            if inline.name == 'a':
                href = inline.get('href', '')
                inline.replace_with(f"[{text}]({href})")
            elif inline.name in ('b', 'strong'):
                inline.replace_with(f"**{text}**")
            else:
                inline.replace_with(f"*{text}*")

        # Paragraphs
        for p in soup.find_all('p'):
            p.replace_with(f"\n\n{p.get_text()}\n\n")

        # Headers: the level number maps to the count of leading '#'
        for level in range(1, 7):
            for h in soup.find_all(f'h{level}'):
                h.replace_with(f"\n\n{'#' * level} {h.get_text()}\n\n")

        # Lists
        # NOTE(review): find_all('li') is recursive, so items of a nested
        # list are also emitted as items of the enclosing list.
        for ul in soup.find_all('ul'):
            items = [f"- {li.get_text()}" for li in ul.find_all('li')]
            ul.replace_with(f"\n\n{chr(10).join(items)}\n\n")

        for ol in soup.find_all('ol'):
            items = [
                f"{number}. {li.get_text()}"
                for number, li in enumerate(ol.find_all('li'), 1)
            ]
            ol.replace_with(f"\n\n{chr(10).join(items)}\n\n")

        # Blockquotes: prefix every non-blank line with '> '
        for blockquote in soup.find_all('blockquote'):
            lines = blockquote.get_text().split('\n')
            quoted_lines = [f"> {line}" for line in lines if line.strip()]
            blockquote.replace_with(f"\n\n{chr(10).join(quoted_lines)}\n\n")

        # Get the final text and clean up
        result = soup.get_text()

        # Collapse runs of three or more (possibly whitespace-padded)
        # newlines into a single blank line.
        result = re.sub(r'\n\s*\n\s*\n', '\n\n', result)
        return result.strip()

    except Exception as e:
        # Best effort: report the problem and hand back the untouched input
        # rather than failing the caller.
        print(f"Error converting HTML to Markdown: {e}")
        return content
|
||
|
||
def test_conversion():
    """Print a sample HTML snippet, its Markdown conversion, and a summary.

    Output goes to stdout only; nothing is returned.
    """
    # Sample HTML content from the database
    sample_html = """
<p>Жил в одном ауле старик с двумя сыновьями. Пришла старику пора помирать.
Позвал он сыновей и говорит:</p><p>— Мои дорогие дети, я оставляю вам наследство.
Но не наследством вы будете богаты. Дороже денег, дороже добра три совета. Будете
их помнить — проживёте в достатке всю жизнь. Вот мои советы, запоминайте. Первыми
никому не кланяйтесь — пусть другие вам кланяются. Всякую еду ешьте с мёдом. Спите
всегда на пуховиках.</p><p>Старик умер.</p><p>Сыновья забыли о его советах и давай
жить в своё удовольствие — пить да гулять, много есть и долго спать.</p>
"""
    rule = "=" * 50

    # Section 1: the raw input
    print("Original HTML:")
    print(rule)
    print(sample_html)
    print()

    # Section 2: the converted output
    print("Converted to Markdown:")
    print(rule)
    converted = html_to_markdown(sample_html)
    print(converted)
    print()

    # Section 3: summary of why Markdown is preferred
    print("Benefits of Markdown format:")
    benefits = (
        "- Preserves paragraph structure",
        "- Maintains dialogue formatting",
        "- Easy to read and edit",
        "- Can be rendered to HTML when needed",
        "- Smaller file size than HTML",
        "- Better for version control",
    )
    for benefit in benefits:
        print(benefit)
|
||
|
||
# Run the demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    test_conversion()
|