tercul-backend/scripts/test_markdown_conversion.py

#!/usr/bin/env python3
"""
Test script to demonstrate HTML to Markdown conversion
"""

import re
from bs4 import BeautifulSoup

def html_to_markdown(content: str) -> str:
    """Convert an HTML fragment to Markdown text.

    Inline markup (bold, italic, links) is converted before the block-level
    elements that contain it, so formatting nested inside paragraphs,
    headers, list items and blockquotes survives the conversion.

    Args:
        content: HTML source to convert.

    Returns:
        The Markdown rendering of ``content``. An empty string is returned
        when ``content`` is empty or not a string. If the conversion raises,
        the error is printed and ``content`` is returned unchanged.
    """
    if not content or not isinstance(content, str):
        return ""

    try:
        soup = BeautifulSoup(content, 'html.parser')
        newline = "\n"

        # Inline elements first: block elements below are flattened with
        # get_text(), which would otherwise discard any bold/italic/link
        # markup nested inside them. find_all() returns tags in document
        # order (ancestors before descendants), so walking the result in
        # reverse converts the innermost tags first and lets an outer tag
        # pick up the already-converted text of its children.
        inline_tags = soup.find_all(['b', 'strong', 'i', 'em', 'a'])
        for tag in reversed(inline_tags):
            text = tag.get_text()
            if tag.name == 'a':
                converted = f"[{text}]({tag.get('href', '')})"
            elif tag.name in ('b', 'strong'):
                converted = f"**{text}**"
            else:
                converted = f"*{text}*"
            tag.replace_with(converted)

        # Paragraphs (before blockquotes, so quoted paragraphs are already
        # separated by blank lines when the quote is split into lines).
        for p in soup.find_all('p'):
            p.replace_with(f"\n\n{p.get_text()}\n\n")

        # Headers
        for level in range(1, 7):
            for h in soup.find_all(f'h{level}'):
                h.replace_with(f"\n\n{'#' * level} {h.get_text()}\n\n")

        # Unordered lists
        for ul in soup.find_all('ul'):
            items = [f"- {li.get_text()}" for li in ul.find_all('li')]
            ul.replace_with(f"\n\n{newline.join(items)}\n\n")

        # Ordered lists
        for ol in soup.find_all('ol'):
            items = [
                f"{number}. {li.get_text()}"
                for number, li in enumerate(ol.find_all('li'), 1)
            ]
            ol.replace_with(f"\n\n{newline.join(items)}\n\n")

        # Blockquotes: prefix every non-blank line with "> ".
        for blockquote in soup.find_all('blockquote'):
            lines = blockquote.get_text().split('\n')
            quoted_lines = [f"> {line}" for line in lines if line.strip()]
            blockquote.replace_with(f"\n\n{newline.join(quoted_lines)}\n\n")

        # Get the final text and clean up
        result = soup.get_text()

        # Collapse runs of three or more (possibly whitespace-padded)
        # newlines into a single blank line.
        result = re.sub(r'\n\s*\n\s*\n', '\n\n', result)
        return result.strip()

    except Exception as e:
        # Best-effort conversion: report the problem and hand back the
        # original content rather than failing the caller.
        print(f"Error converting HTML to Markdown: {e}")
        return content

def test_conversion():
    """Print a sample HTML snippet next to its Markdown conversion."""

    # Representative HTML as stored in the database
    sample_html = """
    <p>Жил в одном ауле старик с двумя сыновьями. Пришла старику пора помирать.
      Позвал он сыновей и говорит:</p><p>— Мои дорогие дети, я оставляю вам наследство.
      Но не наследством вы будете богаты. Дороже денег, дороже добра три совета. Будете
      их помнить — проживёте в достатке всю жизнь. Вот мои советы, запоминайте. Первыми
      никому не кланяйтесь — пусть другие вам кланяются. Всякую еду ешьте с мёдом. Спите
      всегда на пуховиках.</p><p>Старик умер.</p><p>Сыновья забыли о его советах и давай
      жить в своё удовольствие — пить да гулять, много есть и долго спать.</p>
    """
    divider = "=" * 50

    # Input section
    print("Original HTML:")
    print(divider)
    print(sample_html)
    print()

    # Output section; the conversion runs only after its heading is printed
    print("Converted to Markdown:")
    print(divider)
    converted = html_to_markdown(sample_html)
    print(converted)
    print()

    # Closing summary
    print("Benefits of Markdown format:")
    benefit_lines = (
        "- Preserves paragraph structure",
        "- Maintains dialogue formatting",
        "- Easy to read and edit",
        "- Can be rendered to HTML when needed",
        "- Smaller file size than HTML",
        "- Better for version control",
    )
    for benefit_line in benefit_lines:
        print(benefit_line)

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_conversion()