Source code for upsies.utils.html

"""
HTML parsing
"""

import re
import warnings

from . import LazyModule

bs4 = LazyModule(module='bs4', namespace=globals())



[docs]
def parse(string):
    """
    Return :class:`~.bs4.BeautifulSoup` instance

    :param string: HTML document

    :raise ContentError: if `string` is invalid HTML
    """
    if isinstance(string, bs4.element.Tag):
        return string
    else:
        # Disable bs4 warnings, e.g. "MarkupResemblesLocatorWarning: The input
        # looks more like a filename than markup. You may want to open this file
        # and pass the filehandle into Beautiful Soup."
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=UserWarning)
            return bs4.BeautifulSoup(string, features='html.parser')




[docs]
def dump(html, filepath):
    """
    Write `html` to `filepath` for debugging

    :param html: String or :class:`~.bs4.BeautifulSoup` instance
    """
    with open(filepath, 'w') as f:
        if isinstance(html, bs4.BeautifulSoup):
            f.write(html.prettify())
        else:
            f.write(parse(str(html)).prettify())




[docs]
def get(soup, *attributes):
    """
    Get `attributes` from `soup`

    These two calls are equivalent if all attributes exist:

    >>> soup.table.tr.td
    "td value"
    >>> html.get(soup, "table", "tr", "td")
    "td value"

    But if any attribute is `None` (which is what :class:`~.bs4.BeautifulSoup`
    returns for unknown tags), you get `None` instead of forcing you to catch an
    :class:`AttributeError`:

    >>> soup.table.no_such_attribute.td
    AttributeError: 'NoneType' object has no attribute 'td'
    >>> html.get(soup, "table", "no_such_attribute", "td")
    None
    """
    for attr in attributes:
        soup = getattr(soup, attr, None)
        if soup is None:
            return None
    return soup




[docs]
def as_text(html):
    """Strip HTML tags from string and return text without markup"""
    # Translate "<br>" to "\n" first.
    html = re.sub(r'<br\s*/?>', '\n', str(html))

    doc = parse(html)

    # BeautifulSoup stopped parsing tags like "<b>bold</b>" at some point when they are inside a
    # "<textarea>...</textarea>" tags. But we really want all HTML parsed.
    for textarea in doc.find_all('textarea'):
        textarea_content = ''.join(
            parse(c).get_text()
            for c in textarea.contents
        )
        textarea.replace_with(textarea_content)

    # Do normal HTML -> text conversion.
    text = doc.get_text()

    # Deduplicate spaces.
    text = re.sub(r'(\s)\s+', r'\1', text, flags=re.MULTILINE).strip()
    return text




[docs]
def purge_tags(html):
    """
    Return `html` with <script> and <style> tags removed

    :param str html: HTML string
    """
    def is_javascript(tag):
        # Match <script> tags that don't have type="application/ld+json".
        if tag.name == 'script':
            return tag.get('type') != 'application/ld+json'

        # Match <style> (CSS) tags.
        elif tag.name == 'style':
            return True

        return False

    soup = parse(html)
    for script_tag in soup.find_all(is_javascript):
        script_tag.decompose()
    return str(soup)