# Source code for parslepy.selectors

# -*- coding: utf-8 -*-
import re
import copy

import lxml.cssselect
import lxml.etree

import parslepy.funcs


class Selector(object):
    """
    Thin wrapper around a compiled selector object.

    Class of objects returned by :class:`.SelectorHandler` instances'
    (and subclasses) :meth:`~.SelectorHandler.make` method.
    """

    def __init__(self, selector):
        """
        :param selector: the underlying selector object to wrap; it is
            stored untouched as the ``selector`` attribute
        """
        self.selector = selector

    def __repr__(self):
        # wrap in a 1-tuple so that a tuple-valued selector is formatted
        # as a single value instead of being unpacked by the % operator
        return "<Selector: inner=%s>" % (self.selector,)


class SelectorHandler(object):
    """
    Called when building abstract Parsley trees
    and when extracting object values during the actual parsing
    of documents

    This should be subclassed to implement the selector processing logic
    you need for your Parsley handling.

    All 3 methods, :meth:`~.SelectorHandler.make`, :meth:`~.SelectorHandler.select`
    and :meth:`~.SelectorHandler.extract` MUST be overridden
    """

    # class-wide default; __init__ shadows it on the instance when
    # debugging is requested
    DEBUG = False

    def __init__(self, debug=False):
        """
        :param debug: when true, set ``DEBUG`` to ``True`` on this instance
            (the class-level default is left untouched)
        """
        if debug:
            self.DEBUG = True

    def make(self, selection_string):
        """
        Interpret a selection_string as a selector
        for elements or element attributes in a (semi-)structured document.
        In case of XPath selectors, this can also be a function call.

        :param selection_string: a string representing a selector
        :rtype: :class:`.Selector`
        :raises NotImplementedError: always; subclasses must override
        """

        raise NotImplementedError

    def select(self, document, selector):
        """
        Apply the selector on the document

        :param document: lxml-parsed document
        :param selector: input :class:`.Selector` to apply on the document
        :rtype: lxml.etree.Element list
        :raises NotImplementedError: always; subclasses must override
        """

        raise NotImplementedError

    def extract(self, document, selector):
        """
        Apply the selector on the document
        and return a value for the matching elements (text content or
        element attributes)

        :param document: lxml-parsed document
        :param selector: input :class:`.Selector`  to apply on the document
        :rtype: depends on the selector (string, boolean value, ...)
        :raises NotImplementedError: always; subclasses must override

        Return value can be single- or multi-valued.
        """

        raise NotImplementedError


class XPathSelectorHandler(SelectorHandler):
    """
    This selector only accepts XPath selectors.

    It understands what lxml.etree.XPath understands, that is XPath 1.0
    expressions
    """

    # non-element result types that _extract_single() returns untouched
    EXPECTED_NON_ELEMENT_TYPES = [
        bool,
        int,
        float,
        str,
    ]
    try:
        unicode         # Python 2.x
        EXPECTED_NON_ELEMENT_TYPES.append(unicode)
    except NameError:
        pass

    LOCAL_NAMESPACE = 'local-parslepy'
    LOCAL_XPATH_EXTENSIONS = {
        (LOCAL_NAMESPACE, 'text'): parslepy.funcs.xpathtostring,
        (LOCAL_NAMESPACE, 'textnl'): parslepy.funcs.xpathtostringnl,

        # aliases
        (LOCAL_NAMESPACE, 'str'): parslepy.funcs.xpathtostring,
        (LOCAL_NAMESPACE, 'strnl'): parslepy.funcs.xpathtostringnl,
        (LOCAL_NAMESPACE, 'nl'): parslepy.funcs.xpathtostringnl,

        (LOCAL_NAMESPACE, 'html'): parslepy.funcs.xpathtohtml,
        (LOCAL_NAMESPACE, 'xml'): parslepy.funcs.xpathtoxml,
        (LOCAL_NAMESPACE, 'strip'): parslepy.funcs.xpathstrip,

        (LOCAL_NAMESPACE, 'attrname'): parslepy.funcs.xpathattrname,
        # alias that's probably a better fit
        (LOCAL_NAMESPACE, 'attrnames'): parslepy.funcs.xpathattrname,
    }
    EXSLT_NAMESPACES = {
        'date': 'http://exslt.org/dates-and-times',
        'math': 'http://exslt.org/math',
        're': 'http://exslt.org/regular-expressions',
        'set': 'http://exslt.org/sets',
        'str': 'http://exslt.org/strings',
    }
    # class-level fallback only; __init__ gives every instance its own router
    _extension_router = {}

    SMART_STRINGS = False
    # built-in extension functions that need smart_strings=True
    SMART_STRINGS_FUNCTIONS = [
        (LOCAL_NAMESPACE, 'attrname'),
        (LOCAL_NAMESPACE, 'attrnames'),
    ]

    # class-level fallback only; __init__ gives every instance its own cache
    _selector_cache = {}

    def __init__(self, namespaces=None, extensions=None, context=None, debug=False):
        """
        :param namespaces: namespace mapping as :class:`dict`
        :param extensions: extension :class:`dict`
        :param context: user-context passed to XPath extension functions
        :param debug: forwarded to :class:`.SelectorHandler`

        `namespaces` and `extensions` dicts should have the same format
        as for `lxml`_:
        see http://lxml.de/xpathxslt.html#namespaces-and-prefixes
        and `<http://lxml.de/extensions.html#xpath-extension-functions>`_

        Extension functions have a slightly different signature than
        pure-lxml extension functions: they must expect a user-context
        as first argument; all other arguments are the same as for
        `lxml` extensions.

        `context` will be passed as first argument to extension functions
        registered through `extensions`.
        Alternative: user-context can also be passed to :meth:`parslepy.base.Parselet.parse`

        """

        super(XPathSelectorHandler, self).__init__(debug=debug)

        # Compiled selectors embed this handler's namespaces and extensions,
        # and the cache is keyed on the selection string alone, so neither
        # the cache nor the extension router may be shared between handlers.
        self._selector_cache = {}
        self._extension_router = {}

        # support EXSLT extensions
        self.namespaces = copy.copy(self.EXSLT_NAMESPACES)

        # add local XPath extension functions
        self._add_parsley_ns(self.namespaces)
        self.extensions = copy.copy(self.LOCAL_XPATH_EXTENSIONS)

        # add user-defined extensions
        self._user_extensions = None
        self.context = context
        if namespaces:
            self.namespaces.update(namespaces)
        if extensions:
            self._user_extensions = extensions
            self._process_extensions(extensions)

        # some functions need smart_strings=True
        self._set_smart_strings_regexps()

    def _test_smart_strings_needed(self, selector):
        """
        Tell whether `selector` calls a function that needs smart strings.

        :param selector: selection string
        :rtype: bool
        """
        return any(regexp.search(selector)
                   for regexp in self.smart_strings_regexps)

    def _get_smart_strings_regexps(self, ns, fname):
        """
        Build one regexp per prefix currently bound to namespace `ns`,
        each matching a ``prefix:fname(`` call.

        :param ns: namespace URI
        :param fname: function name (without prefix)
        :rtype: list of compiled regular expressions
        """
        # prefixes and names are matched literally, hence re.escape();
        # a prefixed call needs a non-empty prefix, so empty ones are skipped
        return [
            re.compile(r"%s:%s\(" % (re.escape(prefix), re.escape(fname)))
            for prefix, namespace in self.namespaces.items()
            if namespace == ns and prefix
        ]

    def _set_smart_strings_regexps(self):
        """
        (Re)compute ``self.smart_strings_regexps`` from the built-in
        SMART_STRINGS_FUNCTIONS and from all user-defined extensions.
        """
        function_keys = list(self.SMART_STRINGS_FUNCTIONS)
        if self._user_extensions:
            # iterating the extensions dict yields its (ns, fname) keys
            function_keys.extend(self._user_extensions)

        self.smart_strings_regexps = []
        for (ns, fname) in function_keys:
            self.smart_strings_regexps.extend(
                self._get_smart_strings_regexps(ns, fname))

    def _make_xpathextension(self, ns, fname):
        """
        Wrap the user function registered under ``(ns, fname)`` so that
        it receives ``self.context`` as its first argument.

        :param ns: namespace URI of the extension function
        :param fname: name of the extension function
        :returns: the wrapper, also set as an attribute of this handler
        """
        def xpath_ext(*args):
            # looked up at call time, so the current context/router are used
            return self._extension_router[(ns, fname)](self.context, *args)

        extension_name = str("xpext_%s_%d" % (fname, hash(ns)))
        xpath_ext.__doc__ = "docstring for %s" % extension_name
        xpath_ext.__name__ = extension_name
        setattr(self, xpath_ext.__name__, xpath_ext)

        return xpath_ext

    def _process_extensions(self, extensions):
        """
        Register user-defined extension functions: the raw function goes in
        the router, a context-passing wrapper goes in ``self.extensions``.

        :param extensions: dict mapping ``(ns, fname)`` to a callable
        """
        for (ns, fname), func in extensions.items():
            self._extension_router[(ns, fname)] = func
            self.extensions[(ns, fname)] = self._make_xpathextension(ns=ns, fname=fname)

    @classmethod
    def _add_parsley_ns(cls, namespace_dict):
        """
        Extend XPath evaluation with Parsley extensions' namespace

        :param namespace_dict: prefix-to-namespace dict, updated in place
        :returns: the same (updated) dict
        """

        namespace_dict.update({
            'parslepy': cls.LOCAL_NAMESPACE,
            'parsley': cls.LOCAL_NAMESPACE,
        })
        return namespace_dict

    def make(self, selection):
        """
        Compile `selection` as an XPath expression and wrap it in a
        :class:`.Selector`; results are cached per selection string.

        XPath expression can also use EXSLT functions (as long as they are
        understood by libxslt)

        :param selection: XPath expression string
        :rtype: :class:`.Selector`
        :raises lxml.etree.XPathSyntaxError: for invalid expressions;
            the offending selection is appended to the error message
        """

        cached = self._selector_cache.get(selection)
        if cached is not None:
            return cached

        try:
            compiled = lxml.etree.XPath(
                selection,
                namespaces=self.namespaces,
                extensions=self.extensions,
                smart_strings=(self.SMART_STRINGS
                               or self._test_smart_strings_needed(selection)),
            )

        except lxml.etree.XPathSyntaxError as syntax_error:
            syntax_error.msg += ": %s" % selection
            # bare raise re-raises the same (amended) exception object
            raise

        except Exception as e:
            if self.DEBUG:
                print(repr(e), selection)
            raise

        # wrap it/cache it
        wrapped = Selector(compiled)
        self._selector_cache[selection] = wrapped
        return wrapped

    @classmethod
    def select(cls, document, selector):
        """
        Evaluate the wrapped selector against `document`.

        :param document: lxml-parsed document
        :param selector: :class:`.Selector` to apply
        :returns: whatever the compiled selector returns, or ``None``
            if evaluating it raised an exception
        """
        # NOTE(review): as a classmethod this only sees the class-level
        # DEBUG flag, not the one set on an instance through debug=True.
        try:
            return selector.selector(document)
        except Exception as e:
            if cls.DEBUG:
                print(str(e))
            return None

    def extract(self, document, selector, debug_offset=''):
        """
        Try and convert matching Elements to unicode strings.

        If this fails, the selector evaluation probably already
        returned some string(s) of some sort, or boolean value,
        or int/float, so return that instead.

        :param document: lxml-parsed document
        :param selector: :class:`.Selector` to apply
        :param debug_offset: prefix for the debug message
        :returns: a list of extracted values for list/tuple results,
            a single value otherwise, or ``None`` when nothing was
            selected (including an empty result list)
        """
        selected = self.select(document, selector)

        # selector did not match anything
        if selected is None:
            if self.DEBUG:
                print(debug_offset, "selector did not match anything; return None")
            return None

        if isinstance(selected, (list, tuple)):
            # FIXME: return None or return empty list?
            if not selected:
                return None
            return [self._extract_single(m) for m in selected]

        return self._extract_single(selected)

    def _default_element_extract(self, element):
        """
        Overridable method to change how matching Elements
        are represented in output

        :param element: matching lxml element
        :returns: result of :func:`parslepy.funcs.extract_text`
        """

        return parslepy.funcs.extract_text(element)

    def _extract_single(self, retval):
        """
        Convert one XPath result item to its output value.

        :param retval: a single item produced by the selector
        :returns: extracted representation for element nodes, the value
            itself for the EXPECTED_NON_ELEMENT_TYPES
        :raises Warning: for any other type
        """
        # XPath compiled expressions (and CSSSelect translations)
        # can return different types
        # See http://lxml.de/xpathxslt.html#xpath-return-values
        # - True or False, when the XPath expression
        #       has a boolean result
        # - a float, when the XPath expression has a numeric result
        #       (integer or float)
        # - a 'smart' string (as described below),
        #       when the XPath expression has a string result.
        # - a list of items, when the XPath expression has a list as result.
        #       The items may include Elements
        #       (also comments and processing instructions),
        #       strings and tuples.
        #
        #   Note that in the default implementation,
        #   smart strings are disabled

        # isinstance() rather than an exact type comparison, so that
        # _Element subclasses (e.g. lxml.html.HtmlElement) are handled.
        # lxml's comment class derives from _Element, so this one check
        # covers both elements and comments; other _Element-derived nodes
        # (e.g. processing instructions) are extracted the same way.
        if isinstance(retval, lxml.etree._Element):
            return self._default_element_extract(retval)

        if isinstance(retval, tuple(self.EXPECTED_NON_ELEMENT_TYPES)):
            return retval

        raise Warning("unusual type %s" % type(retval))

try:
    from cssselect import HTMLTranslator
    # ExpressionError is raised below for unknown pseudo-elements
    from cssselect.xpath import ExpressionError, XPathExpr

    try:
        from cssselect.xpath import _unicode_safe_getattr
    except ImportError:
        # private cssselect helper; use the builtin when it is not exported
        _unicode_safe_getattr = getattr

    try:
        from cssselect.parser import FunctionalPseudoElement
    except ImportError:
        # cssselect without functional pseudo-element support:
        # xpath_pseudo_element() then leaves expressions untouched
        FunctionalPseudoElement = None

    class CssTranslator(HTMLTranslator):
        """
        HTML CSS-to-XPath translator extended with pseudo-elements:
        ``::attr(name)``, ``::text`` and ``::comment``.
        """

        def xpath_pseudo_element(self, xpath, pseudo_element):
            """
            Dispatch a pseudo-element to the matching
            ``xpath_<name>_functional_pseudo_element`` or
            ``xpath_<name>_simple_pseudo_element`` method.

            :param xpath: XPath expression built so far
            :param pseudo_element: pseudo-element name, or a
                FunctionalPseudoElement instance
            :returns: the extended XPath expression
            :raises ExpressionError: if no method handles the pseudo-element
            """
            if FunctionalPseudoElement is None:
                return xpath

            if isinstance(pseudo_element, FunctionalPseudoElement):
                method_name = 'xpath_%s_functional_pseudo_element' % (
                    pseudo_element.name.replace('-', '_'))
                method = _unicode_safe_getattr(self, method_name, None)
                if not method:
                    raise ExpressionError(
                        "The functional pseudo-element ::%s() is unknown"
                        % pseudo_element.name)
                return method(xpath, pseudo_element.arguments)

            method_name = 'xpath_%s_simple_pseudo_element' % (
                pseudo_element.replace('-', '_'))
            method = _unicode_safe_getattr(self, method_name, None)
            if not method:
                raise ExpressionError(
                    "The pseudo-element ::%s is unknown"
                    % pseudo_element)
            return method(xpath)

        # functional pseudo-element:
        # element's attribute by name
        def xpath_attr_functional_pseudo_element(self, xpath, arguments):
            attribute_name = arguments[0].value
            other = XPathExpr('@%s' % attribute_name, '', )
            return xpath.join('/', other)

        # pseudo-element:
        # element's text() nodes
        def xpath_text_simple_pseudo_element(self, xpath):
            other = XPathExpr('text()', '', )
            return xpath.join('/', other)

        # pseudo-element:
        # element's comment() nodes
        def xpath_comment_simple_pseudo_element(self, xpath):
            other = XPathExpr('comment()', '', )
            return xpath.join('/', other)

    css_translator = CssTranslator()

    def css_to_xpath(css):
        """Translate a CSS selector string to XPath with CssTranslator."""
        return css_translator.css_to_xpath(css)

except ImportError:

    def css_to_xpath(css):
        """Translate a CSS selector string to XPath with lxml.cssselect."""
        # NOTE(review): relies on lxml.cssselect exposing css_to_xpath();
        # presumably only true for older lxml with bundled cssselect -- confirm
        return lxml.cssselect.css_to_xpath(css)


class DefaultSelectorHandler(XPathSelectorHandler):
    """
    Default selector logic, loosely based on the original
    `Parsley` implementation.

    This handler understands what cssselect and lxml.etree.XPath understands,
    that is (roughly) XPath 1.0 and CSS3 for things that dont need browser context
    """

    # newer lxml version (>3) raise SelectorSyntaxError (directly from cssselect)
    # for invalid CSS selectors
    # but older lxml (2.3.8 for example) have cssselect included
    # and for some selectors raise AssertionError and TypeError instead
    CSSSELECT_SYNTAXERROR_EXCEPTIONS = set([
        # we could use lxml.cssselect.SelectorError (parent class for both),
        # but for lxml<3, they're not related
        lxml.cssselect.SelectorSyntaxError,
        # for unsupported pseudo-class or XPath namespaces prefix syntax
        lxml.cssselect.ExpressionError,
    ])
    # this is to add AssertionError and TypeError if lxml < 3.0.0
    for s in ('#a.', '//h1'):
        try:
            lxml.cssselect.CSSSelector(s)
        except Exception as e:
            CSSSELECT_SYNTAXERROR_EXCEPTIONS.add(type(e))

    # example: "a img @src" (fetch the 'src' attribute of an IMG tag)
    # other example: "im|img @im|src" when using namespace prefixes
    REGEX_ENDING_ATTRIBUTE = re.compile(r'^(?P<expr>.+)\s+(?P<attr>@[\:|\w_\d-]+)$')

    def make(self, selection):
        """
        Scopes and selectors are tested in this order:
        * is this a CSS selector with an appended @something attribute?
        * is this a regular CSS selector?
        * is this an XPath expression?

        XPath expression can also use EXSLT functions (as long as they are
        understood by libxslt)

        :param selection: CSS selector (optionally followed by
            ``@attribute``) or XPath expression
        :rtype: :class:`.Selector`
        :raises lxml.etree.XPathSyntaxError: when the selection is neither
            valid CSS nor valid XPath; the selection is appended to the
            error message
        """
        cached = self._selector_cache.get(selection)
        if cached is not None:
            return cached

        # self.namespaces already holds the EXSLT and parsley prefixes
        # (set up in __init__); the class-level EXSLT_NAMESPACES dict is
        # shared and must not be modified here.
        try:
            # CSS with attribute? (non-standard but convenient)
            # CSS selector cannot select attributes
            # this "<css selector> @<attr>" syntax is a Parsley extension
            # construct CSS selector and append attribute to XPath expression
            m = self.REGEX_ENDING_ATTRIBUTE.match(selection)
            if m:
                # the selector should be a regular CSS selector
                cssxpath = css_to_xpath(m.group("expr"))

                # if "|" is used for namespace prefix reference,
                #   convert it to XPath prefix syntax
                attribute = m.group("attr").replace('|', ':')

                cssxpath = "%s/%s" % (cssxpath, attribute)
            else:
                cssxpath = css_to_xpath(selection)

            selector = lxml.etree.XPath(
                cssxpath,
                namespaces=self.namespaces,
                extensions=self.extensions,
                smart_strings=(self.SMART_STRINGS
                               or self._test_smart_strings_needed(selection)),
            )

        except tuple(self.CSSSELECT_SYNTAXERROR_EXCEPTIONS) as syntax_error:
            if self.DEBUG:
                print(repr(syntax_error), selection)
                print("Try interpreting as XPath selector")
            # not valid CSS: the parent class compiles the selection as
            # XPath, with the same error reporting, and caches the result
            return super(DefaultSelectorHandler, self).make(selection)

        # for exception when trying to convert <cssselector> @<attribute> syntax
        except lxml.etree.XPathSyntaxError as syntax_error:
            syntax_error.msg += ": %s" % selection
            # bare raise re-raises the same (amended) exception object
            raise

        except Exception as e:
            if self.DEBUG:
                print(repr(e), selection)
            raise

        # wrap it/cache it
        wrapped = Selector(selector)
        self._selector_cache[selection] = wrapped
        return wrapped