Source code for parslepy.selectors

# -*- coding: utf-8 -*-
import re
import copy

import lxml.cssselect
import lxml.etree

import parslepy.funcs


class Selector(object):
    """
    Class of objects returned by :class:`.SelectorHandler` instances'
    (and subclasses) :meth:`~.SelectorHandler.make` method.

    It is a thin wrapper: the handler-specific compiled selector is stored
    as-is in the ``selector`` attribute.
    """

    def __init__(self, selector):
        """
        :param selector: the underlying selector object to wrap
        """
        self.selector = selector

    def __repr__(self):
        # Wrap the operand in a 1-tuple: with a bare operand, a tuple-valued
        # selector would be unpacked as several format arguments by "%".
        return "<Selector: inner=%s>" % (self.selector,)
class SelectorHandler(object):
    """
    Base class for selector handlers.

    A handler is called when building abstract Parsley trees and when
    extracting object values during the actual parsing of documents.

    Subclass it to implement the selector processing logic you need for
    your Parsley handling.

    All 3 methods, :meth:`~.SelectorHandler.make`,
    :meth:`~.SelectorHandler.select` and :meth:`~.SelectorHandler.extract`
    MUST be overridden: the implementations here only raise
    :exc:`NotImplementedError`.
    """

    # class-wide default; an instance created with debug=True shadows it
    DEBUG = False

    def __init__(self, debug=False):
        """
        :param debug: when true, set ``DEBUG`` to ``True`` on this instance
        """
        if not debug:
            return
        self.DEBUG = True

    def make(self, selection_string):
        """
        Interpret a selection_string as a selector
        for elements or element attributes in a (semi-)structured document.
        In case of XPath selectors, this can also be a function call.

        :param selection_string: a string representing a selector
        :rtype: :class:`.Selector`
        """
        raise NotImplementedError

    def select(self, document, selector):
        """
        Apply the selector on the document

        :param document: lxml-parsed document
        :param selector: input :class:`.Selector` to apply on the document
        :rtype: lxml.etree.Element list
        """
        raise NotImplementedError

    def extract(self, document, selector):
        """
        Apply the selector on the document
        and return a value for the matching elements (text content or
        element attributes)

        :param document: lxml-parsed document
        :param selector: input :class:`.Selector` to apply on the document
        :rtype: depends on the selector (string, boolean value, ...)

        Return value can be single- or multi-valued.
        """
        raise NotImplementedError
class XPathSelectorHandler(SelectorHandler):
    """
    This selector only accepts XPath selectors.

    It understands what lxml.etree.XPath understands, that is XPath 1.0
    expressions
    """

    # types (other than elements) that selector evaluation may return
    # and that are passed through unchanged by _extract_single()
    EXPECTED_NON_ELEMENT_TYPES = [
        bool,
        int,
        float,
        str,
    ]
    try:
        unicode     # Python 2.x
        EXPECTED_NON_ELEMENT_TYPES.append(unicode)
    except NameError:
        pass

    LOCAL_NAMESPACE = 'local-parslepy'
    LOCAL_XPATH_EXTENSIONS = {
        (LOCAL_NAMESPACE, 'text'): parslepy.funcs.xpathtostring,
        (LOCAL_NAMESPACE, 'textnl'): parslepy.funcs.xpathtostringnl,

        # aliases
        (LOCAL_NAMESPACE, 'str'): parslepy.funcs.xpathtostring,
        (LOCAL_NAMESPACE, 'strnl'): parslepy.funcs.xpathtostringnl,
        (LOCAL_NAMESPACE, 'nl'): parslepy.funcs.xpathtostringnl,

        (LOCAL_NAMESPACE, 'html'): parslepy.funcs.xpathtohtml,
        (LOCAL_NAMESPACE, 'xml'): parslepy.funcs.xpathtoxml,
        (LOCAL_NAMESPACE, 'strip'): parslepy.funcs.xpathstrip,

        (LOCAL_NAMESPACE, 'attrname'): parslepy.funcs.xpathattrname,
        # alias that's probably a better fit
        (LOCAL_NAMESPACE, 'attrnames'): parslepy.funcs.xpathattrname,
    }
    EXSLT_NAMESPACES = {
        'date': 'http://exslt.org/dates-and-times',
        'math': 'http://exslt.org/math',
        're': 'http://exslt.org/regular-expressions',
        'set': 'http://exslt.org/sets',
        'str': 'http://exslt.org/strings',
    }

    # Class-level fallback only (e.g. for subclasses that do not call this
    # __init__); __init__ installs a per-instance dict.
    _extension_router = {}

    SMART_STRINGS = False
    SMART_STRINGS_FUNCTIONS = [
        (LOCAL_NAMESPACE, 'attrname'),
        (LOCAL_NAMESPACE, 'attrnames'),
    ]

    # Class-level fallback only; __init__ installs a per-instance dict.
    _selector_cache = {}

    def __init__(self, namespaces=None, extensions=None, context=None, debug=False):
        """
        :param namespaces: namespace mapping as :class:`dict`
        :param extensions: extension :class:`dict`
        :param context: user-context passed to XPath extension functions

        `namespaces` and `extensions` dicts should have the same format
        as for `lxml`_:
        see http://lxml.de/xpathxslt.html#namespaces-and-prefixes
        and `<http://lxml.de/extensions.html#xpath-extension-functions>`_

        Extension functions have a slightly different signature than
        pure-lxml extension functions: they must expect a user-context
        as first argument; all other arguments are the same as for
        `lxml` extensions.

        `context` will be passed as first argument to extension functions
        registered through `extensions`.
        Alternative: user-context can also be passed to
        :meth:`parslepy.base.Parselet.parse`
        """
        super(XPathSelectorHandler, self).__init__(debug=debug)

        # Compiled selectors depend on this handler's namespaces and
        # extensions (and, in subclasses, on how the selection string is
        # interpreted), so the cache and the extension routing table must
        # not be shared between handlers.
        self._selector_cache = {}
        self._extension_router = {}

        # support EXSLT extensions
        self.namespaces = copy.copy(self.EXSLT_NAMESPACES)

        # add local XPath extension functions
        self._add_parsley_ns(self.namespaces)
        self.extensions = copy.copy(self.LOCAL_XPATH_EXTENSIONS)

        # add user-defined extensions
        self._user_extensions = None
        self.context = context
        if namespaces:
            self.namespaces.update(namespaces)
        if extensions:
            self._user_extensions = extensions
            self._process_extensions(extensions)

        # some functions need smart_strings=True
        self._set_smart_strings_regexps()

    def _test_smart_strings_needed(self, selector):
        """
        Tell whether the selector string calls one of the functions
        registered as needing ``smart_strings=True``.
        """
        return any(r.search(selector) for r in self.smart_strings_regexps)

    def _get_smart_strings_regexps(self, ns, fname):
        """
        Build one compiled pattern matching ``<prefix>:<fname>(`` for each
        prefix currently mapped to namespace `ns`.
        """
        # find out what prefixes match the supplied namespace
        prefix_matches = [
            prefix
            for prefix, namespace in self.namespaces.items()
            if namespace == ns
        ]
        # escape the whole literal: prefixes and function names may contain
        # characters (".", "-") that are special in regular expressions
        return [
            re.compile(re.escape("%s:%s(" % (p, fname)))
            for p in prefix_matches
        ]

    def _set_smart_strings_regexps(self):
        """
        (Re)build ``self.smart_strings_regexps`` from the built-in
        SMART_STRINGS_FUNCTIONS and from the user-defined extensions.
        """
        self.smart_strings_regexps = []

        # smart_strings for built-in extensions
        for (ns, fname) in self.SMART_STRINGS_FUNCTIONS:
            self.smart_strings_regexps.extend(
                self._get_smart_strings_regexps(ns, fname))

        # smart_strings for user_defined extensions
        if self._user_extensions:
            for (ns, fname) in self._user_extensions:
                self.smart_strings_regexps.extend(
                    self._get_smart_strings_regexps(ns, fname))

    def _make_xpathextension(self, ns, fname):
        """
        Create the wrapper registered with lxml for user extension
        ``(ns, fname)``: it prepends ``self.context`` to the arguments
        and dispatches through ``self._extension_router``.
        """
        def xpath_ext(*args):
            # router and context are looked up at call time
            return self._extension_router[(ns, fname)](self.context, *args)

        extension_name = str("xpext_%s_%d" % (fname, hash(ns)))
        xpath_ext.__doc__ = "docstring for %s" % extension_name
        xpath_ext.__name__ = extension_name
        setattr(self, xpath_ext.__name__, xpath_ext)
        return xpath_ext

    def _process_extensions(self, extensions):
        """
        Register user-defined extension functions: the raw function goes
        into the router, the context-passing wrapper into
        ``self.extensions``.
        """
        for (ns, fname), func in extensions.items():
            self._extension_router[(ns, fname)] = func
            self.extensions[(ns, fname)] = self._make_xpathextension(
                ns=ns, fname=fname)

    @classmethod
    def _add_parsley_ns(cls, namespace_dict):
        """
        Extend XPath evaluation with Parsley extensions' namespace

        The supplied dict is updated in place and also returned.
        """
        namespace_dict.update({
            'parslepy': cls.LOCAL_NAMESPACE,
            'parsley': cls.LOCAL_NAMESPACE,
        })
        return namespace_dict

    def make(self, selection):
        """
        XPath expression can also use EXSLT functions (as long as they are
        understood by libxslt)

        :param selection: XPath expression as a string
        :rtype: :class:`.Selector`
        :raises lxml.etree.XPathSyntaxError: for invalid expressions; the
            offending expression is appended to the error message
        """
        cached = self._selector_cache.get(selection)
        if cached is not None:
            return cached

        try:
            selector = lxml.etree.XPath(
                selection,
                namespaces=self.namespaces,
                extensions=self.extensions,
                smart_strings=(self.SMART_STRINGS
                               or self._test_smart_strings_needed(selection)),
            )
        except lxml.etree.XPathSyntaxError as syntax_error:
            syntax_error.msg += ": %s" % selection
            raise syntax_error
        except Exception as e:
            if self.DEBUG:
                print(repr(e), selection)
            raise

        # wrap it/cache it
        self._selector_cache[selection] = Selector(selector)
        return self._selector_cache[selection]

    @classmethod
    def select(cls, document, selector):
        """
        Apply the selector on the document.

        :param document: lxml-parsed document
        :param selector: :class:`.Selector` wrapping a callable selector
        :return: whatever the wrapped selector returns, or ``None`` if
            evaluating it raised an exception (best-effort evaluation)
        """
        try:
            return selector.selector(document)
        except Exception as e:
            # NOTE(review): this is a classmethod, so cls.DEBUG is the
            # class-level flag; passing debug=True to the constructor only
            # sets the flag on the instance and does not enable this print.
            if cls.DEBUG:
                print(str(e))
            return

    def extract(self, document, selector, debug_offset=''):
        """
        Try and convert matching Elements to unicode strings.

        If this fails, the selector evaluation probably already
        returned some string(s) of some sort, or boolean value,
        or int/float, so return that instead.

        :return: a list of values for list/tuple results, a single value
            otherwise, or ``None`` when nothing matched
        """
        selected = self.select(document, selector)
        if selected is not None:

            if isinstance(selected, (list, tuple)):

                # FIXME: return None or return empty list?
                if not len(selected):
                    return

                return [self._extract_single(m) for m in selected]

            else:
                return self._extract_single(selected)

        # selector did not match anything
        else:
            if self.DEBUG:
                print(debug_offset, "selector did not match anything; return None")
            return None

    def _default_element_extract(self, element):
        """
        Overridable method to change how matching Elements
        are represented in output
        """
        return parslepy.funcs.extract_text(element)

    def _extract_single(self, retval):
        """
        Convert one item of a selector result into an output value.

        :raises Warning: if the item is of an unexpected type
        """
        # XPath compiled expressions (and CSSSelect translations)
        # can return different types
        # See http://lxml.de/xpathxslt.html#xpath-return-values
        # - True or False, when the XPath expression
        #       has a boolean result
        # - a float, when the XPath expression has a numeric result
        #       (integer or float)
        # - a 'smart' string (as described below),
        #       when the XPath expression has a string result.
        # - a list of items, when the XPath expression has a list as result.
        #       The items may include Elements
        #       (also comments and processing instructions),
        #       strings and tuples.
        #
        # Note that in the default implementation,
        # smart strings are disabled

        # isinstance (rather than an exact type comparison) so that element
        # subclasses, e.g. lxml.html.HtmlElement, are handled as elements
        if isinstance(retval, (lxml.etree._Element, lxml.etree._Comment)):
            return self._default_element_extract(retval)
        elif isinstance(retval, tuple(self.EXPECTED_NON_ELEMENT_TYPES)):
            return retval
        else:
            raise Warning("unusual type %s" % type(retval))
try:
    from cssselect import HTMLTranslator
    from cssselect.xpath import ExpressionError, XPathExpr

    try:
        from cssselect.xpath import _unicode_safe_getattr
    except ImportError:
        # Underscore-prefixed helper, not part of cssselect's public API,
        # so it may be missing; the builtin has the same call signature.
        _unicode_safe_getattr = getattr

    try:
        from cssselect.parser import FunctionalPseudoElement
    except ImportError:
        # without it, pseudo-elements are left untranslated (see below)
        FunctionalPseudoElement = None

    class CssTranslator(HTMLTranslator):
        """
        CSS-to-XPath translator adding the ``::attr(name)``, ``::text``
        and ``::comment`` pseudo-elements on top of ``HTMLTranslator``.
        """

        def xpath_pseudo_element(self, xpath, pseudo_element):
            """
            Dispatch a pseudo-element to the matching
            ``xpath_<name>_functional_pseudo_element`` or
            ``xpath_<name>_simple_pseudo_element`` method.

            :raises ExpressionError: if no such method exists
            """
            if FunctionalPseudoElement is None:
                # cssselect.parser.FunctionalPseudoElement could not be
                # imported: return the expression unchanged
                return xpath

            if isinstance(pseudo_element, FunctionalPseudoElement):
                method = 'xpath_%s_functional_pseudo_element' % (
                    pseudo_element.name.replace('-', '_'))
                method = _unicode_safe_getattr(self, method, None)
                if not method:
                    raise ExpressionError(
                        "The functional pseudo-element ::%s() is unknown"
                        % pseudo_element.name)
                xpath = method(xpath, pseudo_element.arguments)
            else:
                method = 'xpath_%s_simple_pseudo_element' % (
                    pseudo_element.replace('-', '_'))
                method = _unicode_safe_getattr(self, method, None)
                if not method:
                    raise ExpressionError(
                        "The pseudo-element ::%s is unknown"
                        % pseudo_element)
                xpath = method(xpath)
            return xpath

        # functional pseudo-element:
        # element's attribute by name
        def xpath_attr_functional_pseudo_element(self, xpath, arguments):
            attribute_name = arguments[0].value
            other = XPathExpr('@%s' % attribute_name, '', )
            return xpath.join('/', other)

        # pseudo-element:
        # element's text() nodes
        def xpath_text_simple_pseudo_element(self, xpath):
            other = XPathExpr('text()', '', )
            return xpath.join('/', other)

        # pseudo-element:
        # element's comment() nodes
        def xpath_comment_simple_pseudo_element(self, xpath):
            other = XPathExpr('comment()', '', )
            return xpath.join('/', other)

    css_translator = CssTranslator()

    def css_to_xpath(css):
        """Translate a CSS selector string to XPath with CssTranslator."""
        return css_translator.css_to_xpath(css)

except ImportError:
    def css_to_xpath(css):
        """Translate a CSS selector string with lxml.cssselect.css_to_xpath."""
        return lxml.cssselect.css_to_xpath(css)
class DefaultSelectorHandler(XPathSelectorHandler):
    """
    Default selector logic, loosely based on the original
    `Parsley` implementation.

    This handler understands what cssselect and lxml.etree.XPath understands,
    that is (roughly) XPath 1.0 and CSS3 for things that don't need browser
    context
    """

    # newer lxml version (>3) raise SelectorSyntaxError (directly from cssselect)
    # for invalid CSS selectors
    # but older lxml (2.3.8 for example) have cssselect included
    # and for some selectors raise AssertionError and TypeError instead
    CSSSELECT_SYNTAXERROR_EXCEPTIONS = set([
        # we could use lxml.cssselect.SelectorError (parent class for both),
        # but for lxml<3, they're not related
        lxml.cssselect.SelectorSyntaxError,
        # for unsupported pseudo-class or XPath namespaces prefix syntax
        lxml.cssselect.ExpressionError,
    ])

    # this is to add AssertionError and TypeError if lxml < 3.0.0
    for s in ('#a.', '//h1'):
        try:
            lxml.cssselect.CSSSelector(s)
        except Exception as e:
            CSSSELECT_SYNTAXERROR_EXCEPTIONS.add(type(e))

    # example: "a img @src" (fetch the 'src' attribute of an IMG tag)
    # other example: "im|img @im|src" when using namespace prefixes
    REGEX_ENDING_ATTRIBUTE = re.compile(r'^(?P<expr>.+)\s+(?P<attr>@[\:|\w_\d-]+)$')

    def make(self, selection):
        """
        Scopes and selectors are tested in this order:
        * is this a CSS selector with an appended @something attribute?
        * is this a regular CSS selector?
        * is this an XPath expression?

        XPath expression can also use EXSLT functions (as long as they are
        understood by libxslt)

        :param selection: CSS selector (optionally followed by ``@attr``)
            or XPath expression, as a string
        :rtype: :class:`.Selector`
        :raises lxml.etree.XPathSyntaxError: if the string cannot be compiled
            as an XPath expression either; the offending string is appended
            to the error message
        """
        cached = self._selector_cache.get(selection)
        if cached is not None:
            return cached

        # Compilation below uses self.namespaces.  The class-level
        # EXSLT_NAMESPACES dict is deliberately left alone here: updating it
        # in place would modify a mapping shared by all handler classes.

        try:
            # CSS with attribute? (non-standard but convenient)
            # CSS selector cannot select attributes
            # this "<css selector> @<attr>" syntax is a Parsley extension
            # construct CSS selector and append attribute to XPath expression
            m = self.REGEX_ENDING_ATTRIBUTE.match(selection)
            if m:
                # the selector should be a regular CSS selector
                cssxpath = css_to_xpath(m.group("expr"))

                # if "|" is used for namespace prefix reference,
                #   convert it to XPath prefix syntax
                attribute = m.group("attr").replace('|', ':')

                cssxpath = "%s/%s" % (cssxpath, attribute)
            else:
                cssxpath = css_to_xpath(selection)

            selector = lxml.etree.XPath(
                cssxpath,
                namespaces=self.namespaces,
                extensions=self.extensions,
                smart_strings=(self.SMART_STRINGS
                               or self._test_smart_strings_needed(selection)),
            )

        except tuple(self.CSSSELECT_SYNTAXERROR_EXCEPTIONS) as css_error:
            if self.DEBUG:
                print(repr(css_error), selection)
                print("Try interpreting as XPath selector")
            try:
                selector = lxml.etree.XPath(
                    selection,
                    namespaces=self.namespaces,
                    extensions=self.extensions,
                    smart_strings=(self.SMART_STRINGS
                                   or self._test_smart_strings_needed(selection)),
                )
            except lxml.etree.XPathSyntaxError as xpath_error:
                xpath_error.msg += ": %s" % selection
                raise xpath_error
            except Exception as e:
                if self.DEBUG:
                    print(repr(e), selection)
                raise

        # for exception when trying to convert <cssselector> @<attribute> syntax
        except lxml.etree.XPathSyntaxError as xpath_error:
            xpath_error.msg += ": %s" % selection
            raise xpath_error

        except Exception as e:
            if self.DEBUG:
                print(repr(e), selection)
            raise

        # wrap it/cache it
        self._selector_cache[selection] = Selector(selector)
        return self._selector_cache[selection]