# Source code for parslepy.base

# -*- coding: utf-8 -*-

from __future__ import unicode_literals
from parslepy.selectors import DefaultSelectorHandler, SelectorHandler, Selector
import lxml.etree
import lxml.html
import re
import json


# http://stackoverflow.com/questions/11301138/how-to-check-if-variable-is-string-with-python-2-and-3-compatibility
try:
    # Probe for the Python 2-only name; referencing it raises NameError
    # on interpreters that do not define it.
    basestring
except NameError:
    def isstr(s):
        """Return True when `s` is an instance of `str`."""
        return isinstance(s, str)
else:
    def isstr(s):
        """Return True when `s` is an instance of `basestring`."""
        return isinstance(s, basestring)

# ----------------------------------------------------------------------

# compiled Parsley scripts look like this
# ParsleyNode(
#       ParsleyContext(key, options[, Selector]): ParsleyNode(...),
#           ...or
#       ParsleyContext(key, options[, Selector]): Selector,
#       ...)
# --> a tree of ParsleyNode instances,
#     with terminal leaves of type Selector,
#     a parent ParsleyNode having 1 or more ParsleyNode children
#     referenced through ParsleyContext keys
#
class ParsleyNode(dict):
    """
    Node of a compiled Parsley script.

    A plain `dict` subclass that adds no behaviour of its own; it only
    gives compiled trees a distinct type that can be tested with
    `isinstance`.
    """


class ParsleyContext(object):
    """
    Holds the parameters attached to one extraction key of a
    `ParsleyNode` tree; instances serve as the keys of `ParsleyNode`
    objects.
    """

    def __init__(self, key, operator=None, required=True, scope=None, iterate=False):
        """
        Store the settings for one extraction key; only `key` is mandatory.

        Arguments:
        key                -- name of the extraction key
        operator (str)     -- "?" optional,  "!" for complete arrays; defaults to None (i.e. required)
        required (boolean) -- whether the key is required in the output (defaults to True)
        scope (`Selector`) -- restrict extraction to elements matching this selector
        iterate (boolean)  -- whether multiple objects will be extracted (defaults to False)
        """

        self.key, self.operator = key, operator
        self.required, self.scope, self.iterate = required, scope, iterate

    def __repr__(self):
        # same field order as the constructor arguments
        fields = (
            self.key,
            self.operator,
            self.required,
            self.scope,
            self.iterate,
        )
        return "<ParsleyContext: k=%s; op=%s; required=%s; scope=%s; iter=%s>" % fields


class NonMatchingNonOptionalKey(RuntimeError):
    """
    Raised by a :class:`.Parselet` instance while extracting content in strict mode,
    when a required key does not yield any content.

    >>> import parslepy
    >>> html = '''
    ... <!DOCTYPE html>
    ... <html>
    ... <head>
    ...     <title>Sample document to test parslepy</title>
    ...     <meta http-equiv="content-type" content="text/html;charset=utf-8" />
    ... </head>
    ... <body>
    ... <h1 id="main">What&rsquo;s new</h1>
    ... <ul>
    ...     <li class="newsitem"><a href="/article-001.html">This is the first article</a></li>
    ...     <li class="newsitem"><a href="/article-002.html">A second report on something</a></li>
    ...     <li class="newsitem"><a href="/article-003.html">Python is great!</a> <span class="fresh">New!</span></li>
    ... </ul>
    ... </body>
    ... </html>
    ... '''
    >>> rules = {
    ...     "heading1": "h1#main",
    ...     "heading2": "h2#main",
    ... }
    >>> p = parslepy.Parselet(rules, strict=True)
    >>> try:
    ...     p.parse_fromstring(html)
    ... except parslepy.base.NonMatchingNonOptionalKey as e:
    ...     print("Missing mandatory key")
    Missing mandatory key
    """
class InvalidKeySyntax(SyntaxError):
    """
    Raised when the input Parsley script's syntax is invalid

    >>> import parslepy
    >>> try:
    ...     p = parslepy.Parselet({"heading@": "#main"})
    ... except parslepy.base.InvalidKeySyntax as e:
    ...     print(e)
    Key heading@ is not valid
    """
class Parselet(object):
    """
    Compiled Parsley script: turns a dict of extraction rules into a tree
    of `ParsleyNode` objects and applies it to lxml-parsed documents.
    """

    DEBUG = False
    SPECIAL_LEVEL_KEY = "--"
    KEEP_ONLY_FIRST_ELEMENT_IF_LIST = True
    STRICT_MODE = False

    def __init__(self, parselet, selector_handler=None, strict=False, debug=False):
        """
        Take a parselet and optional selector_handler
        and build an abstract representation of the Parsley extraction
        logic.

        Two helper class methods can be used to instantiate a Parselet
        from JSON rules: :meth:`.from_jsonstring`, :meth:`.from_jsonfile`.

        :param dict parselet: Parsley script as a Python dict object
        :param boolean strict: Set to *True* if you want to
            enforce that missing required keys raise an Exception; default is False
            (i.e. lenient/non-strict mode)
        :param boolean debug: Set to *True* to print debugging information
            while compiling and extracting; default is False
        :param selector_handler: an instance of :class:`selectors.SelectorHandler`
            optional selector handler instance;
            defaults to an instance of :class:`selectors.DefaultSelectorHandler`
        :raises: :class:`.InvalidKeySyntax`
        :raises: :class:`ValueError` if `selector_handler` is not a
            :class:`selectors.SelectorHandler` instance, or if `parselet`
            is not a dict

        Example:

        >>> import parslepy
        >>> rules = {
        ...     "heading": "h1#main",
        ...     "news(li.newsitem)": [{
        ...         "title": ".",
        ...         "url": "a/@href"
        ...     }],
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> type(p)
        <class 'parslepy.base.Parselet'>

        Use :meth:`~base.Parselet.extract` or :meth:`~base.Parselet.parse`
        to get extracted content from documents.
        """

        # flags are only overridden on the instance when explicitly enabled,
        # otherwise the class-level defaults apply
        if debug:
            self.DEBUG = True
        if strict:
            self.STRICT_MODE = True

        self.parselet = parselet

        if not selector_handler:
            self.selector_handler = DefaultSelectorHandler(debug=self.DEBUG)

        elif not isinstance(selector_handler, SelectorHandler):
            raise ValueError("You must provide a SelectorHandler instance")

        else:
            self.selector_handler = selector_handler

        self.compile()

    # accept comments in parselets
    REGEX_COMMENT_LINE = re.compile(r'^\s*#')

    @classmethod
    def from_jsonfile(cls, fp, selector_handler=None, strict=False, debug=False):
        """
        Create a Parselet instance from a file containing
        the Parsley script as a JSON object

        >>> import parslepy
        >>> with open('parselet.json') as fp:
        ...     parslepy.Parselet.from_jsonfile(fp)
        ...
        <parslepy.base.Parselet object at 0x2014e50>

        :param file fp: an open file-like pointer containing the Parsley script
        :rtype: :class:`.Parselet`

        Other arguments: same as for :class:`.Parselet` constructor
        """

        return cls._from_jsonlines(fp,
            selector_handler=selector_handler, strict=strict, debug=debug)

    @classmethod
    def from_jsonstring(cls, s, selector_handler=None, strict=False, debug=False):
        """
        Create a Parselet instance from s (str) containing
        the Parsley script as JSON

        >>> import parslepy
        >>> parsley_string = '{ "title": "h1", "link": "a @href"}'
        >>> p = parslepy.Parselet.from_jsonstring(parsley_string)
        >>> type(p)
        <class 'parslepy.base.Parselet'>
        >>>

        :param string s: a Parsley script as a JSON string
        :rtype: :class:`.Parselet`

        Other arguments: same as for :class:`.Parselet` constructor
        """

        return cls._from_jsonlines(s.split("\n"),
            selector_handler=selector_handler, strict=strict, debug=debug)

    @classmethod
    def _from_jsonlines(cls, lines, selector_handler=None, strict=False, debug=False):
        """
        Interpret input lines as a JSON Parsley script.
        Python-style comment lines are skipped.
        """

        # drop lines matching REGEX_COMMENT_LINE before handing the text
        # to the JSON decoder
        script = "\n".join(
            [line for line in lines if not cls.REGEX_COMMENT_LINE.match(line)])
        return cls(json.loads(script),
            selector_handler=selector_handler, strict=strict, debug=debug)

    def parse(self, fp, parser=None, context=None):
        """
        Parse an HTML or XML document and
        return the extracted object following the Parsley rules given at instantiation.

        :param fp: file-like object containing an HTML or XML document, or URL or filename
        :param parser: *lxml.etree._FeedParser* instance (optional); defaults to lxml.etree.HTMLParser()
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python :class:`dict` object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`

        To parse from a string, use the :meth:`~base.Parselet.parse_fromstring` method instead.

        Note that the fp parameter is passed directly
        to `lxml.etree.parse <http://lxml.de/api/lxml.etree-module.html#parse>`_,
        so you can also give it an URL, and lxml will download it for you.
        (Also see `<http://lxml.de/tutorial.html#the-parse-function>`_.)
        """

        if parser is None:
            parser = lxml.etree.HTMLParser()
        doc = lxml.etree.parse(fp, parser=parser).getroot()
        return self.extract(doc, context=context)

    def parse_fromstring(self, s, parser=None, context=None):
        """
        Parse an HTML or XML document and
        return the extracted object following the Parsley rules given at instantiation.

        :param string s: an HTML or XML document as a string
        :param parser: *lxml.etree._FeedParser* instance (optional); defaults to lxml.etree.HTMLParser()
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python :class:`dict` object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`
        """

        if parser is None:
            parser = lxml.etree.HTMLParser()
        doc = lxml.etree.fromstring(s, parser=parser)
        return self.extract(doc, context=context)

    def compile(self):
        """
        Build the abstract Parsley tree starting from the root node
        (recursive)

        The result is stored in `self.parselet_tree`.
        Raises `ValueError` when `self.parselet` is not a dict.
        """
        if not isinstance(self.parselet, dict):
            raise ValueError(
                "Parselet must be a dict of some sort. Or use .from_jsonstring() or .from_jsonfile()")
        self.parselet_tree = self._compile(self.parselet)

    # raw strings: "\w" and "\(" are regex escapes, not string escapes
    VALID_KEY_CHARS = r"\w-"
    SUPPORTED_OPERATORS = "?"   # "!" not supported for now
    REGEX_PARSELET_KEY = re.compile(
        r"^(?P<key>[%(validkeychars)s]+)(?P<operator>[%(suppop)s])?(\((?P<scope>.+)\))?$" % {
            'validkeychars': VALID_KEY_CHARS,
            'suppop': SUPPORTED_OPERATORS}
        )

    def _compile(self, parselet_node, level=0):
        """
        Build part of the abstract Parsley extraction tree

        Arguments:
        parselet_node (dict) -- part of the Parsley tree to compile
                                (can be the root dict/node)
        level (int)          -- current recursion depth (used for debug)

        Returns a `ParsleyNode` for a dict node, or whatever
        `self.selector_handler.make()` returns for a string leaf.
        Raises `InvalidKeySyntax` for a key that does not match
        `REGEX_PARSELET_KEY` and `ValueError` for an unsupported node type.
        """

        # one leading space per recursion level, only used in debug output
        debug_offset = " " * level

        if self.DEBUG:
            print(debug_offset, "%s::compile(%s)" % (
                self.__class__.__name__, parselet_node))

        if isinstance(parselet_node, dict):
            parselet_tree = ParsleyNode()
            for k, v in parselet_node.items():

                # we parse the key raw elements but without much
                # interpretation (which is done by the SelectorHandler)
                try:
                    m = self.REGEX_PARSELET_KEY.match(k)
                except TypeError:
                    # a key the pattern cannot be applied to (e.g. not a
                    # string) is reported like any other invalid key
                    m = None
                if not m:
                    if self.DEBUG:
                        print(debug_offset, "could not parse key", k)
                    raise InvalidKeySyntax("Key %s is not valid" % k)

                key = m.group('key')
                # by default, fields are required
                key_required = True
                operator = m.group('operator')
                if operator == '?':
                    key_required = False
                # FIXME: "!" operator not supported (complete array)
                scope = m.group('scope')

                # example: get list of H3 tags
                # { "titles": ["h3"] }
                # FIXME: should we support multiple selectors in list?
                #        e.g. { "titles": ["h1", "h2", "h3", "h4"] }
                if isinstance(v, (list, tuple)):
                    # only the first element of the list is compiled
                    v = v[0]
                    iterate = True
                else:
                    iterate = False

                # keys in the abstract Parsley trees are of type `ParsleyContext`
                try:
                    parsley_context = ParsleyContext(
                        key,
                        operator=operator,
                        required=key_required,
                        scope=self.selector_handler.make(scope) if scope else None,
                        iterate=iterate)
                except SyntaxError:
                    if self.DEBUG:
                        print("Invalid scope:", k, scope)
                    raise

                if self.DEBUG:
                    print(debug_offset, "current context:", parsley_context)

                # go deeper in the Parsley tree...
                try:
                    child_tree = self._compile(v, level=level+1)
                except SyntaxError:
                    if self.DEBUG:
                        print("Invalid value: ", v)
                    raise

                if self.DEBUG:
                    print(debug_offset, "child tree:", child_tree)

                parselet_tree[parsley_context] = child_tree

            return parselet_tree

        # a string leaf should match some kind of selector,
        # let the selector handler deal with it
        elif isstr(parselet_node):
            return self.selector_handler.make(parselet_node)
        else:
            raise ValueError(
                "Unsupported type(%s) for Parselet node <%s>" % (
                    type(parselet_node), parselet_node))

    def extract(self, document, context=None):
        """
        Extract values as a dict object following the structure
        of the Parsley script (recursive)

        :param document: lxml-parsed document
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python *dict* object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`

        >>> import lxml.etree
        >>> import parslepy
        >>> html = '''
        ... <!DOCTYPE html>
        ... <html>
        ... <head>
        ...     <title>Sample document to test parslepy</title>
        ...     <meta http-equiv="content-type" content="text/html;charset=utf-8" />
        ... </head>
        ... <body>
        ... <h1 id="main">What&rsquo;s new</h1>
        ... <ul>
        ...     <li class="newsitem"><a href="/article-001.html">This is the first article</a></li>
        ...     <li class="newsitem"><a href="/article-002.html">A second report on something</a></li>
        ...     <li class="newsitem"><a href="/article-003.html">Python is great!</a> <span class="fresh">New!</span></li>
        ... </ul>
        ... </body>
        ... </html>
        ... '''
        >>> html_parser = lxml.etree.HTMLParser()
        >>> doc = lxml.etree.fromstring(html, parser=html_parser)
        >>> doc
        <Element html at 0x7f5fb1fce9b0>
        >>> rules = {
        ...     "headingcss": "#main",
        ...     "headingxpath": "//h1[@id='main']"
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> p.extract(doc)
        {'headingcss': u'What\u2019s new', 'headingxpath': u'What\u2019s new'}

        """
        # a truthy context is stored on the selector handler before extracting
        if context:
            self.selector_handler.context = context
        return self._extract(self.parselet_tree, document)

    def _extract(self, parselet_node, document, level=0):
        """
        Extract values at this document node level
        using the parselet_node instructions:
        - go deeper in tree
        - or call selector handler in case of a terminal selector leaf

        Returns a dict for a `ParsleyNode`, the selector handler's result
        for a `Selector` leaf, and None for any other node type.
        """

        # one leading space per recursion level, only used in debug output
        debug_offset = " " * level

        # we must go deeper in the Parsley tree
        if isinstance(parselet_node, ParsleyNode):

            # default output
            output = {}

            # process all children
            for ctx, v in parselet_node.items():
                if self.DEBUG:
                    print(debug_offset, "context:", ctx, v)
                extracted = None
                try:
                    # scoped-extraction:
                    # extraction should be done deeper in the document tree
                    if ctx.scope:
                        extracted = []
                        selected = self.selector_handler.select(document, ctx.scope)
                        if selected:
                            count = 0
                            for count, elem in enumerate(selected, start=1):
                                parse_result = self._extract(v, elem, level=level+1)

                                if isinstance(parse_result, (list, tuple)):
                                    extracted.extend(parse_result)
                                else:
                                    extracted.append(parse_result)

                                # if we're not in an array,
                                # we only care about the first iteration
                                if not ctx.iterate:
                                    break

                            if self.DEBUG:
                                print(debug_offset,
                                    "parsed %d elements in scope (%s)" % (count, ctx.scope))

                    # local extraction
                    else:
                        extracted = self._extract(v, document, level=level+1)

                except NonMatchingNonOptionalKey as e:
                    if self.DEBUG:
                        print(debug_offset, str(e))
                    # a missing nested key is tolerated for optional keys
                    # and in non-strict mode
                    if not ctx.required or not self.STRICT_MODE:
                        output[ctx.key] = {}
                    else:
                        raise
                except Exception as e:
                    if self.DEBUG:
                        print(str(e))
                    raise

                if not ctx.iterate and isinstance(extracted, list):
                    if not extracted:
                        # replace empty-list result when not looping by empty dict
                        extracted = {}
                    elif self.KEEP_ONLY_FIRST_ELEMENT_IF_LIST:
                        # keep only the first element if we're not in an array
                        if self.DEBUG:
                            print(debug_offset, "keep only 1st element")
                        extracted = extracted[0]

                # extraction for a required key gave nothing
                if (self.STRICT_MODE
                        and ctx.required
                        and extracted is None):
                    raise NonMatchingNonOptionalKey(
                        'key "%s" is required but yield nothing\nCurrent path: %s/(%s)\n' % (
                            ctx.key,
                            document.getroottree().getpath(document), v
                            )
                        )

                # special key to extract a selector-defined level deeper
                # but still output at same level
                # this can be useful for breaking up long selectors
                # or when you need to mix XPath and CSS selectors
                # e.g.
                # {
                #     "something(#content div.main)": {
                #         "--(.//div[re:test(@class, 'style\d{3,6}')])": {
                #             "title": "h1",
                #             "subtitle": "h2"
                #         }
                #     }
                # }
                #
                if ctx.key == self.SPECIAL_LEVEL_KEY:
                    if isinstance(extracted, dict):
                        output.update(extracted)
                    elif isinstance(extracted, list):
                        if extracted:
                            raise RuntimeError(
                                "could not merge non-empty list at higher level")
                        # empty list, dont bother?
                else:
                    # required keys are handled above;
                    # an optional key that gave nothing is left out of the output
                    if extracted is not None:
                        output[ctx.key] = extracted

            return output

        # a leaf/Selector node
        elif isinstance(parselet_node, Selector):
            return self.selector_handler.extract(document, parselet_node)

        else:
            # FIXME: can this happen?
            #        if selector handler returned None at compile time,
            #        probably yes
            return None

    def keys(self):
        """
        Return a list of 1st level keys of the output data model

        >>> import parslepy
        >>> rules = {
        ...     "headingcss": "#main",
        ...     "headingxpath": "//h1[@id='main']"
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> sorted(p.keys())
        ['headingcss', 'headingxpath']

        """
        return self._keys(self.parselet_tree)

    def _keys(self, parselet_node):
        """
        Collect the output keys defined directly under `parselet_node`.

        Children stored under `SPECIAL_LEVEL_KEY` contribute their own
        keys instead of the special key itself. Returns an empty list
        when `parselet_node` is not a `ParsleyNode`.
        """
        keys = []
        if isinstance(parselet_node, ParsleyNode):
            for ctx, v in parselet_node.items():
                if ctx.key == self.SPECIAL_LEVEL_KEY:
                    keys.extend(self._keys(v))
                else:
                    keys.append(ctx.key)
        return keys

# alias
Parslet = Parselet  # second module-level name bound to the same class


if __name__ == "__main__":
    # when executed as a script, run this module's doctests
    import doctest
    doctest.testmod()