380 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			380 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Python
		
	
	
	
# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data in chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""
 | 
						|
 | 
						|
__author__  = "Stéphane Bidoul <sbi@skynet.be>"
 | 
						|
__version__ = "0.3"
 | 
						|
 | 
						|
import sys
 | 
						|
import codecs
 | 
						|
 | 
						|
if sys.version_info[0] < 3:
 | 
						|
    __author__  = codecs.unicode_escape_decode(__author__)[0]
 | 
						|
 | 
						|
    StringTypes = (str, unicode)
 | 
						|
    # libxml2 returns strings as UTF8
 | 
						|
    _decoder = codecs.lookup("utf8")[1]
 | 
						|
    def _d(s):
 | 
						|
        if s is None:
 | 
						|
            return s
 | 
						|
        else:
 | 
						|
            return _decoder(s)[0]
 | 
						|
else:
 | 
						|
    StringTypes = str
 | 
						|
    # s is Unicode `str` already
 | 
						|
    def _d(s):
 | 
						|
        return s
 | 
						|
 | 
						|
from xml.sax._exceptions import *
 | 
						|
from xml.sax import xmlreader, saxutils
 | 
						|
from xml.sax.handler import \
 | 
						|
     feature_namespaces, \
 | 
						|
     feature_namespace_prefixes, \
 | 
						|
     feature_string_interning, \
 | 
						|
     feature_validation, \
 | 
						|
     feature_external_ges, \
 | 
						|
     feature_external_pes, \
 | 
						|
     property_lexical_handler, \
 | 
						|
     property_declaration_handler, \
 | 
						|
     property_dom_node, \
 | 
						|
     property_xml_string
 | 
						|
 | 
						|
try:
 | 
						|
    import libxml2
 | 
						|
except ImportError:
 | 
						|
    raise SAXReaderNotAvailable("libxml2 not available: " \
 | 
						|
                                "import error was: %s" % sys.exc_info()[1])
 | 
						|
 | 
						|
class Locator(xmlreader.Locator):
    """Expose a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # The wrapped libxml2 locator; only LineNumber() and BaseURI() are
        # ever called on it.
        self.__locator = locator

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        Always -1: the wrapped locator is never asked for a column.
        """
        return -1

    def getSystemId(self):
        """Return the system identifier for the current event.

        This is the base URI reported by the wrapped locator.
        """
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        """Return the public identifier for the current event (always None)."""
        return None
 | 
						|
 | 
						|
class LibXml2Reader(xmlreader.XMLReader):
 | 
						|
 | 
						|
    def __init__(self):
 | 
						|
        xmlreader.XMLReader.__init__(self)
 | 
						|
        # features
 | 
						|
        self.__ns = 0
 | 
						|
        self.__nspfx = 0
 | 
						|
        self.__validate = 0
 | 
						|
        self.__extparams = 1
 | 
						|
        # parsing flag
 | 
						|
        self.__parsing = 0
 | 
						|
        # additional handlers
 | 
						|
        self.__lex_handler = None
 | 
						|
        self.__decl_handler = None
 | 
						|
        # error messages accumulator
 | 
						|
        self.__errors = None
 | 
						|
 | 
						|
    def _errorHandler(self, arg, msg, severity, locator):
        """Queue one message reported by the libxml2 reader.

        Registered on the reader by parse(). The message is wrapped in a
        SAXParseException (with the libxml2 locator adapted to a SAX Locator)
        and stored with its severity until _reportErrors() dispatches it.
        *arg* is not used.
        """
        pending = self.__errors
        if pending is None:
            # first message since the accumulator was last emptied
            pending = self.__errors = []
        wrapped = SAXParseException(msg, None, Locator(locator))
        pending.append((severity, wrapped))
 | 
						|
 | 
						|
    def _reportErrors(self,fatal):
 | 
						|
        for severity,exception in self.__errors:
 | 
						|
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
 | 
						|
                            libxml2.PARSER_SEVERITY_WARNING):
 | 
						|
                self._err_handler.warning(exception)
 | 
						|
            else:
 | 
						|
                # when fatal is set, the parse will stop;
 | 
						|
                # we consider that the last error reported
 | 
						|
                # is the fatal one.
 | 
						|
                if fatal and exception is self.__errors[-1][1]:
 | 
						|
                    self._err_handler.fatalError(exception)
 | 
						|
                else:
 | 
						|
                    self._err_handler.error(exception)
 | 
						|
        self.__errors = None
 | 
						|
 | 
						|
    def parse(self, source):
        """Parse *source* with a libxml2 text reader and emit SAX events.

        source -- a string, handed to libxml2.newTextReaderFilename(), or any
                  object accepted by saxutils.prepare_input_source(), whose
                  byte stream is then read through a libxml2 input buffer.

        Content events go to the content handler, lexical events (CDATA,
        comments, entity boundaries) to the lexical handler when one is set,
        and libxml2 messages to the error handler. endDocument() is only
        sent when the end of the input was reached without a failed read.

        Raises SAXException for a node type this driver does not know, plus
        whatever the registered handlers raise.
        """
        self.__parsing = 1
        # Messages queued by an earlier parse that was aborted by an
        # exception must not be reported against this document.
        self.__errors = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            try:
                reader.SetErrorHandler(self._errorHandler, None)
                self._configureReader(reader)
                # start loop
                self._cont_handler.startDocument()
                if self._readEvents(reader):
                    self._cont_handler.endDocument()
            finally:
                # close the reader even when a handler raised
                reader.Close()
        finally:
            self.__parsing = 0

    def _configureReader(self, reader):
        """Set the DTD-loading and validation parser properties on *reader*."""
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

    def _readEvents(self, reader):
        """Read *reader* to exhaustion, dispatching one event per node.

        Returns True when the end of the input was reached and False when a
        read failed; in the latter case the failure has been reported through
        the error handler's fatalError().
        """
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributes = xmlreader.AttributesNSImpl({}, {})
        else:
            attributes = xmlreader.AttributesImpl({})
        # one list of declared prefixes per open element (for endPrefixMapping)
        prefixes = []
        while True:
            r = reader.Read()
            # check for errors
            if r == 1:
                if self.__errors is not None:
                    self._reportErrors(0)
            elif r == 0:
                if self.__errors is not None:
                    self._reportErrors(0)
                return True # end of parse
            else:
                if self.__errors is not None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(
                        SAXException("Read failed (no details available)"))
                return False # fatal parse error
            # get node type
            nodeType = reader.NodeType()
            # Element
            if nodeType == 1:
                if self.__ns:
                    self._startElementNS(reader, attributes, prefixes)
                else:
                    self._startElement(reader, attributes)
            # EndElement
            elif nodeType == 15:
                if self.__ns:
                    self._cont_handler.endElementNS(
                        (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                        _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            # Text (3) and SignificantWhitespace (14)
            elif nodeType == 3 or nodeType == 14:
                self._cont_handler.characters(_d(reader.Value()))
            # Whitespace
            elif nodeType == 13:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            # CDATA
            elif nodeType == 4:
                if self.__lex_handler is not None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if self.__lex_handler is not None:
                    self.__lex_handler.endCDATA()
            # EntityReference
            elif nodeType == 5:
                if self.__lex_handler is not None:
                    # entity boundaries are lexical events: they belong to
                    # the lexical handler, the reader has no such method
                    self.__lex_handler.startEntity(_d(reader.Name()))
                # NOTE(review): confirm the libxml2 binding exposes
                # ResolveEntity -- the module docstring lists it as awaited
                reader.ResolveEntity()
            # EndEntity
            elif nodeType == 16:
                if self.__lex_handler is not None:
                    self.__lex_handler.endEntity(_d(reader.Name()))
            # ProcessingInstruction
            elif nodeType == 7:
                self._cont_handler.processingInstruction(
                    _d(reader.Name()), _d(reader.Value()))
            # Comment
            elif nodeType == 8:
                if self.__lex_handler is not None:
                    self.__lex_handler.comment(_d(reader.Value()))
            # DocumentType
            elif nodeType == 10:
                pass # TODO (how to detect endDTD? on first non-dtd event?)
            # XmlDeclaration
            elif nodeType == 17:
                pass # TODO
            # Entity
            elif nodeType == 6:
                pass # TODO (entity decl)
            # Notation (decl)
            elif nodeType == 12:
                pass # TODO
            # Not handled on purpose: Attribute (2, never in this loop),
            # Document (9, not exposed), DocumentFragment (11, never returned
            # by XmlReader) and None (0).
            else:
                raise SAXException("Unexpected node type %d" % nodeType)

    def _startElementNS(self, reader, attributes, prefixes):
        """Emit the namespace-aware events for the element *reader* is on.

        Sends startPrefixMapping() for every namespace declaration, then
        startElementNS(); an empty element is closed right away, otherwise
        its declared prefixes are pushed on *prefixes* for the end tag.
        """
        eltName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        attributes._attrs = attrs = {}
        attributes._qnames = qnames = {}
        newPrefixes = []
        while reader.MoveToNextAttribute():
            qname = _d(reader.Name())
            value = _d(reader.Value())
            # Only "xmlns" and "xmlns:prefix" declare a namespace; a plain
            # attribute whose name merely begins with "xmlns" is not one.
            if qname == "xmlns" or qname.startswith("xmlns:"):
                if qname == "xmlns":
                    newPrefix = None
                else:
                    newPrefix = qname[6:]
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix, value)
                if not self.__nspfx:
                    continue # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName, eltQName, attributes)
        if reader.IsEmptyElement():
            self._cont_handler.endElementNS(eltName, eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _startElement(self, reader, attributes):
        """Emit startElement() (and endElement() if empty) without namespaces."""
        eltName = _d(reader.Name())
        attributes._attrs = attrs = {}
        while reader.MoveToNextAttribute():
            attName = _d(reader.Name())
            attrs[attName] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName, attributes)
        if reader.IsEmptyElement():
            self._cont_handler.endElement(eltName)
 | 
						|
 | 
						|
    def setDTDHandler(self, handler):
 | 
						|
        # TODO (when supported, the inherited method works just fine)
 | 
						|
        raise SAXNotSupportedException("DTDHandler not supported")
 | 
						|
 | 
						|
    def setEntityResolver(self, resolver):
 | 
						|
        # TODO (when supported, the inherited method works just fine)
 | 
						|
        raise SAXNotSupportedException("EntityResolver not supported")
 | 
						|
 | 
						|
    def getFeature(self, name):
 | 
						|
        if name == feature_namespaces:
 | 
						|
            return self.__ns
 | 
						|
        elif name == feature_namespace_prefixes:
 | 
						|
            return self.__nspfx
 | 
						|
        elif name == feature_validation:
 | 
						|
            return self.__validate
 | 
						|
        elif name == feature_external_ges:
 | 
						|
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
 | 
						|
        elif name == feature_external_pes:
 | 
						|
            return self.__extparams
 | 
						|
        else:
 | 
						|
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
 | 
						|
                                            name)
 | 
						|
 | 
						|
    def setFeature(self, name, state):
 | 
						|
        if self.__parsing:
 | 
						|
            raise SAXNotSupportedException("Cannot set feature %s " \
 | 
						|
                                           "while parsing" % name)
 | 
						|
        if name == feature_namespaces:
 | 
						|
            self.__ns = state
 | 
						|
        elif name == feature_namespace_prefixes:
 | 
						|
            self.__nspfx = state
 | 
						|
        elif name == feature_validation:
 | 
						|
            self.__validate = state
 | 
						|
        elif name == feature_external_ges:
 | 
						|
            if state == 0:
 | 
						|
                # TODO (does that relate to PARSER_LOADDTD)?
 | 
						|
                raise SAXNotSupportedException("Feature '%s' not supported" % \
 | 
						|
                                               name)
 | 
						|
        elif name == feature_external_pes:
 | 
						|
            self.__extparams = state
 | 
						|
        else:
 | 
						|
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
 | 
						|
                                            name)
 | 
						|
 | 
						|
    def getProperty(self, name):
 | 
						|
        if name == property_lexical_handler:
 | 
						|
            return self.__lex_handler
 | 
						|
        elif name == property_declaration_handler:
 | 
						|
            return self.__decl_handler
 | 
						|
        else:
 | 
						|
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
 | 
						|
                                            name)
 | 
						|
 | 
						|
    def setProperty(self, name, value):     
 | 
						|
        if name == property_lexical_handler:
 | 
						|
            self.__lex_handler = value
 | 
						|
        elif name == property_declaration_handler:
 | 
						|
            # TODO: remove if/when libxml2 supports dtd events
 | 
						|
            raise SAXNotSupportedException("Property '%s' not supported" % \
 | 
						|
                                           name)
 | 
						|
            self.__decl_handler = value
 | 
						|
        else:
 | 
						|
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
 | 
						|
                                            name)
 | 
						|
 | 
						|
def create_parser():
    """Return a new LibXml2Reader.

    This is the factory used when the module is named in
    xml.sax.make_parser(), as shown in the module's USAGE notes.
    """
    parser = LibXml2Reader()
    return parser
 | 
						|
 |