Source code for xmltodict

#!/usr/bin/env python
"Makes working with XML feel like you are working with JSON"

from xml.parsers import expat
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
try:  # pragma no cover
    from cStringIO import StringIO
except ImportError:  # pragma no cover
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
try:  # pragma no cover
    from collections import OrderedDict
except ImportError:  # pragma no cover
    try:
        from ordereddict import OrderedDict
    except ImportError:
        OrderedDict = dict

try:  # pragma no cover
    _basestring = basestring
except NameError:  # pragma no cover
    _basestring = str
try:  # pragma no cover
    _unicode = unicode
except NameError:  # pragma no cover
    _unicode = str

__author__ = 'Martin Blech'
__version__ = '0.9.0'
__license__ = 'MIT'


class ParsingInterrupted(Exception):
    pass


class _DictSAXHandler(object):
    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=OrderedDict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None):
        self.path = []
        self.stack = []
        self.data = None
        self.item = None
        self.item_depth = item_depth
        self.xml_attribs = xml_attribs
        self.item_callback = item_callback
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces

    def _build_name(self, full_name):
        if not self.namespaces:
            return full_name
        i = full_name.rfind(self.namespace_separator)
        if i == -1:
            return full_name
        namespace, name = full_name[:i], full_name[i + 1:]
        short_namespace = self.namespaces.get(namespace, namespace)
        if not short_namespace:
            return name
        else:
            return self.namespace_separator.join((short_namespace, name))

    def _attrs_to_dict(self, attrs):
        if isinstance(attrs, dict):
            return attrs
        return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))

    def startElement(self, full_name, attrs):
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        self.path.append((name, attrs or None))
        if len(self.path) > self.item_depth:
            self.stack.append((self.item, self.data))
            if self.xml_attribs:
                attrs = self.dict_constructor(
                    (self.attr_prefix + key, value)
                    for (key, value) in attrs.items())
            else:
                attrs = None
            self.item = attrs or None
            self.data = None

    def endElement(self, full_name):
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            item = self.item
            if item is None:
                item = self.data
            should_continue = self.item_callback(self.path, item)
            if not should_continue:
                raise ParsingInterrupted()
        if len(self.stack):
            item, data = self.item, self.data
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and data is not None:
                data = data.strip() or None
            if data and self.force_cdata and item is None:
                item = self.dict_constructor()
            if item is not None:
                if data:
                    self.push_data(item, self.cdata_key, data)
                self.item = self.push_data(self.item, name, item)
            else:
                self.item = self.push_data(self.item, name, data)
        else:
            self.item = self.data = None
        self.path.pop()

    def characters(self, data):
        if not self.data:
            self.data = data
        else:
            self.data += self.cdata_separator + data

    def push_data(self, item, key, data):
        if self.postprocessor is not None:
            result = self.postprocessor(self.path, key, data)
            if result is None:
                return item
            key, data = result
        if item is None:
            item = self.dict_constructor()
        try:
            value = item[key]
            if isinstance(value, list):
                value.append(data)
            else:
                item[key] = [value, data]
        except KeyError:
            item[key] = data
        return item


[docs]def parse(xml_input, encoding=None, expat=expat, process_namespaces=False, namespace_separator=':', **kwargs): """Parse the given XML input and convert it into a dictionary. `xml_input` can either be a `string` or a file-like object. If `xml_attribs` is `True`, element attributes are put in the dictionary among regular child elements, using `@` as a prefix to avoid collisions. If set to `False`, they are just ignored. Simple example:: >>> import xmltodict >>> doc = xmltodict.parse(\"\"\" ... <a prop="x"> ... <b>1</b> ... <b>2</b> ... </a> ... \"\"\") >>> doc['a']['@prop'] u'x' >>> doc['a']['b'] [u'1', u'2'] If `item_depth` is `0`, the function returns a dictionary for the root element (default behavior). Otherwise, it calls `item_callback` every time an item at the specified depth is found and returns `None` in the end (streaming mode). The callback function receives two parameters: the `path` from the document root to the item (name-attribs pairs), and the `item` (dict). If the callback's return value is false-ish, parsing will be stopped with the :class:`ParsingInterrupted` exception. Streaming example:: >>> def handle(path, item): ... print 'path:%s item:%s' % (path, item) ... return True ... >>> xmltodict.parse(\"\"\" ... <a prop="x"> ... <b>1</b> ... <b>2</b> ... </a>\"\"\", item_depth=2, item_callback=handle) path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1 path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2 The optional argument `postprocessor` is a function that takes `path`, `key` and `value` as positional arguments and returns a new `(key, value)` pair where both `key` and `value` may have changed. Usage example:: >>> def postprocessor(path, key, value): ... try: ... return key + ':int', int(value) ... except (ValueError, TypeError): ... return key, value >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>', ... postprocessor=postprocessor) OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))]) You can pass an alternate version of `expat` (such as `defusedexpat`) by using the `expat` parameter. E.g: >>> import defusedexpat >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat) OrderedDict([(u'a', u'hello')]) """ handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs) if isinstance(xml_input, _unicode): if not encoding: encoding = 'utf-8' xml_input = xml_input.encode(encoding) if not process_namespaces: namespace_separator = None parser = expat.ParserCreate( encoding, namespace_separator ) try: parser.ordered_attributes = True except AttributeError: # Jython's expat does not support ordered_attributes pass parser.StartElementHandler = handler.startElement parser.EndElementHandler = handler.endElement parser.CharacterDataHandler = handler.characters parser.buffer_text = True try: parser.ParseFile(xml_input) except (TypeError, AttributeError): parser.Parse(xml_input, True) return handler.item
def _emit(key, value, content_handler, attr_prefix='@', cdata_key='#text', depth=0, preprocessor=None, pretty=False, newl='\n', indent='\t'): if preprocessor is not None: result = preprocessor(key, value) if result is None: return key, value = result if not isinstance(value, (list, tuple)): value = [value] if depth == 0 and len(value) > 1: raise ValueError('document with multiple roots') for v in value: if v is None: v = OrderedDict() elif not isinstance(v, dict): v = _unicode(v) if isinstance(v, _basestring): v = OrderedDict(((cdata_key, v),)) cdata = None attrs = OrderedDict() children = [] for ik, iv in v.items(): if ik == cdata_key: cdata = iv continue if ik.startswith(attr_prefix): attrs[ik[len(attr_prefix):]] = iv continue children.append((ik, iv)) if pretty: content_handler.ignorableWhitespace(depth * indent) content_handler.startElement(key, AttributesImpl(attrs)) if pretty and children: content_handler.ignorableWhitespace(newl) for child_key, child_value in children: _emit(child_key, child_value, content_handler, attr_prefix, cdata_key, depth + 1, preprocessor, pretty, newl, indent) if cdata is not None: content_handler.characters(cdata) if pretty and children: content_handler.ignorableWhitespace(depth * indent) content_handler.endElement(key) if pretty and depth: content_handler.ignorableWhitespace(newl)
[docs]def unparse(input_dict, output=None, encoding='utf-8', full_document=True, **kwargs): """Emit an XML document for the given `input_dict` (reverse of `parse`). The resulting XML document is returned as a string, but if `output` (a file-like object) is specified, it is written there instead. Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted as XML node attributes, whereas keys equal to `cdata_key` (default=`'#text'\`) are treated as character data. The `pretty` parameter (default=`False`) enables pretty-printing. In this mode, lines are terminated with `'\\n'` and indented with `'\\t'`, but this can be customized with the `newl` and `indent` parameters. """ ((key, value),) = input_dict.items() must_return = False if output is None: output = StringIO() must_return = True content_handler = XMLGenerator(output, encoding) if full_document: content_handler.startDocument() _emit(key, value, content_handler, **kwargs) if full_document: content_handler.endDocument() if must_return: value = output.getvalue() try: # pragma no cover value = value.decode(encoding) except AttributeError: # pragma no cover pass return value
if __name__ == '__main__': # pragma: no cover import sys import marshal (item_depth,) = sys.argv[1:] item_depth = int(item_depth) def handle_item(path, item): marshal.dump((path, item), sys.stdout) return True try: root = parse(sys.stdin, item_depth=item_depth, item_callback=handle_item, dict_constructor=dict) if item_depth == 0: handle_item([], root) except KeyboardInterrupt: pass