# Source code for pytan.xml_clean

#!/usr/bin/env python
# -*- mode: Python; tab-width: 4; indent-tabs-mode: nil; -*-
# ex: set tabstop=4
# Please do not change the two lines above. See PEP 8, PEP 263.
"""This is a regex based XML cleaner that will replace unsupported characters"""
import sys
import re
import logging

# disable python from creating .pyc files everywhere
sys.dont_write_bytecode = True

mylog = logging.getLogger("XMLCleaner")

try:
    # Python 2: unichr() turns a code point into a unicode character
    _unichr = unichr
except NameError:
    # Python 3: unichr() is gone and chr() already returns unicode text
    _unichr = chr

XML_1_0_VALID_HEX = [
    [0x0009],  # TAB
    [0x000A],  # LINEFEED
    [0x000D],  # CARRIAGE RETURN
    [0x0020, 0xD7FF],  # VALID CHARACTER RANGE 1
    [0xE000, 0xFFFD],  # VALID CHARACTER RANGE 2
]
"""Valid Unicode characters for XML documents:
    (any Unicode character, excluding the surrogate blocks, FFFE, and FFFF)
    #x9,
    #xA,
    #xD,
    [#x20-#xD7FF],
    [#xE000-#xFFFD],
    [#x10000-#x10FFFF]

Each entry is either a single code point or an inclusive [first, last] range.
The [#x10000-#x10FFFF] range is only added below when this python build can
represent it.

Source: http://www.w3.org/TR/REC-xml/#NT-Char
"""

XML_1_0_RESTRICTED_HEX = [
    [0x007F, 0x0084],  # one C0 control character and all but one C1 control
    [0x0086, 0x009F],  # one C0 control character and all but one C1 control
    [0xFDD0, 0xFDEF],  # control characters/permanently assigned to non-characters
]
"""Restricted/discouraged Unicode characters for XML documents:
    [#x7F-#x84],
    [#x86-#x9F],
    [#xFDD0-#xFDEF],
    [#x1FFFE-#x1FFFF],
    [#x2FFFE-#x2FFFF],
    [#x3FFFE-#x3FFFF],
    [#x4FFFE-#x4FFFF],
    [#x5FFFE-#x5FFFF],
    [#x6FFFE-#x6FFFF],
    [#x7FFFE-#x7FFFF],
    [#x8FFFE-#x8FFFF],
    [#x9FFFE-#x9FFFF],
    [#xAFFFE-#xAFFFF],
    [#xBFFFE-#xBFFFF],
    [#xCFFFE-#xCFFFF],
    [#xDFFFE-#xDFFFF],
    [#xEFFFE-#xEFFFF],
    [#xFFFFE-#xFFFFF],
    [#x10FFFE-#x10FFFF]

Each entry is an inclusive [first, last] range. The ranges above #xFFFF are
only added below when this python build can represent them.

Source: http://www.w3.org/TR/REC-xml/#NT-Char
"""

# If this python build supports unicode ranges above 10000, add to the valid range
if sys.maxunicode > 0x10000:
    XML_1_0_VALID_HEX.append([0x10000, min(sys.maxunicode, 0x10FFFF)])

# Add control characters and non-characters to the restricted range if this python
# build supports the applicable range: the last two code points (xFFFE and xFFFF)
# of each of the planes 1 through 16
for _plane in range(1, 17):
    _plane_last = (_plane << 16) | 0xFFFF
    if sys.maxunicode < _plane_last:
        continue
    XML_1_0_RESTRICTED_HEX.append([_plane_last - 1, _plane_last])

# Each entry becomes either a single character or a "first-last" character class range
XML_1_0_VALID_UNI = [u'-'.join([_unichr(y) for y in x]) for x in XML_1_0_VALID_HEX]
INVALID_UNICODE_RAW_RE = u'[^{}]'.format(u''.join(XML_1_0_VALID_UNI))
"""The raw regex string to use when replacing invalid characters"""

INVALID_UNICODE_RE = re.compile(INVALID_UNICODE_RAW_RE, re.U)
"""The regex object to use when replacing invalid characters"""

XML_1_0_RESTRICTED_UNI = [u'-'.join([_unichr(y) for y in x]) for x in XML_1_0_RESTRICTED_HEX]
RESTRICTED_UNICODE_RAW_RE = u'[{}]'.format(u''.join(XML_1_0_RESTRICTED_UNI))
"""The raw regex string to use when replacing restricted characters"""

RESTRICTED_UNICODE_RE = re.compile(RESTRICTED_UNICODE_RAW_RE, re.U)
"""The regex object to use when replacing restricted characters"""

DEFAULT_REPLACEMENT = u'\uFFFD'
"""The default character to use when replacing characters"""


def replace_invalid_unicode(text, replacement=None):
    """Substitutes `replacement` for every invalid unicode character in `text`

    Parameters
    ----------
    text : str
        * the text to clean
    replacement : str, optional
        * default: None
        * what each invalid character gets replaced with
        * DEFAULT_REPLACEMENT is used when this is None

    Returns
    -------
    str, cnt, RE : tuple
        * str : `text` with the invalid characters replaced
        * cnt : how many replacements were made
        * RE : the regex object that performed the replacements
    """
    substitute = DEFAULT_REPLACEMENT if replacement is None else replacement
    # subn() hands back (new_text, count); tack the regex on as the third item
    return INVALID_UNICODE_RE.subn(substitute, text) + (INVALID_UNICODE_RE,)
def replace_restricted_unicode(text, replacement=None):
    """Substitutes `replacement` for every restricted unicode character in `text`

    Parameters
    ----------
    text : str
        * the text to clean
    replacement : str, optional
        * default: None
        * what each restricted character gets replaced with
        * DEFAULT_REPLACEMENT is used when this is None

    Returns
    -------
    str, cnt, RE : tuple
        * str : `text` with the restricted characters replaced
        * cnt : how many replacements were made
        * RE : the regex object that performed the replacements
    """
    substitute = DEFAULT_REPLACEMENT if replacement is None else replacement
    # subn() hands back (new_text, count); tack the regex on as the third item
    return RESTRICTED_UNICODE_RE.subn(substitute, text) + (RESTRICTED_UNICODE_RE,)
def xml_cleaner(s, encoding='utf-8', clean_restricted=True, log_clean_messages=True,
                log_bad_characters=False, replacement=None, **kwargs):
    """Removes invalid / restricted characters per XML 1.0 spec

    Parameters
    ----------
    s : str
        * str to clean
        * a byte string is decoded into unicode before cleaning
    encoding : str, optional
        * default: 'utf-8'
        * encoding of `s`, only used when `s` is a byte string
        * if `s` can not be decoded with it, latin1 is used instead
    clean_restricted : bool, optional
        * default: True
        * remove restricted characters from `s` or not
    log_clean_messages : bool, optional
        * default: True
        * log messages using python logging or not
    log_bad_characters : bool, optional
        * default: False
        * log bad character matches or not
        * only has an effect when `log_clean_messages` is also True
    replacement : str, optional
        * default: None
        * passed to replace_invalid_unicode and replace_restricted_unicode as the
          text that bad characters are replaced with
    **kwargs
        * accepted for call compatibility, not used

    Returns
    -------
    str
        * the cleaned, unicode version of `s`
    """
    if isinstance(s, bytes):
        try:
            # 'xmlcharrefreplace' only exists for encoding, so decode strictly and
            # deal with a failure explicitly
            s = s.decode(encoding, 'strict')
        except (LookupError, TypeError, ValueError):
            # ValueError covers UnicodeDecodeError; LookupError/TypeError cover an
            # unknown or non-string `encoding`
            if log_clean_messages:
                mylog.warning(
                    "Falling back to latin1 for decoding, unable to decode as %s!", encoding
                )
            # latin1 maps every possible byte to a character, so this can not fail
            s = s.decode('latin1')

    # round trip through utf-8; characters that can not be encoded are turned into
    # XML character references by the 'xmlcharrefreplace' handler
    utf8_bytes = s.encode('utf-8', 'xmlcharrefreplace')
    text = utf8_bytes.decode('utf-8')

    # replace any invalid unicode characters
    cleaned, invalid_cnt, invalid_re = replace_invalid_unicode(
        text=text, replacement=replacement
    )

    if log_clean_messages:
        if invalid_cnt:
            mylog.warning(
                "Replaced %s invalid characters that match regex %r",
                invalid_cnt, invalid_re.pattern,
            )
            if log_bad_characters:
                # search the text as it was before replacement to show what was hit
                mylog.debug("Invalid characters found: %r", invalid_re.findall(text))
        else:
            mylog.debug(
                "No invalid characters found that match regex %r", invalid_re.pattern
            )

    if not clean_restricted:
        return cleaned

    # replace any restricted unicode characters
    final, restricted_cnt, restricted_re = replace_restricted_unicode(
        text=cleaned, replacement=replacement
    )

    if log_clean_messages:
        if restricted_cnt:
            mylog.warning(
                "Replaced %s restricted characters that match the regex %r",
                restricted_cnt, restricted_re.pattern,
            )
            if log_bad_characters:
                # search the text as it was before this pass to show what was hit
                mylog.debug(
                    "Restricted characters found: %r", restricted_re.findall(cleaned)
                )
        else:
            mylog.debug(
                "No restricted characters found that match regex %r", restricted_re.pattern
            )

    return final