diff --git a/AUTHORS.rst b/AUTHORS.rst index 4148a6ed..903e6de2 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,3 +32,4 @@ Patches and suggestions - Juan Carlos Garcia Segovia - Mike West - Marc DM +- Leif Arne Storset diff --git a/CHANGES.rst b/CHANGES.rst index 1431b3c9..ba02fde3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,8 @@ Change Log Released on XXX, 2014 * XXX +* Fix #96: replace the invalid characters listed in the HTML spec section + "Preprocessing the input stream" with U+FFFD, preventing problems in lxml. 0.999 diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..4c8f4774 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -270,6 +270,7 @@ def readChunk(self, chunkSize=None): # Replace invalid characters # Note U+0000 is dealt with in the tokenizer data = self.replaceCharactersRegexp.sub("\ufffd", data) + data = invalid_unicode_re.sub("\ufffd", data) data = data.replace("\r\n", "\n") data = data.replace("\r", "\n")