From 603440e0857a9d7cb7fcc8e5e0b117017cb7f7ad Mon Sep 17 00:00:00 2001 From: Leif Arne Storset Date: Fri, 6 Jun 2014 15:04:55 +0200 Subject: [PATCH] Replace invalid characters with U+FFFD (fixes #96) --- AUTHORS.rst | 1 + CHANGES.rst | 2 ++ html5lib/inputstream.py | 1 + 3 files changed, 4 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 4148a6ed..903e6de2 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,3 +32,4 @@ Patches and suggestions - Juan Carlos Garcia Segovia - Mike West - Marc DM +- Leif Arne Storset diff --git a/CHANGES.rst b/CHANGES.rst index 1431b3c9..ba02fde3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,8 @@ Change Log Released on XXX, 2014 * XXX +* Fix #96: replace invalid characters from "Preprocessing the input stream" with + U+FFFD, preventing problems in lxml. 0.999 diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..4c8f4774 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -270,6 +270,7 @@ def readChunk(self, chunkSize=None): # Replace invalid characters # Note U+0000 is dealt with in the tokenizer data = self.replaceCharactersRegexp.sub("\ufffd", data) + data = invalid_unicode_re.sub("\ufffd", data) data = data.replace("\r\n", "\n") data = data.replace("\r", "\n")