html5lib · lastorset · Jun 6, 2014
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -32,3 +32,4 @@ Patches and suggestions
 - Juan Carlos Garcia Segovia
 - Mike West
 - Marc DM
+- Leif Arne Storset
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -7,6 +7,8 @@ Change Log
 Released on XXX, 2014
 
 * XXX
+* Fix #96: replace invalid characters from "Preprocessing the input stream" with
+  U+FFFD, preventing problems in lxml.
 
 
 0.999

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
@@ -270,6 +270,7 @@ def readChunk(self, chunkSize=None):
         # Replace invalid characters
         # Note U+0000 is dealt with in the tokenizer
         data = self.replaceCharactersRegexp.sub("\ufffd", data)
+        data = invalid_unicode_re.sub("\ufffd", data)
 
         data = data.replace("\r\n", "\n")
         data = data.replace("\r", "\n")
-Original file line number
+Diff line change
@@ -7,6 +7,8 @@ Change Log
     Released on XXX, 2014
     * XXX
+    * Fix #96: replace invalid characters from "Preprocessing the input stream" with
+      U+FFFD, preventing problems in lxml.
 0.999
@@ Expand Down @@