From 603440e0857a9d7cb7fcc8e5e0b117017cb7f7ad Mon Sep 17 00:00:00 2001
From: Leif Arne Storset <lstorset@opera.com>
Date: Fri, 6 Jun 2014 15:04:55 +0200
Subject: [PATCH] Replace invalid characters with U+FFFD (fixes #96)

---
 AUTHORS.rst             | 1 +
 CHANGES.rst             | 2 ++
 html5lib/inputstream.py | 1 +
 3 files changed, 4 insertions(+)

diff --git a/AUTHORS.rst b/AUTHORS.rst
index 4148a6ed..903e6de2 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -32,3 +32,4 @@ Patches and suggestions
 - Juan Carlos Garcia Segovia
 - Mike West
 - Marc DM
+- Leif Arne Storset
diff --git a/CHANGES.rst b/CHANGES.rst
index 1431b3c9..ba02fde3 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -7,6 +7,8 @@ Change Log
 Released on XXX, 2014
 
 * XXX
+* Fix #96: replace the characters that the HTML spec's "Preprocessing the input
+  stream" section marks as invalid with U+FFFD, preventing problems in lxml.
 
 
 0.999
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 9e03b931..4c8f4774 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -270,6 +270,7 @@ def readChunk(self, chunkSize=None):
         # Replace invalid characters
         # Note U+0000 is dealt with in the tokenizer
         data = self.replaceCharactersRegexp.sub("\ufffd", data)
+        data = invalid_unicode_re.sub("\ufffd", data)
 
         data = data.replace("\r\n", "\n")
         data = data.replace("\r", "\n")