From 588864d409fbd1f25649ca0a956d2bc2c23d3c9f Mon Sep 17 00:00:00 2001
From: Martin von Gagern <Martin.vGagern@gmx.net>
Date: Thu, 16 Jun 2016 15:09:12 +0200
Subject: [PATCH] string_decoder: fix handling of malformed utf8

There have been problems with utf8 decoding in cases where the input
was invalid.  Some cases would give different results depending on
chunking, while others even led to exceptions.  This commit simplifies
the code in several ways, reducing the risk of breakage.

Most importantly, the `text` method is not only used for bulk
conversion of a single chunk, but also for the conversion of the mini
buffer `lastChar` while handling characters spanning chunks.  That
should make most of the problems due to incorrect handling of special
cases disappear.

Secondly, that `text` method is now independent of encoding.  The
encoding-dependent method `complete` now has a single well-defined
task: determine the buffer position up to which the input consists of
complete characters.  The actual conversion is handled in a central
location, leading to a cleaner and leaner internal interface.

Thirdly, we no longer try to remember just how many bytes we'll need
for the next complete character.  We simply try to fill up the
`nextChar` buffer and perform a conversion on that.  This reduces the
number of internal counter variables from two to one, namely `partial`
which indicates the number of bytes currently stored in `nextChar`.
A possible drawback of this approach is that there is chance of
degraded performance if input arrives one byte at a time and is from a
script using long utf8 sequences.  As a benefit, though, this allows us
to emit a U+FFFD replacement character sooner in cases where the first
byte of an utf8 sequence is not followed by the expected number of
continuation bytes.

Fixes: https://github.com/nodejs/node/issues/7308
---
 lib/string_decoder.js                | 221 ++++++++++-----------------
 test/parallel/test-string-decoder.js |  30 ++++
 2 files changed, 114 insertions(+), 137 deletions(-)

diff --git a/lib/string_decoder.js b/lib/string_decoder.js
index 6eb71efc07d803..54b4d9cee4818f 100644
--- a/lib/string_decoder.js
+++ b/lib/string_decoder.js
@@ -44,15 +44,15 @@ function StringDecoder(encoding) {
   var nb;
   switch (this.encoding) {
     case 'utf16le':
-      this.text = utf16Text;
-      this.end = utf16End;
+      this.complete = utf16Complete;
+      this.flush = simpleFlush;
       // fall through
     case 'utf8':
       nb = 4;
       break;
     case 'base64':
-      this.text = base64Text;
-      this.end = base64End;
+      this.complete = base64Complete;
+      this.flush = simpleFlush;
       nb = 3;
       break;
     default:
@@ -60,170 +60,117 @@ function StringDecoder(encoding) {
       this.end = simpleEnd;
       return;
   }
-  this.lastNeed = 0;
-  this.lastTotal = 0;
+  this.partial = 0;
   this.lastChar = Buffer.allocUnsafe(nb);
 }
 
 StringDecoder.prototype.write = function(buf) {
   if (buf.length === 0)
     return '';
-  var r;
-  var i;
-  if (this.lastNeed) {
-    r = this.fillLast(buf);
-    if (r === undefined)
-      return '';
-    i = this.lastNeed;
-    this.lastNeed = 0;
-  } else {
-    i = 0;
-  }
-  if (i < buf.length)
-    return (r ? r + this.text(buf, i) : this.text(buf, i));
-  return r || '';
+  const partial = this.partial;
+  if (!partial)
+    return this.text(buf, 0, buf.length);
+
+  // We have incomplete characters in partial many bytes from last run.
+  // Copy bytes from buf to fill lastChar (if there is enough input).
+  const newHeadLen = Math.min(buf.length, this.lastChar.length - partial);
+  const totalHeadLen = newHeadLen + partial;
+  buf.copy(this.lastChar, partial, 0, newHeadLen);
+  // Now we have totalHeadLen bytes of input in lastChar, try to convert that.
+  let r = this.text(this.lastChar, 0, totalHeadLen);
+  if (this.partial <= newHeadLen) // consumed at least all the old head
+    r += this.text(buf, newHeadLen - this.partial, buf.length);
+  return r;
 };
 
-StringDecoder.prototype.end = utf8End;
-
 // Returns only complete characters in a Buffer
-StringDecoder.prototype.text = utf8Text;
+StringDecoder.prototype.text = function(buf, start, end) {
+  if (start === end)
+    return '';
+  const complete = this.complete(buf, start, end);
+  this.partial = end - complete;
+  if (this.partial && buf !== this.lastChar)
+    buf.copy(this.lastChar, 0, complete, end);
+  if (start === complete)
+    return '';
+  return buf.toString(this.encoding, start, complete);
+};
 
-// Attempts to complete a partial character using bytes from a Buffer
-StringDecoder.prototype.fillLast = function(buf) {
-  if (this.lastNeed <= buf.length) {
-    buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
-    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
+// Returns a suitable representation of incomplete characters as well
+StringDecoder.prototype.end = function(buf) {
+  let r = (buf && buf.length ? this.write(buf) : '');
+  if (this.partial) {
+    r += this.flush();
+    this.partial = 0;
   }
-  buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
-  this.lastNeed -= buf.length;
+  return r;
 };
 
-// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
-// continuation byte.
-function utf8CheckByte(byte) {
-  if (byte <= 0x7F)
-    return 0;
-  else if (byte >> 5 === 0x06)
-    return 2;
-  else if (byte >> 4 === 0x0E)
-    return 3;
-  else if (byte >> 3 === 0x1E)
-    return 4;
-  return -1;
-}
+// Given (buf, start, end), determine the maximal n <= end such that
+// buf.slice(start, n) contains only complete characters
+StringDecoder.prototype.complete = utf8Complete;
+
+// Returns a string representation of the this.partial bytes in
+// this.lastChar which represent an incomplete character
+StringDecoder.prototype.flush = utf8Flush;
 
 // Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
-// character, returning the total number of bytes needed to complete the partial
-// character (if applicable).
-function utf8CheckIncomplete(self, buf, i) {
-  var j = buf.length - 1;
-  if (j < i)
-    return 0;
-  var nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
-  }
-  if (j < i)
-    return 0;
-  nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
-  }
-  if (j < i)
-    return 0;
-  nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
+// character, returning the position after the last complete character.
+function utf8Complete(buf, start, end) {
+  if (start > end - 3)
+    start = end - 3;
+  for (let i = end - 1; i >= start; --i) {
+    const byte = buf[i];
+    let numBytes;
+    if (byte >> 6 === 0x02)
+      continue; // continuation byte
+    else if (byte >> 5 === 0x06)
+      numBytes = 2;
+    else if (byte >> 4 === 0x0E)
+      numBytes = 3;
+    else if (byte >> 3 === 0x1E)
+      numBytes = 4;
+    else
+      numBytes = 1; // ASCII or invalid
+    if (i + numBytes > end) // incomplete
+      return i; // continue next run at leading byte
+    // Have complete sequence, possibly followed by garbage continuation.
+    return end;
   }
-  return 0;
-}
-
-// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
-// partial character, the character's bytes are buffered until the required
-// number of bytes are available.
-function utf8Text(buf, i) {
-  const total = utf8CheckIncomplete(this, buf, i);
-  if (!this.lastNeed)
-    return buf.toString('utf8', i);
-  this.lastTotal = total;
-  const end = buf.length - (total - this.lastNeed);
-  buf.copy(this.lastChar, 0, end);
-  return buf.toString('utf8', i, end);
+  // Ends in valid 4-byte sequence or invalid continuation characters.
+  // Either way the input is complete, so convert it as is.
+  return end;
 }
 
 // For UTF-8, a replacement character for each buffered byte of a (partial)
 // character needs to be added to the output.
-function utf8End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed)
-    return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
-  return r;
+function utf8Flush() {
+  return '\ufffd'.repeat(this.partial);
 }
 
 // UTF-16LE typically needs two bytes per character, but even if we have an even
 // number of bytes available, we need to check if we end on a leading/high
 // surrogate. In that case, we need to wait for the next two bytes in order to
 // decode the last character properly.
-function utf16Text(buf, i) {
-  if ((buf.length - i) % 2 === 0) {
-    const r = buf.toString('utf16le', i);
-    if (r) {
-      const c = r.charCodeAt(r.length - 1);
-      if (c >= 0xD800 && c <= 0xDBFF) {
-        this.lastNeed = 2;
-        this.lastTotal = 4;
-        this.lastChar[0] = buf[buf.length - 2];
-        this.lastChar[1] = buf[buf.length - 1];
-        return r.slice(0, -1);
-      }
-    }
-    return r;
+function utf16Complete(buf, start, end) {
+  if ((end - start) & 1)
+    --end;
+  if (end > start) {
+    const byte = buf[end - 1];
+    if (byte >= 0xD8 && byte <= 0xDB)
+      return end - 2;
   }
-  this.lastNeed = 1;
-  this.lastTotal = 2;
-  this.lastChar[0] = buf[buf.length - 1];
-  return buf.toString('utf16le', i, buf.length - 1);
+  return end;
 }
 
-// For UTF-16LE we do not explicitly append special replacement characters if we
-// end on a partial character, we simply let v8 handle that.
-function utf16End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed) {
-    const end = this.lastTotal - this.lastNeed;
-    return r + this.lastChar.toString('utf16le', 0, end);
-  }
-  return r;
+function base64Complete(buf, start, end) {
+  return end - (end - start) % 3;
 }
 
-function base64Text(buf, i) {
-  const n = (buf.length - i) % 3;
-  if (n === 0)
-    return buf.toString('base64', i);
-  this.lastNeed = 3 - n;
-  this.lastTotal = 3;
-  if (n === 1) {
-    this.lastChar[0] = buf[buf.length - 1];
-  } else {
-    this.lastChar[0] = buf[buf.length - 2];
-    this.lastChar[1] = buf[buf.length - 1];
-  }
-  return buf.toString('base64', i, buf.length - n);
-}
-
-
-function base64End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed)
-    return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
-  return r;
+// For UTF-16LE and Base64 we do not explicitly append special replacement
+// characters if we end on a partial character, we simply let v8 handle that.
+function simpleFlush() {
+  return this.lastChar.toString(this.encoding, 0, this.partial);
 }
 
 // Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js
index 14933c46fcd888..c01639f54b28d1 100644
--- a/test/parallel/test-string-decoder.js
+++ b/test/parallel/test-string-decoder.js
@@ -28,6 +28,30 @@ test(
   '\u02e4\u0064\u12e4\u0030\u3045'
 );
 
+// Some invalid input, known to have caused trouble with chunking
+// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923
+// 00: |00000000 ASCII
+// 41: |01000001 ASCII
+// B8: 10|111000 continuation
+// CC: 110|01100 two-byte head
+// E2: 1110|0010 three-byte head
+// F0: 11110|000 four-byte head
+// F1: 11110|001'another four-byte head
+// FB: 111110|11 "five-byte head", not UTF-8
+test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
+test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
+test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
+test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
+test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
+test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
+test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
+test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
+test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
+     '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
+test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
+
 // UCS-2
 test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
 
@@ -58,6 +82,11 @@ decoder = new StringDecoder('utf8');
 assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
 assert.strictEqual(decoder.end(), '\ufffd');
 
+decoder = new StringDecoder('utf8');
+assert.strictEqual(decoder.write(Buffer.from('f1', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('41f2', 'hex')), '\ufffdA');
+assert.strictEqual(decoder.end(), '\ufffd');
+
 
 // Additional UTF-16LE surrogate pair tests
 decoder = new StringDecoder('utf16le');
@@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) {
     sequence.forEach(function(write) {
       output += decoder.write(input.slice(write[0], write[1]));
     });
+    output += decoder.end();
     process.stdout.write('.');
     if (output !== expected) {
       var message =