nodejs · gagern · Jun 16, 2016
diff --git a/lib/string_decoder.js b/lib/string_decoder.js
@@ -44,186 +44,133 @@ function StringDecoder(encoding) {
   var nb;
   switch (this.encoding) {
     case 'utf16le':
-      this.text = utf16Text;
-      this.end = utf16End;
+      this.complete = utf16Complete;
+      this.flush = simpleFlush;
       // fall through
     case 'utf8':
       nb = 4;
       break;
     case 'base64':
-      this.text = base64Text;
-      this.end = base64End;
+      this.complete = base64Complete;
+      this.flush = simpleFlush;
       nb = 3;
       break;
     default:
       this.write = simpleWrite;
       this.end = simpleEnd;
       return;
   }
-  this.lastNeed = 0;
-  this.lastTotal = 0;
+  this.partial = 0;
   this.lastChar = Buffer.allocUnsafe(nb);
 }
 
 StringDecoder.prototype.write = function(buf) {
   if (buf.length === 0)
     return '';
-  var r;
-  var i;
-  if (this.lastNeed) {
-    r = this.fillLast(buf);
-    if (r === undefined)
-      return '';
-    i = this.lastNeed;
-    this.lastNeed = 0;
-  } else {
-    i = 0;
-  }
-  if (i < buf.length)
-    return (r ? r + this.text(buf, i) : this.text(buf, i));
-  return r || '';
+  const partial = this.partial;
+  if (!partial)
+    return this.text(buf, 0, buf.length);
+
+  // We have incomplete characters in partial many bytes from last run.
+  // Copy bytes from buf to fill lastChar (if there is enough input).
+  const newHeadLen = Math.min(buf.length, this.lastChar.length - partial);
+  const totalHeadLen = newHeadLen + partial;
+  buf.copy(this.lastChar, partial, 0, newHeadLen);
+  // Now we have totalHeadLen bytes of input in lastChar, try to convert that.
+  let r = this.text(this.lastChar, 0, totalHeadLen);
+  if (this.partial <= newHeadLen) // consumed at least all the old head
+    r += this.text(buf, newHeadLen - this.partial, buf.length);
+  return r;
 };
 
-StringDecoder.prototype.end = utf8End;
-
 // Returns only complete characters in a Buffer
-StringDecoder.prototype.text = utf8Text;
+StringDecoder.prototype.text = function(buf, start, end) {
+  if (start === end)
+    return '';
+  const complete = this.complete(buf, start, end);
+  this.partial = end - complete;
+  if (this.partial && buf !== this.lastChar)
+    buf.copy(this.lastChar, 0, complete, end);
+  if (start === complete)
+    return '';
+  return buf.toString(this.encoding, start, complete);
+};
 
-// Attempts to complete a partial character using bytes from a Buffer
-StringDecoder.prototype.fillLast = function(buf) {
-  if (this.lastNeed <= buf.length) {
-    buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
-    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
+// Returns a suitable representation of incomplete characters as well
+StringDecoder.prototype.end = function(buf) {
+  let r = (buf && buf.length ? this.write(buf) : '');
+  if (this.partial) {
+    r += this.flush();
+    this.partial = 0;
   }
-  buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
-  this.lastNeed -= buf.length;
+  return r;
 };
 
-// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
-// continuation byte.
-function utf8CheckByte(byte) {
-  if (byte <= 0x7F)
-    return 0;
-  else if (byte >> 5 === 0x06)
-    return 2;
-  else if (byte >> 4 === 0x0E)
-    return 3;
-  else if (byte >> 3 === 0x1E)
-    return 4;
-  return -1;
-}
+// Given (buf, start, end), determine the maximal n <= end such that
+// buf.slice(start, n) contains only complete characters
+StringDecoder.prototype.complete = utf8Complete;
+
+// Returns a string representation of the this.partial bytes in
+// this.lastChar which represent an incomplete character
+StringDecoder.prototype.flush = utf8Flush;
 
 // Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
-// character, returning the total number of bytes needed to complete the partial
-// character (if applicable).
-function utf8CheckIncomplete(self, buf, i) {
-  var j = buf.length - 1;
-  if (j < i)
-    return 0;
-  var nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
-  }
-  if (j < i)
-    return 0;
-  nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
-  }
-  if (j < i)
-    return 0;
-  nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
+// character, returning the position after the last complete character.
+function utf8Complete(buf, start, end) {
+  if (start > end - 3)
+    start = end - 3;
+  for (let i = end - 1; i >= start; --i) {
+    const byte = buf[i];
+    let numBytes;
+    if (byte >> 6 === 0x02)
+      continue; // continuation byte
+    else if (byte >> 5 === 0x06)
+      numBytes = 2;
+    else if (byte >> 4 === 0x0E)
+      numBytes = 3;
+    else if (byte >> 3 === 0x1E)
+      numBytes = 4;
+    else
+      numBytes = 1; // ASCII or invalid
+    if (i + numBytes > end) // incomplete
+      return i; // continue next run at leading byte
+    // Have complete sequence, possibly followed by garbage continuation.
+    return end;
   }
-  return 0;
-}
-
-// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
-// partial character, the character's bytes are buffered until the required
-// number of bytes are available.
-function utf8Text(buf, i) {
-  const total = utf8CheckIncomplete(this, buf, i);
-  if (!this.lastNeed)
-    return buf.toString('utf8', i);
-  this.lastTotal = total;
-  const end = buf.length - (total - this.lastNeed);
-  buf.copy(this.lastChar, 0, end);
-  return buf.toString('utf8', i, end);
+  // Ends in valid 4-byte sequence or invalid continuation characters.
+  // Either way the input is complete, so convert it as is.
+  return end;
 }
 
 // For UTF-8, a replacement character for each buffered byte of a (partial)
 // character needs to be added to the output.
-function utf8End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed)
-    return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
-  return r;
+function utf8Flush() {
+  return '\ufffd'.repeat(this.partial);
 }
 
 // UTF-16LE typically needs two bytes per character, but even if we have an even
 // number of bytes available, we need to check if we end on a leading/high
 // surrogate. In that case, we need to wait for the next two bytes in order to
 // decode the last character properly.
-function utf16Text(buf, i) {
-  if ((buf.length - i) % 2 === 0) {
-    const r = buf.toString('utf16le', i);
-    if (r) {
-      const c = r.charCodeAt(r.length - 1);
-      if (c >= 0xD800 && c <= 0xDBFF) {
-        this.lastNeed = 2;
-        this.lastTotal = 4;
-        this.lastChar[0] = buf[buf.length - 2];
-        this.lastChar[1] = buf[buf.length - 1];
-        return r.slice(0, -1);
-      }
-    }
-    return r;
+function utf16Complete(buf, start, end) {
+  if ((end - start) & 1)
+    --end;
+  if (end > start) {
+    const byte = buf[end - 1];
+    if (byte >= 0xD8 && byte <= 0xDB)
+      return end - 2;
   }
-  this.lastNeed = 1;
-  this.lastTotal = 2;
-  this.lastChar[0] = buf[buf.length - 1];
-  return buf.toString('utf16le', i, buf.length - 1);
+  return end;
 }
 
-// For UTF-16LE we do not explicitly append special replacement characters if we
-// end on a partial character, we simply let v8 handle that.
-function utf16End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed) {
-    const end = this.lastTotal - this.lastNeed;
-    return r + this.lastChar.toString('utf16le', 0, end);
-  }
-  return r;
+function base64Complete(buf, start, end) {
+  return end - (end - start) % 3;
 }
 
-function base64Text(buf, i) {
-  const n = (buf.length - i) % 3;
-  if (n === 0)
-    return buf.toString('base64', i);
-  this.lastNeed = 3 - n;
-  this.lastTotal = 3;
-  if (n === 1) {
-    this.lastChar[0] = buf[buf.length - 1];
-  } else {
-    this.lastChar[0] = buf[buf.length - 2];
-    this.lastChar[1] = buf[buf.length - 1];
-  }
-  return buf.toString('base64', i, buf.length - n);
-}
-
-
-function base64End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed)
-    return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
-  return r;
+// For UTF-16LE and Base64 we do not explicitly append special replacement
+// characters if we end on a partial character, we simply let v8 handle that.
+function simpleFlush() {
+  return this.lastChar.toString(this.encoding, 0, this.partial);
 }
 
 // Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)

diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js
@@ -28,6 +28,30 @@ test(
   '\u02e4\u0064\u12e4\u0030\u3045'
 );
 
+// Some invalid input, known to have caused trouble with chunking
+// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923
+// 00: |00000000 ASCII
+// 41: |01000001 ASCII
+// B8: 10|111000 continuation
+// CC: 110|01100 two-byte head
+// E2: 1110|0010 three-byte head
+// F0: 11110|000 four-byte head
+// F1: 11110|001'another four-byte head
+// FB: 111110|11 "five-byte head", not UTF-8
+test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
+test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
+test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
+test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
+test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
+test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
+test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
+test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
+test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
+     '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
+test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
+
 // UCS-2
 test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
 
@@ -58,6 +82,11 @@ decoder = new StringDecoder('utf8');
 assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
 assert.strictEqual(decoder.end(), '\ufffd');
 
+decoder = new StringDecoder('utf8');
+assert.strictEqual(decoder.write(Buffer.from('f1', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('41f2', 'hex')), '\ufffdA');
+assert.strictEqual(decoder.end(), '\ufffd');
+
 
 // Additional UTF-16LE surrogate pair tests
 decoder = new StringDecoder('utf16le');
@@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) {
     sequence.forEach(function(write) {
       output += decoder.write(input.slice(write[0], write[1]));
     });
+    output += decoder.end();
     process.stdout.write('.');
     if (output !== expected) {
       var message =