string_decoder: fix handling of malformed utf8

gagern · gagern · commit 588864d409fb · 2016-06-16T16:22:56.000+02:00
There have been problems with utf8 decoding in cases where the input was invalid. Some cases would give different results depending on chunking, while others even led to exceptions. This commit simplifies the code in several ways, reducing the risk of breakage. Most importantly, the `text` method is not only used for bulk conversion of a single chunk, but also for the conversion of the mini buffer `lastChar` while handling characters spanning chunks. That should make most of the problems due to incorrect handling of special cases disappear. Secondly, that `text` method is now independent of encoding. The encoding-dependent method `complete` now has a single well-defined task: determine the buffer position up to which the input consists of complete characters. The actual conversion is handled in a central location, leading to a cleaner and leaner internal interface. Thirdly, we no longer try to remember just how many bytes we'll need for the next complete character. We simply try to fill up the `nextChar` buffer and perform a conversion on that. This reduces the number of internal counter variables from two to one, namely `partial` which indicates the number of bytes currently stored in `nextChar`. A possible drawback of this approach is that there is chance of degraded performance if input arrives one byte at a time and is from a script using long utf8 sequences. As a benefit, though, this allows us to emit a U+FFFD replacement character sooner in cases where the first byte of an utf8 sequence is not followed by the expected number of continuation bytes. Fixes: #7308
diff --git a/lib/string_decoder.js b/lib/string_decoder.js
@@ -44,186 +44,133 @@ function StringDecoder(encoding) {
   var nb;
   switch (this.encoding) {
     case 'utf16le':
-      this.text = utf16Text;
-      this.end = utf16End;
+      this.complete = utf16Complete;
+      this.flush = simpleFlush;
       // fall through
     case 'utf8':
       nb = 4;
       break;
     case 'base64':
-      this.text = base64Text;
-      this.end = base64End;
+      this.complete = base64Complete;
+      this.flush = simpleFlush;
       nb = 3;
       break;
     default:
       this.write = simpleWrite;
       this.end = simpleEnd;
       return;
   }
-  this.lastNeed = 0;
-  this.lastTotal = 0;
+  this.partial = 0;
   this.lastChar = Buffer.allocUnsafe(nb);
 }
 
 StringDecoder.prototype.write = function(buf) {
   if (buf.length === 0)
     return '';
-  var r;
-  var i;
-  if (this.lastNeed) {
-    r = this.fillLast(buf);
-    if (r === undefined)
-      return '';
-    i = this.lastNeed;
-    this.lastNeed = 0;
-  } else {
-    i = 0;
-  }
-  if (i < buf.length)
-    return (r ? r + this.text(buf, i) : this.text(buf, i));
-  return r || '';
+  const partial = this.partial;
+  if (!partial)
+    return this.text(buf, 0, buf.length);
+
+  // We have incomplete characters in partial many bytes from last run.
+  // Copy bytes from buf to fill lastChar (if there is enough input).
+  const newHeadLen = Math.min(buf.length, this.lastChar.length - partial);
+  const totalHeadLen = newHeadLen + partial;
+  buf.copy(this.lastChar, partial, 0, newHeadLen);
+  // Now we have totalHeadLen bytes of input in lastChar, try to convert that.
+  let r = this.text(this.lastChar, 0, totalHeadLen);
+  if (this.partial <= newHeadLen) // consumed at least all the old head
+    r += this.text(buf, newHeadLen - this.partial, buf.length);
+  return r;
 };
 
-StringDecoder.prototype.end = utf8End;
-
 // Returns only complete characters in a Buffer
-StringDecoder.prototype.text = utf8Text;
+StringDecoder.prototype.text = function(buf, start, end) {
+  if (start === end)
+    return '';
+  const complete = this.complete(buf, start, end);
+  this.partial = end - complete;
+  if (this.partial && buf !== this.lastChar)
+    buf.copy(this.lastChar, 0, complete, end);
+  if (start === complete)
+    return '';
+  return buf.toString(this.encoding, start, complete);
+};
 
-// Attempts to complete a partial character using bytes from a Buffer
-StringDecoder.prototype.fillLast = function(buf) {
-  if (this.lastNeed <= buf.length) {
-    buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
-    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
+// Returns a suitable representation of incomplete characters as well
+StringDecoder.prototype.end = function(buf) {
+  let r = (buf && buf.length ? this.write(buf) : '');
+  if (this.partial) {
+    r += this.flush();
+    this.partial = 0;
   }
-  buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
-  this.lastNeed -= buf.length;
+  return r;
 };
 
-// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
-// continuation byte.
-function utf8CheckByte(byte) {
-  if (byte <= 0x7F)
-    return 0;
-  else if (byte >> 5 === 0x06)
-    return 2;
-  else if (byte >> 4 === 0x0E)
-    return 3;
-  else if (byte >> 3 === 0x1E)
-    return 4;
-  return -1;
-}
+// Given (buf, start, end), determine the maximal n <= end such that
+// buf.slice(start, n) contains only complete characters
+StringDecoder.prototype.complete = utf8Complete;
+
+// Returns a string representation of the this.partial bytes in
+// this.lastChar which represent an incomplete character
+StringDecoder.prototype.flush = utf8Flush;
 
 // Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
-// character, returning the total number of bytes needed to complete the partial
-// character (if applicable).
-function utf8CheckIncomplete(self, buf, i) {
-  var j = buf.length - 1;
-  if (j < i)
-    return 0;
-  var nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
-  }
-  if (j < i)
-    return 0;
-  nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
-  }
-  if (j < i)
-    return 0;
-  nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
+// character, returning the position after the last complete character.
+function utf8Complete(buf, start, end) {
+  if (start > end - 3)
+    start = end - 3;
+  for (let i = end - 1; i >= start; --i) {
+    const byte = buf[i];
+    let numBytes;
+    if (byte >> 6 === 0x02)
+      continue; // continuation byte
+    else if (byte >> 5 === 0x06)
+      numBytes = 2;
+    else if (byte >> 4 === 0x0E)
+      numBytes = 3;
+    else if (byte >> 3 === 0x1E)
+      numBytes = 4;
+    else
+      numBytes = 1; // ASCII or invalid
+    if (i + numBytes > end) // incomplete
+      return i; // continue next run at leading byte
+    // Have complete sequence, possibly followed by garbage continuation.
+    return end;
   }
-  return 0;
-}
-
-// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
-// partial character, the character's bytes are buffered until the required
-// number of bytes are available.
-function utf8Text(buf, i) {
-  const total = utf8CheckIncomplete(this, buf, i);
-  if (!this.lastNeed)
-    return buf.toString('utf8', i);
-  this.lastTotal = total;
-  const end = buf.length - (total - this.lastNeed);
-  buf.copy(this.lastChar, 0, end);
-  return buf.toString('utf8', i, end);
+  // Ends in valid 4-byte sequence or invalid continuation characters.
+  // Either way the input is complete, so convert it as is.
+  return end;
 }
 
 // For UTF-8, a replacement character for each buffered byte of a (partial)
 // character needs to be added to the output.
-function utf8End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed)
-    return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
-  return r;
+function utf8Flush() {
+  return '\ufffd'.repeat(this.partial);
 }
 
 // UTF-16LE typically needs two bytes per character, but even if we have an even
 // number of bytes available, we need to check if we end on a leading/high
 // surrogate. In that case, we need to wait for the next two bytes in order to
 // decode the last character properly.
-function utf16Text(buf, i) {
-  if ((buf.length - i) % 2 === 0) {
-    const r = buf.toString('utf16le', i);
-    if (r) {
-      const c = r.charCodeAt(r.length - 1);
-      if (c >= 0xD800 && c <= 0xDBFF) {
-        this.lastNeed = 2;
-        this.lastTotal = 4;
-        this.lastChar[0] = buf[buf.length - 2];
-        this.lastChar[1] = buf[buf.length - 1];
-        return r.slice(0, -1);
-      }
-    }
-    return r;
+function utf16Complete(buf, start, end) {
+  if ((end - start) & 1)
+    --end;
+  if (end > start) {
+    const byte = buf[end - 1];
+    if (byte >= 0xD8 && byte <= 0xDB)
+      return end - 2;
   }
-  this.lastNeed = 1;
-  this.lastTotal = 2;
-  this.lastChar[0] = buf[buf.length - 1];
-  return buf.toString('utf16le', i, buf.length - 1);
+  return end;
 }
 
-// For UTF-16LE we do not explicitly append special replacement characters if we
-// end on a partial character, we simply let v8 handle that.
-function utf16End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed) {
-    const end = this.lastTotal - this.lastNeed;
-    return r + this.lastChar.toString('utf16le', 0, end);
-  }
-  return r;
+function base64Complete(buf, start, end) {
+  return end - (end - start) % 3;
 }
 
-function base64Text(buf, i) {
-  const n = (buf.length - i) % 3;
-  if (n === 0)
-    return buf.toString('base64', i);
-  this.lastNeed = 3 - n;
-  this.lastTotal = 3;
-  if (n === 1) {
-    this.lastChar[0] = buf[buf.length - 1];
-  } else {
-    this.lastChar[0] = buf[buf.length - 2];
-    this.lastChar[1] = buf[buf.length - 1];
-  }
-  return buf.toString('base64', i, buf.length - n);
-}
-
-
-function base64End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed)
-    return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
-  return r;
+// For UTF-16LE and Base64 we do not explicitly append special replacement
+// characters if we end on a partial character, we simply let v8 handle that.
+function simpleFlush() {
+  return this.lastChar.toString(this.encoding, 0, this.partial);
 }
 
 // Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js
@@ -28,6 +28,30 @@ test(
   '\u02e4\u0064\u12e4\u0030\u3045'
 );
 
+// Some invalid input, known to have caused trouble with chunking
+// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923
+// 00: |00000000 ASCII
+// 41: |01000001 ASCII
+// B8: 10|111000 continuation
+// CC: 110|01100 two-byte head
+// E2: 1110|0010 three-byte head
+// F0: 11110|000 four-byte head
+// F1: 11110|001'another four-byte head
+// FB: 111110|11 "five-byte head", not UTF-8
+test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
+test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
+test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
+test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
+test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
+test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
+test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
+test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
+test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
+     '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
+test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
+
 // UCS-2
 test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
 
@@ -58,6 +82,11 @@ decoder = new StringDecoder('utf8');
 assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
 assert.strictEqual(decoder.end(), '\ufffd');
 
+decoder = new StringDecoder('utf8');
+assert.strictEqual(decoder.write(Buffer.from('f1', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('41f2', 'hex')), '\ufffdA');
+assert.strictEqual(decoder.end(), '\ufffd');
+
 
 // Additional UTF-16LE surrogate pair tests
 decoder = new StringDecoder('utf16le');
@@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) {
     sequence.forEach(function(write) {
       output += decoder.write(input.slice(write[0], write[1]));
     });
+    output += decoder.end();
     process.stdout.write('.');
     if (output !== expected) {
       var message =