diff --git a/lib/string_decoder.js b/lib/string_decoder.js
index 6eb71efc07d803..54b4d9cee4818f 100644
--- a/lib/string_decoder.js
+++ b/lib/string_decoder.js
@@ -44,15 +44,15 @@ function StringDecoder(encoding) {
   var nb;
   switch (this.encoding) {
     case 'utf16le':
-      this.text = utf16Text;
-      this.end = utf16End;
+      this.complete = utf16Complete;
+      this.flush = simpleFlush;
       // fall through
     case 'utf8':
       nb = 4;
       break;
     case 'base64':
-      this.text = base64Text;
-      this.end = base64End;
+      this.complete = base64Complete;
+      this.flush = simpleFlush;
       nb = 3;
       break;
     default:
@@ -60,170 +60,117 @@ function StringDecoder(encoding) {
       this.end = simpleEnd;
       return;
   }
-  this.lastNeed = 0;
-  this.lastTotal = 0;
+  this.partial = 0; // number of buffered bytes of an incomplete character
   this.lastChar = Buffer.allocUnsafe(nb);
 }
 
 StringDecoder.prototype.write = function(buf) {
   if (buf.length === 0)
     return '';
-  var r;
-  var i;
-  if (this.lastNeed) {
-    r = this.fillLast(buf);
-    if (r === undefined)
-      return '';
-    i = this.lastNeed;
-    this.lastNeed = 0;
-  } else {
-    i = 0;
-  }
-  if (i < buf.length)
-    return (r ? r + this.text(buf, i) : this.text(buf, i));
-  return r || '';
+  const partial = this.partial;
+  if (!partial)
+    return this.text(buf, 0, buf.length);
+
+  // lastChar holds `partial` leftover bytes of an incomplete character.
+  // Copy bytes from buf to fill lastChar (if there is enough input).
+  const newHeadLen = Math.min(buf.length, this.lastChar.length - partial);
+  const totalHeadLen = newHeadLen + partial;
+  buf.copy(this.lastChar, partial, 0, newHeadLen);
+  // Now we have totalHeadLen bytes of input in lastChar, try to convert that.
+  let r = this.text(this.lastChar, 0, totalHeadLen);
+  if (this.partial <= newHeadLen) // consumed at least all the old head
+    r += this.text(buf, newHeadLen - this.partial, buf.length);
+  return r;
 };
 
-StringDecoder.prototype.end = utf8End;
-
 // Returns only complete characters in a Buffer
-StringDecoder.prototype.text = utf8Text;
+StringDecoder.prototype.text = function(buf, start, end) {
+  if (start === end)
+    return '';
+  const complete = this.complete(buf, start, end);
+  this.partial = end - complete; // trailing bytes kept for the next call
+  if (this.partial && buf !== this.lastChar)
+    buf.copy(this.lastChar, 0, complete, end);
+  if (start === complete)
+    return '';
+  return buf.toString(this.encoding, start, complete);
+};
 
-// Attempts to complete a partial character using bytes from a Buffer
-StringDecoder.prototype.fillLast = function(buf) {
-  if (this.lastNeed <= buf.length) {
-    buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
-    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
+// Decodes buf (if given), then flushes any buffered incomplete character
+StringDecoder.prototype.end = function(buf) {
+  let r = (buf && buf.length ? this.write(buf) : '');
+  if (this.partial) {
+    r += this.flush();
+    this.partial = 0; // nothing is buffered once flushed
   }
-  buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
-  this.lastNeed -= buf.length;
+  return r;
 };
 
-// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
-// continuation byte.
-function utf8CheckByte(byte) {
-  if (byte <= 0x7F)
-    return 0;
-  else if (byte >> 5 === 0x06)
-    return 2;
-  else if (byte >> 4 === 0x0E)
-    return 3;
-  else if (byte >> 3 === 0x1E)
-    return 4;
-  return -1;
-}
+// Given (buf, start, end), determine the maximal n <= end such that
+// buf.slice(start, n) contains only complete characters
+StringDecoder.prototype.complete = utf8Complete;
+
+// Returns a string representation of the this.partial bytes in
+// this.lastChar which represent an incomplete character
+StringDecoder.prototype.flush = utf8Flush;
 
 // Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
-// character, returning the total number of bytes needed to complete the partial
-// character (if applicable).
-function utf8CheckIncomplete(self, buf, i) {
-  var j = buf.length - 1;
-  if (j < i)
-    return 0;
-  var nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
-  }
-  if (j < i)
-    return 0;
-  nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
-  }
-  if (j < i)
-    return 0;
-  nb = utf8CheckByte(buf[j--]);
-  if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
-    return nb;
+// character, returning the position after the last complete character.
+function utf8Complete(buf, start, end) {
+  if (start < end - 3) // clamp look-back to the last 3 bytes, never below start
+    start = end - 3;
+  for (let i = end - 1; i >= start; --i) {
+    const byte = buf[i];
+    let numBytes; // total sequence length announced by a leading byte
+    if (byte >> 6 === 0x02)
+      continue; // continuation byte
+    else if (byte >> 5 === 0x06)
+      numBytes = 2;
+    else if (byte >> 4 === 0x0E)
+      numBytes = 3;
+    else if (byte >> 3 === 0x1E)
+      numBytes = 4;
+    else
+      numBytes = 1; // ASCII or invalid
+    if (i + numBytes > end) // incomplete
+      return i; // continue next run at leading byte
+    // Have complete sequence, possibly followed by garbage continuation.
+    return end;
   }
-  return 0;
-}
-
-// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
-// partial character, the character's bytes are buffered until the required
-// number of bytes are available.
-function utf8Text(buf, i) {
-  const total = utf8CheckIncomplete(this, buf, i);
-  if (!this.lastNeed)
-    return buf.toString('utf8', i);
-  this.lastTotal = total;
-  const end = buf.length - (total - this.lastNeed);
-  buf.copy(this.lastChar, 0, end);
-  return buf.toString('utf8', i, end);
+  // Ends in valid 4-byte sequence or invalid continuation characters.
+  // Either way the input is complete, so convert it as is.
+  return end;
 }
 
 // For UTF-8, a replacement character for each buffered byte of a (partial)
 // character needs to be added to the output.
-function utf8End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed)
-    return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
-  return r;
+function utf8Flush() {
+  return '\ufffd'.repeat(this.partial);
 }
 
 // UTF-16LE typically needs two bytes per character, but even if we have an even
 // number of bytes available, we need to check if we end on a leading/high
 // surrogate. In that case, we need to wait for the next two bytes in order to
 // decode the last character properly.
-function utf16Text(buf, i) {
-  if ((buf.length - i) % 2 === 0) {
-    const r = buf.toString('utf16le', i);
-    if (r) {
-      const c = r.charCodeAt(r.length - 1);
-      if (c >= 0xD800 && c <= 0xDBFF) {
-        this.lastNeed = 2;
-        this.lastTotal = 4;
-        this.lastChar[0] = buf[buf.length - 2];
-        this.lastChar[1] = buf[buf.length - 1];
-        return r.slice(0, -1);
-      }
-    }
-    return r;
+function utf16Complete(buf, start, end) {
+  if ((end - start) & 1) // odd byte count: hold back the last byte
+    --end;
+  if (end > start) {
+    const byte = buf[end - 1]; // upper byte of the last 16-bit unit
+    if (byte >= 0xD8 && byte <= 0xDB) // unit is a leading/high surrogate
+      return end - 2;
   }
-  this.lastNeed = 1;
-  this.lastTotal = 2;
-  this.lastChar[0] = buf[buf.length - 1];
-  return buf.toString('utf16le', i, buf.length - 1);
+  return end;
 }
 
-// For UTF-16LE we do not explicitly append special replacement characters if we
-// end on a partial character, we simply let v8 handle that.
-function utf16End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed) {
-    const end = this.lastTotal - this.lastNeed;
-    return r + this.lastChar.toString('utf16le', 0, end);
-  }
-  return r;
+function base64Complete(buf, start, end) {
+  return end - (end - start) % 3; // keep only whole 3-byte groups
 }
 
-function base64Text(buf, i) {
-  const n = (buf.length - i) % 3;
-  if (n === 0)
-    return buf.toString('base64', i);
-  this.lastNeed = 3 - n;
-  this.lastTotal = 3;
-  if (n === 1) {
-    this.lastChar[0] = buf[buf.length - 1];
-  } else {
-    this.lastChar[0] = buf[buf.length - 2];
-    this.lastChar[1] = buf[buf.length - 1];
-  }
-  return buf.toString('base64', i, buf.length - n);
-}
-
-
-function base64End(buf) {
-  const r = (buf && buf.length ? this.write(buf) : '');
-  if (this.lastNeed)
-    return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
-  return r;
+// For UTF-16LE and Base64 we do not explicitly append special replacement
+// characters if we end on a partial character; we simply let v8 handle that.
+function simpleFlush() {
+  return this.lastChar.toString(this.encoding, 0, this.partial);
 }
 
 // Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js
index 14933c46fcd888..c01639f54b28d1 100644
--- a/test/parallel/test-string-decoder.js
+++ b/test/parallel/test-string-decoder.js
@@ -28,6 +28,30 @@ test(
   '\u02e4\u0064\u12e4\u0030\u3045'
 );
 
+// Some invalid input, known to have caused trouble with chunking
+// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923
+// 00: |00000000 ASCII
+// 41: |01000001 ASCII
+// B8: 10|111000 continuation
+// CC: 110|01100 two-byte head
+// E2: 1110|0010 three-byte head
+// F0: 11110|000 four-byte head
+// F1: 11110|001 another four-byte head
+// FB: 111110|11 "five-byte head", not UTF-8
+test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
+test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
+test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
+test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
+test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
+test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
+test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
+test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
+test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
+     '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
+test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
+
 // UCS-2
 test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
 
@@ -58,6 +82,11 @@ decoder = new StringDecoder('utf8');
 assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
 assert.strictEqual(decoder.end(), '\ufffd');
 
+decoder = new StringDecoder('utf8');
+assert.strictEqual(decoder.write(Buffer.from('f1', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('41f2', 'hex')), '\ufffdA');
+assert.strictEqual(decoder.end(), '\ufffd');
+
 
 // Additional UTF-16LE surrogate pair tests
 decoder = new StringDecoder('utf16le');
@@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) {
     sequence.forEach(function(write) {
       output += decoder.write(input.slice(write[0], write[1]));
     });
+    output += decoder.end(); // include whatever end() flushes in the output
     process.stdout.write('.');
     if (output !== expected) {
       var message =