Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 84 additions & 137 deletions lib/string_decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,186 +44,133 @@ function StringDecoder(encoding) {
var nb;
switch (this.encoding) {
case 'utf16le':
this.text = utf16Text;
this.end = utf16End;
this.complete = utf16Complete;
this.flush = simpleFlush;
// fall through
case 'utf8':
nb = 4;
break;
case 'base64':
this.text = base64Text;
this.end = base64End;
this.complete = base64Complete;
this.flush = simpleFlush;
nb = 3;
break;
default:
this.write = simpleWrite;
this.end = simpleEnd;
return;
}
this.lastNeed = 0;
this.lastTotal = 0;
this.partial = 0;
this.lastChar = Buffer.allocUnsafe(nb);
}

// Decodes the next chunk of input.
//
// buf: Buffer holding the next bytes of the encoded stream.
// Returns the string for every character that is complete so far. Bytes of a
// trailing incomplete character are kept (this.partial of them, stored at the
// start of this.lastChar by this.text()) and prepended to the next chunk.
StringDecoder.prototype.write = function(buf) {
  if (buf.length === 0)
    return '';
  const partial = this.partial;
  if (!partial)
    return this.text(buf, 0, buf.length);

  // We have incomplete characters in partial many bytes from last run.
  // Copy bytes from buf to fill lastChar (if there is enough input).
  const newHeadLen = Math.min(buf.length, this.lastChar.length - partial);
  const totalHeadLen = newHeadLen + partial;
  buf.copy(this.lastChar, partial, 0, newHeadLen);
  // Now we have totalHeadLen bytes of input in lastChar, try to convert that.
  // this.text() updates this.partial to the number of head bytes left over.
  let r = this.text(this.lastChar, 0, totalHeadLen);
  // If every byte carried over from the previous run was consumed, whatever is
  // still pending came from buf, so resume decoding buf at that position.
  if (this.partial <= newHeadLen)
    r += this.text(buf, newHeadLen - this.partial, buf.length);
  return r;
};

StringDecoder.prototype.end = utf8End;

// Returns only complete characters in a Buffer
StringDecoder.prototype.text = utf8Text;
// Converts the complete characters found in buf[start, end) to a string.
//
// buf:   Buffer to read from; may be this.lastChar itself.
// start: index of the first byte to consider.
// end:   index one past the last byte to consider.
// Returns the decoded string ('' when nothing is complete). Afterwards
// this.partial holds the number of trailing bytes that belong to an incomplete
// character, and those bytes are stored at the start of this.lastChar.
// An empty range returns '' without touching this.partial.
StringDecoder.prototype.text = function(buf, start, end) {
  if (start === end)
    return '';
  const complete = this.complete(buf, start, end);
  // Decode before touching lastChar: buf may be lastChar, and the copy below
  // would otherwise overwrite bytes that still have to be converted.
  const r = (start === complete ?
    '' : buf.toString(this.encoding, start, complete));
  this.partial = end - complete;
  // Move the leftover bytes to the front of lastChar. This is also required
  // when buf is lastChar and the leftover does not already start at index 0;
  // Buffer#copy handles overlapping regions of the same buffer.
  if (this.partial && (buf !== this.lastChar || complete !== 0))
    buf.copy(this.lastChar, 0, complete, end);
  return r;
};

// Attempts to complete a partial character using bytes from a Buffer
StringDecoder.prototype.fillLast = function(buf) {
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
// Finishes decoding and returns a suitable representation of incomplete
// characters as well.
//
// buf: optional Buffer with final input; it is passed through this.write()
//      first when present and non-empty.
// Returns the decoded remainder. If bytes of an incomplete character are still
// pending (this.partial), the result of this.flush() is appended and the
// pending count is reset to 0.
StringDecoder.prototype.end = function(buf) {
  let r = (buf && buf.length ? this.write(buf) : '');
  if (this.partial) {
    r += this.flush();
    this.partial = 0;
  }
  return r;
};

// Classifies a single byte of UTF-8 input.
// Returns 0 for an ASCII byte, the sequence length (2, 3 or 4) when the byte
// has the bit prefix of a leading byte, and -1 for anything else (which
// includes continuation bytes).
function utf8CheckByte(byte) {
  if (byte <= 0x7F)
    return 0;
  // [shift, expected prefix after shifting, sequence length]
  const leadPatterns = [[5, 0x06, 2], [4, 0x0E, 3], [3, 0x1E, 4]];
  for (const [shift, prefix, length] of leadPatterns) {
    if (byte >> shift === prefix)
      return length;
  }
  return -1;
}
// Given (buf, start, end), determine the maximal n <= end such that
// buf.slice(start, n) contains only complete characters
StringDecoder.prototype.complete = utf8Complete;

// Returns a string representation of the this.partial bytes in
// this.lastChar which represent an incomplete character
StringDecoder.prototype.flush = utf8Flush;

// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
// character, returning the total number of bytes needed to complete the partial
// character (if applicable).
function utf8CheckIncomplete(self, buf, i) {
var j = buf.length - 1;
if (j < i)
return 0;
var nb = utf8CheckByte(buf[j--]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
return nb;
}
if (j < i)
return 0;
nb = utf8CheckByte(buf[j--]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
return nb;
}
if (j < i)
return 0;
nb = utf8CheckByte(buf[j--]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
return nb;
// Checks at most the last 3 bytes of buf[start, end) for an incomplete UTF-8
// character, returning the position after the last complete character.
//
// buf:   Buffer holding UTF-8 encoded bytes.
// start: index of the first byte that may be inspected.
// end:   index one past the last byte that may be inspected.
// Returns the index of the leading byte of a trailing incomplete sequence, or
// end when the range does not end in the middle of a sequence.
function utf8Complete(buf, start, end) {
  // A sequence is at most 4 bytes long, so only a leading byte within the last
  // 3 bytes can run past end. Never look at bytes before start.
  const first = Math.max(start, end - 3);
  for (let i = end - 1; i >= first; --i) {
    const byte = buf[i];
    let numBytes;
    if (byte >> 6 === 0x02)
      continue; // continuation byte
    else if (byte >> 5 === 0x06)
      numBytes = 2;
    else if (byte >> 4 === 0x0E)
      numBytes = 3;
    else if (byte >> 3 === 0x1E)
      numBytes = 4;
    else
      numBytes = 1; // ASCII or invalid
    if (i + numBytes > end) // incomplete
      return i; // continue next run at leading byte
    // Have complete sequence, possibly followed by garbage continuation.
    return end;
  }
  // Only continuation bytes in the inspected window: either the tail of a
  // 4-byte sequence or stray continuation bytes. Either way nothing is
  // pending, so the whole range can be converted as is.
  return end;
}

// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
// partial character, the character's bytes are buffered until the required
// number of bytes are available.
function utf8Text(buf, i) {
const total = utf8CheckIncomplete(this, buf, i);
if (!this.lastNeed)
return buf.toString('utf8', i);
this.lastTotal = total;
const end = buf.length - (total - this.lastNeed);
buf.copy(this.lastChar, 0, end);
return buf.toString('utf8', i, end);
// Ends in valid 4-byte sequence or invalid continuation characters.
// Either way the input is complete, so convert it as is.
return end;
}

// For UTF-8, a replacement character for each buffered byte of a (partial)
// character needs to be added to the output.
function utf8End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed)
return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
return r;
function utf8Flush() {
  // One replacement character per byte that is still buffered.
  const pendingBytes = this.partial;
  return '\ufffd'.repeat(pendingBytes);
}

// UTF-16LE typically needs two bytes per character, but even if we have an even
// number of bytes available, we need to check if we end on a leading/high
// surrogate. In that case, we need to wait for the next two bytes in order to
// decode the last character properly.
function utf16Text(buf, i) {
if ((buf.length - i) % 2 === 0) {
const r = buf.toString('utf16le', i);
if (r) {
const c = r.charCodeAt(r.length - 1);
if (c >= 0xD800 && c <= 0xDBFF) {
this.lastNeed = 2;
this.lastTotal = 4;
this.lastChar[0] = buf[buf.length - 2];
this.lastChar[1] = buf[buf.length - 1];
return r.slice(0, -1);
}
}
return r;
// Determines how much of buf[start, end) can be decoded as UTF-16LE now.
//
// buf:   Buffer holding UTF-16LE encoded bytes.
// start: index of the first byte of the range.
// end:   index one past the last byte of the range.
// Returns the position after the last complete code unit, stepping back one
// more code unit when the range ends on a leading/high surrogate so that it
// can be decoded together with the bytes that follow.
function utf16Complete(buf, start, end) {
  // An odd trailing byte is half a code unit; leave it for the next run.
  if ((end - start) & 1)
    --end;
  if (end > start) {
    // Little endian: the high byte of the last code unit is its second byte.
    const byte = buf[end - 1];
    if (byte >= 0xD8 && byte <= 0xDB)
      return end - 2;
  }
  return end;
}

// For UTF-16LE we do not explicitly append special replacement characters if we
// end on a partial character, we simply let v8 handle that.
function utf16End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed) {
const end = this.lastTotal - this.lastNeed;
return r + this.lastChar.toString('utf16le', 0, end);
}
return r;
// Returns the largest position n <= end such that buf[start, n) is a whole
// number of 3-byte groups. The buffer contents are not inspected.
function base64Complete(buf, start, end) {
  const trailing = (end - start) % 3;
  return end - trailing;
}

// Encodes buf from index i as base64, holding back the trailing bytes that do
// not fill a 3-byte group. The held-back bytes (one or two) are stored in
// this.lastChar, with this.lastNeed set to the number of bytes still missing
// and this.lastTotal set to 3.
function base64Text(buf, i) {
  const leftover = (buf.length - i) % 3;
  if (leftover !== 0) {
    const lastIndex = buf.length - 1;
    this.lastNeed = 3 - leftover;
    this.lastTotal = 3;
    if (leftover === 1) {
      this.lastChar[0] = buf[lastIndex];
    } else {
      this.lastChar[0] = buf[lastIndex - 1];
      this.lastChar[1] = buf[lastIndex];
    }
    return buf.toString('base64', i, buf.length - leftover);
  }
  // Whole groups only: nothing needs to be buffered.
  return buf.toString('base64', i);
}


function base64End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed)
return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
return r;
// UTF-16LE and Base64 get no explicit replacement characters for a trailing
// partial character: the first this.partial bytes of this.lastChar are simply
// converted with this.encoding and the conversion decides what comes out.
function simpleFlush() {
  const pending = this.lastChar;
  return pending.toString(this.encoding, 0, this.partial);
}

// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
Expand Down
30 changes: 30 additions & 0 deletions test/parallel/test-string-decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,30 @@ test(
'\u02e4\u0064\u12e4\u0030\u3045'
);

// Some invalid input, known to have caused trouble with chunking
// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923
// 00: |00000000 ASCII
// 41: |01000001 ASCII
// B8: 10|111000 continuation
// CC: 110|01100 two-byte head
// E2: 1110|0010 three-byte head
// F0: 11110|000 four-byte head
// F1: 11110|001 another four-byte head
// FB: 111110|11 "five-byte head", not UTF-8
test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
'\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');

// UCS-2
test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');

Expand Down Expand Up @@ -58,6 +82,11 @@ decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
assert.strictEqual(decoder.end(), '\ufffd');

decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('f1', 'hex')), '');
assert.strictEqual(decoder.write(Buffer.from('41f2', 'hex')), '\ufffdA');
assert.strictEqual(decoder.end(), '\ufffd');


// Additional UTF-16LE surrogate pair tests
decoder = new StringDecoder('utf16le');
Expand Down Expand Up @@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) {
sequence.forEach(function(write) {
output += decoder.write(input.slice(write[0], write[1]));
});
output += decoder.end();
process.stdout.write('.');
if (output !== expected) {
var message =
Expand Down