Skip to content

Commit 588864d

Browse files
committed
string_decoder: fix handling of malformed utf8
There have been problems with utf8 decoding in cases where the input was invalid. Some cases would give different results depending on chunking, while others even led to exceptions. This commit simplifies the code in several ways, reducing the risk of breakage. Most importantly, the `text` method is not only used for bulk conversion of a single chunk, but also for the conversion of the mini buffer `lastChar` while handling characters spanning chunks. That should make most of the problems due to incorrect handling of special cases disappear. Secondly, that `text` method is now independent of encoding. The encoding-dependent method `complete` now has a single well-defined task: determine the buffer position up to which the input consists of complete characters. The actual conversion is handled in a central location, leading to a cleaner and leaner internal interface. Thirdly, we no longer try to remember just how many bytes we'll need for the next complete character. We simply try to fill up the `nextChar` buffer and perform a conversion on that. This reduces the number of internal counter variables from two to one, namely `partial` which indicates the number of bytes currently stored in `nextChar`. A possible drawback of this approach is that there is chance of degraded performance if input arrives one byte at a time and is from a script using long utf8 sequences. As a benefit, though, this allows us to emit a U+FFFD replacement character sooner in cases where the first byte of an utf8 sequence is not followed by the expected number of continuation bytes. Fixes: #7308
1 parent 1a1ff77 commit 588864d

File tree

2 files changed

+114
-137
lines changed

2 files changed

+114
-137
lines changed

lib/string_decoder.js

Lines changed: 84 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -44,186 +44,133 @@ function StringDecoder(encoding) {
4444
var nb;
4545
switch (this.encoding) {
4646
case 'utf16le':
47-
this.text = utf16Text;
48-
this.end = utf16End;
47+
this.complete = utf16Complete;
48+
this.flush = simpleFlush;
4949
// fall through
5050
case 'utf8':
5151
nb = 4;
5252
break;
5353
case 'base64':
54-
this.text = base64Text;
55-
this.end = base64End;
54+
this.complete = base64Complete;
55+
this.flush = simpleFlush;
5656
nb = 3;
5757
break;
5858
default:
5959
this.write = simpleWrite;
6060
this.end = simpleEnd;
6161
return;
6262
}
63-
this.lastNeed = 0;
64-
this.lastTotal = 0;
63+
this.partial = 0;
6564
this.lastChar = Buffer.allocUnsafe(nb);
6665
}
6766

6867
StringDecoder.prototype.write = function(buf) {
6968
if (buf.length === 0)
7069
return '';
71-
var r;
72-
var i;
73-
if (this.lastNeed) {
74-
r = this.fillLast(buf);
75-
if (r === undefined)
76-
return '';
77-
i = this.lastNeed;
78-
this.lastNeed = 0;
79-
} else {
80-
i = 0;
81-
}
82-
if (i < buf.length)
83-
return (r ? r + this.text(buf, i) : this.text(buf, i));
84-
return r || '';
70+
const partial = this.partial;
71+
if (!partial)
72+
return this.text(buf, 0, buf.length);
73+
74+
// We have incomplete characters in partial many bytes from last run.
75+
// Copy bytes from buf to fill lastChar (if there is enough input).
76+
const newHeadLen = Math.min(buf.length, this.lastChar.length - partial);
77+
const totalHeadLen = newHeadLen + partial;
78+
buf.copy(this.lastChar, partial, 0, newHeadLen);
79+
// Now we have totalHeadLen bytes of input in lastChar, try to convert that.
80+
let r = this.text(this.lastChar, 0, totalHeadLen);
81+
if (this.partial <= newHeadLen) // consumed at least all the old head
82+
r += this.text(buf, newHeadLen - this.partial, buf.length);
83+
return r;
8584
};
8685

87-
StringDecoder.prototype.end = utf8End;
88-
8986
// Returns only complete characters in a Buffer
90-
StringDecoder.prototype.text = utf8Text;
87+
StringDecoder.prototype.text = function(buf, start, end) {
88+
if (start === end)
89+
return '';
90+
const complete = this.complete(buf, start, end);
91+
this.partial = end - complete;
92+
if (this.partial && buf !== this.lastChar)
93+
buf.copy(this.lastChar, 0, complete, end);
94+
if (start === complete)
95+
return '';
96+
return buf.toString(this.encoding, start, complete);
97+
};
9198

92-
// Attempts to complete a partial character using bytes from a Buffer
93-
StringDecoder.prototype.fillLast = function(buf) {
94-
if (this.lastNeed <= buf.length) {
95-
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
96-
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
99+
// Returns a suitable representation of incomplete characters as well
100+
StringDecoder.prototype.end = function(buf) {
101+
let r = (buf && buf.length ? this.write(buf) : '');
102+
if (this.partial) {
103+
r += this.flush();
104+
this.partial = 0;
97105
}
98-
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
99-
this.lastNeed -= buf.length;
106+
return r;
100107
};
101108

102-
// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
103-
// continuation byte.
104-
function utf8CheckByte(byte) {
105-
if (byte <= 0x7F)
106-
return 0;
107-
else if (byte >> 5 === 0x06)
108-
return 2;
109-
else if (byte >> 4 === 0x0E)
110-
return 3;
111-
else if (byte >> 3 === 0x1E)
112-
return 4;
113-
return -1;
114-
}
109+
// Given (buf, start, end), determine the maximal n <= end such that
110+
// buf.slice(start, n) contains only complete characters
111+
StringDecoder.prototype.complete = utf8Complete;
112+
113+
// Returns a string representation of the this.partial bytes in
114+
// this.lastChar which represent an incomplete character
115+
StringDecoder.prototype.flush = utf8Flush;
115116

116117
// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
117-
// character, returning the total number of bytes needed to complete the partial
118-
// character (if applicable).
119-
function utf8CheckIncomplete(self, buf, i) {
120-
var j = buf.length - 1;
121-
if (j < i)
122-
return 0;
123-
var nb = utf8CheckByte(buf[j--]);
124-
if (nb >= 0) {
125-
if (nb > 0)
126-
self.lastNeed = nb + 1 - (buf.length - j);
127-
return nb;
128-
}
129-
if (j < i)
130-
return 0;
131-
nb = utf8CheckByte(buf[j--]);
132-
if (nb >= 0) {
133-
if (nb > 0)
134-
self.lastNeed = nb + 1 - (buf.length - j);
135-
return nb;
136-
}
137-
if (j < i)
138-
return 0;
139-
nb = utf8CheckByte(buf[j--]);
140-
if (nb >= 0) {
141-
if (nb > 0)
142-
self.lastNeed = nb + 1 - (buf.length - j);
143-
return nb;
118+
// character, returning the position after the last complete character.
119+
function utf8Complete(buf, start, end) {
120+
if (start > end - 3)
121+
start = end - 3;
122+
for (let i = end - 1; i >= start; --i) {
123+
const byte = buf[i];
124+
let numBytes;
125+
if (byte >> 6 === 0x02)
126+
continue; // continuation byte
127+
else if (byte >> 5 === 0x06)
128+
numBytes = 2;
129+
else if (byte >> 4 === 0x0E)
130+
numBytes = 3;
131+
else if (byte >> 3 === 0x1E)
132+
numBytes = 4;
133+
else
134+
numBytes = 1; // ASCII or invalid
135+
if (i + numBytes > end) // incomplete
136+
return i; // continue next run at leading byte
137+
// Have complete sequence, possibly followed by garbage continuation.
138+
return end;
144139
}
145-
return 0;
146-
}
147-
148-
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
149-
// partial character, the character's bytes are buffered until the required
150-
// number of bytes are available.
151-
function utf8Text(buf, i) {
152-
const total = utf8CheckIncomplete(this, buf, i);
153-
if (!this.lastNeed)
154-
return buf.toString('utf8', i);
155-
this.lastTotal = total;
156-
const end = buf.length - (total - this.lastNeed);
157-
buf.copy(this.lastChar, 0, end);
158-
return buf.toString('utf8', i, end);
140+
// Ends in valid 4-byte sequence or invalid continuation characters.
141+
// Either way the input is complete, so convert it as is.
142+
return end;
159143
}
160144

161145
// For UTF-8, a replacement character for each buffered byte of a (partial)
162146
// character needs to be added to the output.
163-
function utf8End(buf) {
164-
const r = (buf && buf.length ? this.write(buf) : '');
165-
if (this.lastNeed)
166-
return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
167-
return r;
147+
function utf8Flush() {
148+
return '\ufffd'.repeat(this.partial);
168149
}
169150

170151
// UTF-16LE typically needs two bytes per character, but even if we have an even
171152
// number of bytes available, we need to check if we end on a leading/high
172153
// surrogate. In that case, we need to wait for the next two bytes in order to
173154
// decode the last character properly.
174-
function utf16Text(buf, i) {
175-
if ((buf.length - i) % 2 === 0) {
176-
const r = buf.toString('utf16le', i);
177-
if (r) {
178-
const c = r.charCodeAt(r.length - 1);
179-
if (c >= 0xD800 && c <= 0xDBFF) {
180-
this.lastNeed = 2;
181-
this.lastTotal = 4;
182-
this.lastChar[0] = buf[buf.length - 2];
183-
this.lastChar[1] = buf[buf.length - 1];
184-
return r.slice(0, -1);
185-
}
186-
}
187-
return r;
155+
function utf16Complete(buf, start, end) {
156+
if ((end - start) & 1)
157+
--end;
158+
if (end > start) {
159+
const byte = buf[end - 1];
160+
if (byte >= 0xD8 && byte <= 0xDB)
161+
return end - 2;
188162
}
189-
this.lastNeed = 1;
190-
this.lastTotal = 2;
191-
this.lastChar[0] = buf[buf.length - 1];
192-
return buf.toString('utf16le', i, buf.length - 1);
163+
return end;
193164
}
194165

195-
// For UTF-16LE we do not explicitly append special replacement characters if we
196-
// end on a partial character, we simply let v8 handle that.
197-
function utf16End(buf) {
198-
const r = (buf && buf.length ? this.write(buf) : '');
199-
if (this.lastNeed) {
200-
const end = this.lastTotal - this.lastNeed;
201-
return r + this.lastChar.toString('utf16le', 0, end);
202-
}
203-
return r;
166+
function base64Complete(buf, start, end) {
167+
return end - (end - start) % 3;
204168
}
205169

206-
function base64Text(buf, i) {
207-
const n = (buf.length - i) % 3;
208-
if (n === 0)
209-
return buf.toString('base64', i);
210-
this.lastNeed = 3 - n;
211-
this.lastTotal = 3;
212-
if (n === 1) {
213-
this.lastChar[0] = buf[buf.length - 1];
214-
} else {
215-
this.lastChar[0] = buf[buf.length - 2];
216-
this.lastChar[1] = buf[buf.length - 1];
217-
}
218-
return buf.toString('base64', i, buf.length - n);
219-
}
220-
221-
222-
function base64End(buf) {
223-
const r = (buf && buf.length ? this.write(buf) : '');
224-
if (this.lastNeed)
225-
return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
226-
return r;
170+
// For UTF-16LE and Base64 we do not explicitly append special replacement
171+
// characters if we end on a partial character, we simply let v8 handle that.
172+
function simpleFlush() {
173+
return this.lastChar.toString(this.encoding, 0, this.partial);
227174
}
228175

229176
// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)

test/parallel/test-string-decoder.js

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,30 @@ test(
2828
'\u02e4\u0064\u12e4\u0030\u3045'
2929
);
3030

31+
// Some invalid input, known to have caused trouble with chunking
32+
// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923
33+
// 00: |00000000 ASCII
34+
// 41: |01000001 ASCII
35+
// B8: 10|111000 continuation
36+
// CC: 110|01100 two-byte head
37+
// E2: 1110|0010 three-byte head
38+
// F0: 11110|000 four-byte head
39+
// F1: 11110|001'another four-byte head
40+
// FB: 111110|11 "five-byte head", not UTF-8
41+
test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
42+
test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
43+
test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
44+
test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
45+
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
46+
test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
47+
test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
48+
test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
49+
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
50+
test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
51+
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
52+
'\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
53+
test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
54+
3155
// UCS-2
3256
test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
3357

@@ -58,6 +82,11 @@ decoder = new StringDecoder('utf8');
5882
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
5983
assert.strictEqual(decoder.end(), '\ufffd');
6084

85+
decoder = new StringDecoder('utf8');
86+
assert.strictEqual(decoder.write(Buffer.from('f1', 'hex')), '');
87+
assert.strictEqual(decoder.write(Buffer.from('41f2', 'hex')), '\ufffdA');
88+
assert.strictEqual(decoder.end(), '\ufffd');
89+
6190

6291
// Additional UTF-16LE surrogate pair tests
6392
decoder = new StringDecoder('utf16le');
@@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) {
93122
sequence.forEach(function(write) {
94123
output += decoder.write(input.slice(write[0], write[1]));
95124
});
125+
output += decoder.end();
96126
process.stdout.write('.');
97127
if (output !== expected) {
98128
var message =

0 commit comments

Comments
 (0)