Skip to content

Commit 9fef5eb

Browse files
committed
Reduce in-memory size of named entities data from ~8.5Mb to ~250Kb (closes inikulin#52)
1 parent fefd045 commit 9fef5eb

File tree

13 files changed

+697
-152
lines changed

13 files changed

+697
-152
lines changed

.eslintignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
lib/tokenizer/named_entity_trie.js
1+
lib/tokenizer/named_entity_data.js
22
test/benchmark/node_modules/**/*.js
33
test/memory_benchmark/node_modules/**/*.js

Gulpfile.js

Lines changed: 38 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -11,63 +11,13 @@ var fork = require('child_process').fork,
1111
through = require('through2'),
1212
concat = require('gulp-concat'),
1313
jsdoc = require('gulp-jsdoc-to-markdown'),
14-
insert = require('gulp-insert');
14+
insert = require('gulp-insert'),
15+
generateNamedEntityData = require('./scripts/generate_named_entity_data'),
16+
generateParserFeedbackTest = require('./scripts/generate_parser_feedback_test');
1517

1618

17-
gulp.task('generate-trie', function () {
18-
function createTrie(entitiesData) {
19-
return Object.keys(entitiesData).reduce(function (trie, entity) {
20-
var resultCp = entitiesData[entity].codepoints;
21-
22-
entity = entity.replace(/^&/, '');
23-
24-
var entityLength = entity.length,
25-
last = entityLength - 1,
26-
leaf = trie;
27-
28-
for (var i = 0; i < entityLength; i++) {
29-
var key = entity.charCodeAt(i);
30-
31-
if (!leaf[key])
32-
leaf[key] = {};
33-
34-
if (i === last)
35-
leaf[key].c = resultCp;
36-
37-
else {
38-
if (!leaf[key].l)
39-
leaf[key].l = {};
40-
41-
leaf = leaf[key].l;
42-
}
43-
}
44-
45-
return trie;
46-
}, {});
47-
}
48-
49-
function trieCodeGen(file, encoding, callback) {
50-
var entitiesData = JSON.parse(file.contents.toString()),
51-
trie = createTrie(entitiesData),
52-
out = '\'use strict\';\n\n' +
53-
'//NOTE: this file contains auto-generated trie structure that is used for named entity references consumption\n' +
54-
'//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references and\n' +
55-
'//http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html#named-character-references)\n' +
56-
'module.exports = ' + JSON.stringify(trie).replace(/"/g, '') + ';\n';
57-
58-
59-
file.contents = new Buffer(out);
60-
61-
callback(null, file);
62-
}
63-
64-
return download('https://html.spec.whatwg.org/multipage/entities.json')
65-
.pipe(through.obj(trieCodeGen))
66-
.pipe(rename('named_entity_trie.js'))
67-
.pipe(gulp.dest('lib/tokenizer'));
68-
});
69-
70-
gulp.task('generate-api-reference', function () {
19+
// Docs
20+
gulp.task('update-api-reference', function () {
7121
return gulp
7222
.src('lib/**/*.js')
7323
.pipe(concat('05_api_reference.md'))
@@ -76,13 +26,15 @@ gulp.task('generate-api-reference', function () {
7626
.pipe(gulp.dest('docs'));
7727
});
7828

79-
gulp.task('docs', ['generate-api-reference'], function () {
29+
gulp.task('docs', ['update-api-reference'], function () {
8030
return gulp
8131
.src('docs/*.md')
8232
.pipe(concat('index.md'))
8333
.pipe(gulp.dest('docs/build'));
8434
});
8535

36+
37+
// Benchmarks
8638
gulp.task('install-upstream-parse5', function () {
8739
return gulp
8840
.src('test/benchmark/package.json')
@@ -112,99 +64,57 @@ gulp.task('named-entity-data-memory-benchmark', function (done) {
11264
fork('./test/memory_benchmark/named_entity_data').once('close', done);
11365
});
11466

67+
68+
// Test
11569
// Lints the library, test, script and Gulpfile sources: each matched file is
// piped through eslint(), then eslint.format(), then eslint.failAfterError().
gulp.task('lint', function () {
11670
return gulp
11771
.src([
11872
'lib/**/*.js',
11973
'test/**/*.js',
74+
'scripts/**/*.js',
12075
'Gulpfile.js'
12176
])
12277
.pipe(eslint())
12378
.pipe(eslint.format())
12479
.pipe(eslint.failAfterError());
12580
});
12681

127-
gulp.task('update-feedback-tests', function () {
128-
var Parser = require('./lib/Parser');
129-
var Tokenizer = require('./lib/tokenizer');
130-
var defaultTreeAdapter = require('./lib/tree_adapters/default');
131-
var testUtils = require('./test/test_utils');
132-
133-
function appendToken(dest, token) {
134-
switch (token.type) {
135-
case Tokenizer.EOF_TOKEN:
136-
return false;
137-
case Tokenizer.NULL_CHARACTER_TOKEN:
138-
case Tokenizer.WHITESPACE_CHARACTER_TOKEN:
139-
token.type = Tokenizer.CHARACTER_TOKEN;
140-
/* falls through */
141-
case Tokenizer.CHARACTER_TOKEN:
142-
if (dest.length > 0 && dest[dest.length - 1].type === Tokenizer.CHARACTER_TOKEN) {
143-
dest[dest.length - 1].chars += token.chars;
144-
return true;
145-
}
146-
break;
147-
}
148-
dest.push(token);
149-
return true;
150-
}
151-
152-
function collectParserTokens(html) {
153-
var tokens = [];
154-
var parser = new Parser();
155-
156-
parser._processInputToken = function (token) {
157-
Parser.prototype._processInputToken.call(this, token);
158-
159-
// Needed to split attributes of duplicate <html> and <body>
160-
// which are otherwise merged as per tree constructor spec
161-
if (token.type === Tokenizer.START_TAG_TOKEN)
162-
token.attrs = token.attrs.slice();
163-
164-
appendToken(tokens, token);
165-
};
166-
167-
parser.parse(html);
168-
169-
return tokens.map(testUtils.convertTokenToHtml5Lib);
170-
}
82+
// Pipes every file matching test/fixtures/*_test.js through mocha() using the
// 'exports' UI and the 'progress' reporter. The 'lint' task is declared as a
// dependency of this task.
gulp.task('test', ['lint'], function () {
83+
return gulp
84+
.src('test/fixtures/*_test.js')
85+
.pipe(mocha({
86+
ui: 'exports',
87+
reporter: 'progress',
// The timeout is 20000 unless the `v8debug` global is defined, in which
// case it is Infinity.
88+
timeout: typeof v8debug === 'undefined' ? 20000 : Infinity // NOTE: disable timeouts in debug
89+
}));
90+
});
91+
17192

93+
// Scripts
94+
gulp.task('update-feedback-tests', function () {
17295
return gulp
17396
.src(['test/data/tree_construction/*.dat', 'test/data/tree_construction_regression/*.dat'])
17497
.pipe(through.obj(function (file, encoding, callback) {
175-
var tests = testUtils.parseTreeConstructionTestData(
176-
file.contents.toString(),
177-
defaultTreeAdapter
178-
);
179-
180-
var out = {
181-
tests: tests.filter(function (test) {
182-
return !test.fragmentContext; // TODO
183-
}).map(function (test) {
184-
var input = test.input;
185-
186-
return {
187-
description: testUtils.addSlashes(input),
188-
input: input,
189-
output: collectParserTokens(input)
190-
};
191-
})
192-
};
193-
194-
file.contents = new Buffer(JSON.stringify(out, null, 4));
98+
var test = generateParserFeedbackTest(file.contents.toString());
99+
100+
file.contents = new Buffer(test);
195101

196102
callback(null, file);
197103
}))
198104
.pipe(rename({extname: '.test'}))
199105
.pipe(gulp.dest('test/data/parser_feedback'));
200106
});
201107

202-
gulp.task('test', ['lint'], function () {
203-
return gulp
204-
.src('test/fixtures/*_test.js')
205-
.pipe(mocha({
206-
ui: 'exports',
207-
reporter: 'progress',
208-
timeout: typeof v8debug === 'undefined' ? 20000 : Infinity // NOTE: disable timeouts in debug
209-
}));
108+
109+
// Regenerates lib/tokenizer/named_entity_data.js: downloads entities.json from
// the WHATWG HTML spec site, parses it as JSON, replaces the file contents with
// the output of generateNamedEntityData(), renames the file and writes it to
// lib/tokenizer.
gulp.task('update-named-entities-data', function () {
110+
return download('https://html.spec.whatwg.org/multipage/entities.json')
111+
.pipe(through.obj(function (file, encoding, callback) {
112+
// The downloaded payload is parsed as JSON before being handed to the generator.
var entitiesData = JSON.parse(file.contents.toString());
113+
114+
// The generator's return value becomes the new contents of the streamed file.
file.contents = new Buffer(generateNamedEntityData(entitiesData));
115+
116+
callback(null, file);
117+
}))
118+
.pipe(rename('named_entity_data.js'))
119+
.pipe(gulp.dest('lib/tokenizer'));
210120
});

lib/tokenizer/index.js

Lines changed: 63 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
var Preprocessor = require('./preprocessor'),
44
locationInfoMixin = require('../location_info/tokenizer_mixin'),
55
UNICODE = require('../common/unicode'),
6-
NAMED_ENTITY_TRIE = require('./named_entity_trie');
6+
neTree = require('./named_entity_data');
77

88
//Aliases
99
var $ = UNICODE.CODE_POINTS,
@@ -18,6 +18,13 @@ var NUMERIC_ENTITY_REPLACEMENTS = {
1818
0x9A: 0x0161, 0x9B: 0x203A, 0x9C: 0x0153, 0x9D: 0x009D, 0x9E: 0x017E, 0x9F: 0x0178
1919
};
2020

21+
// Named entity tree flags
// Bit flags tested against entries of the `neTree` array by
// `_consumeNamedEntity` to interpret a node marker.
22+
// Set when the node is followed by reference data to read.
var HAS_DATA_FLAG = 1 << 0;
23+
// Set when that data is two entries long; otherwise a single entry is read.
var DATA_DUPLET_FLAG = 1 << 1;
24+
// Set when the node has branches to look up via `findNamedEntityTreeBranch`.
var HAS_BRANCHES_FLAG = 1 << 2;
25+
// All three flags OR-ed together (7). `_consumeNamedEntity` treats an entry as
// a node marker when it is strictly less than this value.
// NOTE(review): with a strict `<`, a marker that has all three bits set would
// not be recognized as a node -- confirm the generated data never produces it.
var MAX_BRANCH_MARKER_VALUE = HAS_DATA_FLAG | DATA_DUPLET_FLAG | HAS_BRANCHES_FLAG;
26+
27+
2128
//States
2229
var DATA_STATE = 'DATA_STATE',
2330
CHARACTER_REFERENCE_IN_DATA_STATE = 'CHARACTER_REFERENCE_IN_DATA_STATE',
@@ -142,6 +149,29 @@ function toAsciiLowerChar(cp) {
142149
return String.fromCharCode(toAsciiLowerCodePoint(cp));
143150
}
144151

152+
// Looks up the branch of a named entity tree node that matches code point `cp`.
//
// nodeIx - index in `neTree` of the node marker; the entry right after it is
//          read as the branch count, and the run of `branchCount` entries
//          after that is searched for `cp`.
// cp     - code point to look for.
//
// Returns the `neTree` entry located `branchCount` slots after the matching
// code point (the caller uses it as the next index into `neTree`), or -1 when
// no entry in the run equals `cp`.
function findNamedEntityTreeBranch(nodeIx, cp) {
153+
var branchCount = neTree[++nodeIx],
154+
lo = ++nodeIx,
155+
hi = lo + branchCount - 1;
156+
157+
// Binary search over the code point run.
// NOTE(review): presumes the run is sorted in ascending order -- verify
// against the data generator.
while (lo <= hi) {
158+
// `+` binds tighter than `>>>`, so this is the floored midpoint (lo + hi) / 2.
var mid = lo + hi >>> 1,
159+
midCp = neTree[mid];
160+
161+
if (midCp < cp)
162+
lo = mid + 1;
163+
164+
else if (midCp > cp)
165+
hi = mid - 1;
166+
167+
else
168+
// The value paired with a code point sits `branchCount` entries after it.
return neTree[mid + branchCount];
169+
}
170+
171+
return -1;
172+
}
173+
174+
145175
//Tokenizer
146176
var Tokenizer = module.exports = function (options) {
147177
this.preprocessor = new Preprocessor();
@@ -482,33 +512,48 @@ Tokenizer.prototype._consumeNumericEntity = function (isHex) {
482512
return referencedCp;
483513
};
484514

485-
Tokenizer.prototype._consumeNamedEntity = function (startCp, inAttr) {
515+
// NOTE: for the details on this algorithm see
516+
// https://github.com/inikulin/parse5/tree/master/scripts/generate_named_entity_data/README.md
517+
Tokenizer.prototype._consumeNamedEntity = function (inAttr) {
486518
var referencedCodePoints = null,
487-
entityCodePointsCount = 0,
488-
cp = startCp,
489-
leaf = NAMED_ENTITY_TRIE[cp],
490-
consumedCount = 1,
519+
referenceSize = 0,
520+
cp = null,
521+
consumedCount = 0,
491522
semicolonTerminated = false;
492523

493-
for (; leaf && cp !== $.EOF; cp = this._consume(), consumedCount++, leaf = leaf.l && leaf.l[cp]) {
494-
if (leaf.c) {
495-
//NOTE: we have at least one named reference match. But we don't stop lookup at this point,
496-
//because longer matches still can be found (e.g. '&not' and '&notin;') except the case
497-
//when the found match is terminated by a semicolon.
498-
referencedCodePoints = leaf.c;
499-
entityCodePointsCount = consumedCount;
524+
for (var i = 0; i > -1;) {
525+
var current = neTree[i],
526+
inNode = current < MAX_BRANCH_MARKER_VALUE,
527+
nodeWithData = inNode && current & HAS_DATA_FLAG;
528+
529+
if (nodeWithData) {
530+
referencedCodePoints = current & DATA_DUPLET_FLAG ? [neTree[++i], neTree[++i]] : [neTree[++i]];
531+
referenceSize = consumedCount;
500532

501533
if (cp === $.SEMICOLON) {
502534
semicolonTerminated = true;
503535
break;
504536
}
505537
}
538+
539+
cp = this._consume();
540+
consumedCount++;
541+
542+
if (cp === $.EOF)
543+
break;
544+
545+
if (inNode)
546+
i = current & HAS_BRANCHES_FLAG ? findNamedEntityTreeBranch(i, cp) : -1;
547+
548+
else
549+
i = cp === current ? ++i : -1;
506550
}
507551

552+
508553
if (referencedCodePoints) {
509554
if (!semicolonTerminated) {
510555
//NOTE: unconsume excess (e.g. 'it' in '&notit')
511-
this._unconsumeSeveral(consumedCount - entityCodePointsCount);
556+
this._unconsumeSeveral(consumedCount - referenceSize);
512557

513558
//NOTE: If the character reference is being consumed as part of an attribute and the next character
514559
//is either a U+003D EQUALS SIGN character (=) or an alphanumeric ASCII character, then, for historical
@@ -521,7 +566,7 @@ Tokenizer.prototype._consumeNamedEntity = function (startCp, inAttr) {
521566
var nextCp = this._lookahead();
522567

523568
if (nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp)) {
524-
this._unconsumeSeveral(entityCodePointsCount);
569+
this._unconsumeSeveral(referenceSize);
525570
return null;
526571
}
527572
}
@@ -565,7 +610,9 @@ Tokenizer.prototype._consumeCharacterReference = function (startCp, inAttr) {
565610
return null;
566611
}
567612

568-
return this._consumeNamedEntity(startCp, inAttr);
613+
this._unconsume();
614+
615+
return this._consumeNamedEntity(inAttr);
569616
};
570617

571618
//State machine

lib/tokenizer/named_entity_data.js

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/tokenizer/named_entity_trie.js

Lines changed: 0 additions & 6 deletions
This file was deleted.

0 commit comments

Comments
 (0)