Skip to content

Commit 8d95086

Browse files
davidben and marcoscaceres
authored and committed
cpp: Fully support C++11 raw strings. (#1897)
See https://en.cppreference.com/w/cpp/language/string_literal for the syntax. This requires a fix in highlight.js itself. mode.terminators joins each node's begin regexps with |. This breaks if one of the begin regexps has backreferences. Backreferences count capturing parenthesized groups, and adding new groups in front will change that count. Thus far, the only language that uses backreferences is Rust (also for raw strings), which happens to be the first in the list and avoids this bug. C++ cannot as easily avoid this because, even were raw strings the first option in STRINGS, STRINGS itself is included in other lists. Rather than carefully order things, rewrite the regular expressions to fix the backreferences.
1 parent 5b1b86c commit 8d95086

File tree

4 files changed

+104
-17
lines changed

4 files changed

+104
-17
lines changed

src/highlight.js

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,47 @@ https://highlightjs.org/
237237
);
238238
}
239239

240+
// joinRe logically computes regexps.join(separator), but fixes the
241+
// backreferences so they continue to match.
242+
function joinRe(regexps, separator) {
243+
// backreferenceRe matches an open parenthesis or backreference. To avoid
244+
// an incorrect parse, it additionally matches the following:
245+
// - [...] elements, where the meaning of parentheses and escapes change
246+
// - other escape sequences, so we do not misparse escape sequences as
247+
// interesting elements
248+
// - non-matching or lookahead parentheses, which do not capture. These
249+
// follow the '(' with a '?'.
250+
var backreferenceRe = /\[(?:[^\\\]]|\\.)*\]|\(\??|\\([1-9][0-9]*)|\\./;
251+
var numCaptures = 0;
252+
var ret = '';
253+
for (var i = 0; i < regexps.length; i++) {
254+
var offset = numCaptures;
255+
var re = reStr(regexps[i]);
256+
if (i > 0) {
257+
ret += separator;
258+
}
259+
while (re.length > 0) {
260+
var match = backreferenceRe.exec(re);
261+
if (match == null) {
262+
ret += re;
263+
break;
264+
}
265+
ret += re.substring(0, match.index);
266+
re = re.substring(match.index + match[0].length);
267+
if (match[0][0] == '\\' && match[1]) {
268+
// Adjust the backreference.
269+
ret += '\\' + String(Number(match[1]) + offset);
270+
} else {
271+
ret += match[0];
272+
if (match[0] == '(') {
273+
numCaptures++;
274+
}
275+
}
276+
}
277+
}
278+
return ret;
279+
}
280+
240281
function compileMode(mode, parent) {
241282
if (mode.compiled)
242283
return;
@@ -302,12 +343,12 @@ https://highlightjs.org/
302343

303344
var terminators =
304345
mode.contains.map(function(c) {
305-
return c.beginKeywords ? '\\.?(' + c.begin + ')\\.?' : c.begin;
346+
return c.beginKeywords ? '\\.?(?:' + c.begin + ')\\.?' : c.begin;
306347
})
307348
.concat([mode.terminator_end, mode.illegal])
308349
.map(reStr)
309350
.filter(Boolean);
310-
mode.terminators = terminators.length ? langRe(terminators.join('|'), true) : {exec: function(/*s*/) {return null;}};
351+
mode.terminators = terminators.length ? langRe(joinRe(terminators, '|'), true) : {exec: function(/*s*/) {return null;}};
311352
}
312353

313354
compileMode(language);

src/languages/cpp.js

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,7 @@ function(hljs) {
1919
illegal: '\\n',
2020
contains: [hljs.BACKSLASH_ESCAPE]
2121
},
22-
{
23-
// TODO: This does not handle raw string literals with prefixes. Using
24-
// a single regex with backreferences would work (note to use *?
25-
// instead of * to make it non-greedy), but the mode.terminators
26-
// computation in highlight.js breaks the counting.
27-
begin: '(u8?|U|L)?R"\\(', end: '\\)"',
28-
},
22+
{ begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\((?:.|\n)*?\)\1"/ },
2923
{
3024
begin: '\'\\\\?.', end: '\'',
3125
illegal: '.'

test/markup/cpp/string-literals.expect.txt

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,47 @@
1010
<span class="hljs-comment">// Raw string literals (multiline)</span>
1111
<span class="hljs-keyword">auto</span> char_multi = <span class="hljs-string">R"(Hello
1212
"normal"
13-
muliline
13+
multiline
1414
string.)"</span>;
1515
<span class="hljs-keyword">auto</span> utf8_multi = <span class="hljs-string">u8R"(Hello
1616
"utf-8"
17-
muliline
17+
multiline
1818
string)"</span>;
1919
<span class="hljs-keyword">auto</span> utf16_multi = <span class="hljs-string">uR"(Hello
2020
"utf-16"
21-
muliline
21+
multiline
2222
string)"</span>;
2323
<span class="hljs-keyword">auto</span> utf32_multi = <span class="hljs-string">UR"(Hello
2424
"utf-32"
25-
muliline
25+
multiline
2626
string)"</span>;
2727

28+
<span class="hljs-comment">// Raw string literals with delimiter (multiline)</span>
29+
<span class="hljs-keyword">auto</span> char_multi = <span class="hljs-string">R"blah1(Hello
30+
"normal"
31+
multiline
32+
)"
33+
)blah"
34+
string.)blah1"</span>;
35+
<span class="hljs-keyword">auto</span> utf8_multi = <span class="hljs-string">u8R"blah2(Hello
36+
"utf-8"
37+
multiline
38+
)"
39+
)blah"
40+
string)blah2"</span>;
41+
<span class="hljs-keyword">auto</span> utf16_multi = <span class="hljs-string">uR"blah3(Hello
42+
"utf-16"
43+
multiline
44+
)"
45+
)blah"
46+
string)blah3"</span>;
47+
<span class="hljs-keyword">auto</span> utf32_multi = <span class="hljs-string">UR"blah4(Hello
48+
"utf-32"
49+
multiline
50+
)"
51+
)blah"
52+
string)blah4"</span>;
53+
2854
<span class="hljs-comment">// Meta strings</span>
2955
<span class="hljs-meta">#<span class="hljs-meta-keyword">include</span> <span class="hljs-meta-string">&lt;stdio&gt;</span></span>
3056
<span class="hljs-meta">#<span class="hljs-meta-keyword">include</span> <span class="hljs-meta-string">"lib.h"</span></span>

test/markup/cpp/string-literals.txt

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,47 @@ auto wide_char = L"Hello wchar_t string";
1010
// Raw string literals (multiline)
1111
auto char_multi = R"(Hello
1212
"normal"
13-
muliline
13+
multiline
1414
string.)";
1515
auto utf8_multi = u8R"(Hello
1616
"utf-8"
17-
muliline
17+
multiline
1818
string)";
1919
auto utf16_multi = uR"(Hello
2020
"utf-16"
21-
muliline
21+
multiline
2222
string)";
2323
auto utf32_multi = UR"(Hello
2424
"utf-32"
25-
muliline
25+
multiline
2626
string)";
2727

28+
// Raw string literals with delimiter (multiline)
29+
auto char_multi = R"blah1(Hello
30+
"normal"
31+
multiline
32+
)"
33+
)blah"
34+
string.)blah1";
35+
auto utf8_multi = u8R"blah2(Hello
36+
"utf-8"
37+
multiline
38+
)"
39+
)blah"
40+
string)blah2";
41+
auto utf16_multi = uR"blah3(Hello
42+
"utf-16"
43+
multiline
44+
)"
45+
)blah"
46+
string)blah3";
47+
auto utf32_multi = UR"blah4(Hello
48+
"utf-32"
49+
multiline
50+
)"
51+
)blah"
52+
string)blah4";
53+
2854
// Meta strings
2955
#include <stdio>
3056
#include "lib.h"

0 commit comments

Comments
 (0)