Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 850077d

Browse files
author
Ethan Pailes
committed Sep 15, 2018
Add function to determine if a regex is onepass.
This patch adds the analysis function `is_onepass`, found in `analysis.rs`, which is required in order to determine if a particular regex can be executed using the onepass DFA. A regex is said to be onepass iff there are no non-deterministic splits in it. An example of non-determinism in a regex is `/alex|apple/`. Here we can't know which branch to take because both of them start with `a`. A more subtle example is `/(?:alex)*apple/`. After every iteration of the Kleene star, we might branch back to `alex` or continue on to `apple`.
1 parent cd6f1e2 commit 850077d

File tree

4 files changed

+573
-0
lines changed

4 files changed

+573
-0
lines changed
 

‎regex-syntax/src/hir/interval.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,11 @@ impl<I: Interval> IntervalSet<I> {
309309
}
310310
true
311311
}
312+
313+
/// Returns true iff this set contains no intervals.
314+
pub fn is_empty(&self) -> bool {
315+
self.ranges.is_empty()
316+
}
312317
}
313318

314319
/// An iterator over intervals.

‎regex-syntax/src/hir/mod.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,16 @@ impl ClassUnicode {
797797
pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
798798
self.set.symmetric_difference(&other.set);
799799
}
800+
801+
/// Returns true iff this character class contains no characters.
802+
///
803+
/// This should never be true for a character class which was
804+
/// constructed by the regex parser, but a notion of character
805+
/// class emptiness can be useful for code that wants to
806+
/// programmatically generate character classes.
807+
pub fn is_empty(&self) -> bool {
808+
self.set.is_empty()
809+
}
800810
}
801811

802812
/// An iterator over all ranges in a Unicode character class.
@@ -998,6 +1008,16 @@ impl ClassBytes {
9981008
pub fn is_all_ascii(&self) -> bool {
9991009
self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
10001010
}
1011+
1012+
/// Returns true iff this character class contains no characters.
1013+
///
1014+
/// This should never be true for a character class which was
1015+
/// constructed by the regex parser, but a notion of character
1016+
/// class emptiness can be useful for code that wants to
1017+
/// programmatically generate character classes.
1018+
pub fn is_empty(&self) -> bool {
1019+
self.set.is_empty()
1020+
}
10011021
}
10021022

10031023
/// An iterator over all ranges in a byte character class.

‎src/analysis.rs

Lines changed: 547 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,547 @@
1+
use syntax::hir::{
2+
Hir, HirKind, Literal, ClassBytes, ClassBytesRange,
3+
Class, Visitor, RepetitionRange, RepetitionKind
4+
};
5+
use syntax::hir;
6+
use utf8_ranges::Utf8Sequences;
7+
8+
/// True iff the given expression is one-pass
9+
pub fn is_onepass(expr: &Hir) -> bool {
10+
hir::visit(expr, IsOnePassVisitor::new()).unwrap()
11+
}
12+
13+
/// Visitor state for the one-pass analysis.
///
/// The wrapped flag starts out `true` and is cleared by the checks as soon
/// as one of them finds a reason the expression cannot be one-pass.
struct IsOnePassVisitor(bool);
14+
15+
impl Visitor for IsOnePassVisitor {
16+
type Output = bool;
17+
type Err = ();
18+
19+
fn finish(self) -> Result<bool, ()> {
20+
Ok(self.0)
21+
}
22+
23+
fn visit_pre(&mut self, hir: &Hir) -> Result<(), ()> {
24+
if !self.0 {
25+
return Ok(())
26+
}
27+
28+
match hir.kind() {
29+
&HirKind::Concat(ref es) => self.check_concat(&es),
30+
&HirKind::Alternation(ref es) => self.check_alternation(&es),
31+
&HirKind::Repetition(ref rep) => {
32+
if fset_of(&*rep.hir).is_empty() {
33+
self.0 = false;
34+
}
35+
}
36+
&HirKind::Class(ref cls) => self.check_cls(cls),
37+
_ => ()
38+
}
39+
40+
Ok(())
41+
}
42+
}
43+
44+
impl IsOnePassVisitor {
45+
fn new() -> Self {
46+
IsOnePassVisitor(true)
47+
}
48+
49+
fn check_concat(&mut self, es: &[Hir]) {
50+
let mut empty_run = vec![];
51+
52+
for e in NestedConcat::new(es) {
53+
// TODO(ethan):yakshaving factor the determination of when
54+
// a regex accepts_empty out into a separate function,
55+
// so that we don't compute the whole first set when we
56+
// don't need to.
57+
let fset = fset_of(e);
58+
let is_rep = match e.kind() {
59+
&HirKind::Repetition(_) => true,
60+
_ => false,
61+
};
62+
63+
empty_run.push(e);
64+
if !(fset.accepts_empty || is_rep) {
65+
// this is the last one in the run
66+
break;
67+
}
68+
}
69+
70+
if empty_run.len() > 0 {
71+
self.0 = self.0 && !fsets_clash(&empty_run);
72+
}
73+
}
74+
75+
fn check_alternation(&mut self, es: &[Hir]) {
76+
self.0 = self.0 && !fsets_clash(&es.iter().collect::<Vec<_>>());
77+
}
78+
79+
// Unicode classes are really big alternatives from the byte
80+
// oriented point of view.
81+
//
82+
// This function translates a unicode class into the
83+
// byte space and checks for intersecting first sets.
84+
fn check_cls(&mut self, cls: &Class) {
85+
match cls {
86+
&Class::Unicode(ref ucls) => {
87+
let mut seen_char: [bool; 256] = [false; 256];
88+
89+
for cr in ucls.iter() {
90+
for br in Utf8Sequences::new(cr.start(), cr.end()) {
91+
let first = br.as_slice()[0];
92+
for b in first.start..(first.end+1) {
93+
if seen_char[b as usize] {
94+
self.0 = false;
95+
return;
96+
}
97+
seen_char[b as usize] = true;
98+
}
99+
}
100+
}
101+
}
102+
_ => {} // FALLTHROUGH
103+
}
104+
}
105+
106+
}
107+
108+
/// Check if a list of first sets is incompatable.
109+
///
110+
/// O(n^2), but n will usually be quite small.
111+
fn fsets_clash(es: &[&Hir]) -> bool {
112+
for (i, e1) in es.iter().enumerate() {
113+
for (j, e2) in es.iter().enumerate() {
114+
if i != j {
115+
let mut fset = fset_of(e1);
116+
let fset2 = fset_of(e2);
117+
118+
// For the regex /a|()+/, we don't have a way to
119+
// differentiate the branches, so we are not onepass.
120+
//
121+
// We might be able to loosen this restriction by
122+
// considering the expression after the alternative
123+
// if there is one.
124+
if fset.is_empty() || fset2.is_empty() {
125+
return true;
126+
}
127+
128+
fset.intersect(&fset2);
129+
if ! fset.is_empty() {
130+
return true;
131+
}
132+
}
133+
}
134+
}
135+
false
136+
}
137+
138+
139+
/// Compute the first set of a given regular expression.
140+
///
141+
/// The first set of a regular expression is the set of all characters
142+
/// which might begin it. This is a less general version of the
143+
/// notion of a regular expression preview (the first set can be
144+
/// thought of as the 1-preview of a regular expression).
145+
///
146+
/// Note that first sets are byte-oriented because the DFA is
147+
/// byte oriented. This means an expression like /Δ|δ/ is actually not
148+
/// one-pass, even though there is clearly no non-determinism inherent
149+
/// to the regex at a unicode code point level (big delta and little
150+
/// delta start with the same byte).
151+
fn fset_of(expr: &Hir) -> FirstSet {
152+
fn singleton(b: u8) -> FirstSet {
153+
let mut f = FirstSet::empty();
154+
f.push_bytes(ClassBytesRange::new(b, b));
155+
f
156+
}
157+
158+
match expr.kind() {
159+
&HirKind::Empty => FirstSet::epsilon(),
160+
&HirKind::Literal(ref lit) => {
161+
match lit {
162+
&Literal::Unicode(c) => singleton(first_byte(c)),
163+
&Literal::Byte(b) => singleton(b),
164+
}
165+
}
166+
&HirKind::Class(ref class) => {
167+
match class {
168+
&Class::Unicode(ref c) => {
169+
// Get all the bytes which might begin this unicode
170+
// class.
171+
let mut cb = FirstSet::empty();
172+
for cr in c.iter() {
173+
for br in Utf8Sequences::new(cr.start(), cr.end()) {
174+
let first = br.as_slice()[0];
175+
cb.push_bytes(
176+
ClassBytesRange::new(first.start, first.end));
177+
}
178+
}
179+
cb
180+
}
181+
&Class::Bytes(ref b) =>
182+
FirstSet::new(b.iter().map(|x| *x), false),
183+
}
184+
}
185+
186+
// When an empty look (Anchor or WordBoundary) is at the start of
187+
// a concatenation, we conservatively assume that the assertion
188+
// will pass, so we just drop it. Then we can only get to this
189+
// point if we are dealing with some sort of naked empty look.
190+
// For now we just do the most conservative thing and say
191+
// that such an emptylook could potentially match on any character.
192+
&HirKind::Anchor(_) | &HirKind::WordBoundary(_) => FirstSet::anychar(),
193+
194+
&HirKind::Repetition(ref rep) => {
195+
let mut f = fset_of(&*rep.hir);
196+
match rep.kind {
197+
RepetitionKind::ZeroOrOne => f.accepts_empty = true,
198+
RepetitionKind::ZeroOrMore => f.accepts_empty = true,
199+
RepetitionKind::OneOrMore => {},
200+
RepetitionKind::Range(ref range) => {
201+
match range {
202+
&RepetitionRange::Exactly(0)
203+
| &RepetitionRange::AtLeast(0)
204+
| &RepetitionRange::Bounded(0, _) =>
205+
f.accepts_empty = true,
206+
_ => {}
207+
}
208+
}
209+
}
210+
f
211+
},
212+
&HirKind::Group(ref group) => fset_of(&group.hir),
213+
214+
// The most involved case. We need to strip leading empty-looks
215+
// as well as take the union of the first sets of the first n+1
216+
// expressions where n is the number of leading repetitions.
217+
&HirKind::Concat(ref es) => {
218+
let mut fset = FirstSet::empty();
219+
for (i, e) in es.iter().enumerate() {
220+
match e.kind() {
221+
&HirKind::Anchor(_) | &HirKind::WordBoundary(_) => {
222+
// Ignore any leading emptylooks, but any in tail
223+
// position have to be accounted for.
224+
if i == es.len() - 1 {
225+
fset.union(&FirstSet::anychar());
226+
}
227+
}
228+
_ => {
229+
let inner_fset = fset_of(e);
230+
fset.union(&inner_fset);
231+
232+
if !inner_fset.accepts_empty() {
233+
// We can stop accumulating after we stop seeing
234+
// first sets which contain epsilon.
235+
// Also, a contatination which terminated by
236+
// one or more expressions which do not accept
237+
// epsilon itself does not acceept epsilon.
238+
fset.accepts_empty = false;
239+
break;
240+
}
241+
}
242+
}
243+
}
244+
fset
245+
}
246+
&HirKind::Alternation(ref es) => {
247+
let mut fset = FirstSet::empty();
248+
for e in es {
249+
fset.union(&fset_of(e));
250+
}
251+
fset
252+
}
253+
}
254+
}
255+
256+
/// The first byte of a unicode code point.
///
/// Only the leading byte of the UTF-8 encoding of `c` is returned,
/// because the onepass DFA is implemented in the byte space, not the
/// character space. This means, for example, that a branch between
/// lowercase delta and uppercase delta is actually non-deterministic.
fn first_byte(c: char) -> u8 {
    // Four bytes is the longest UTF-8 encoding of a single `char`.
    let mut scratch = [0u8; 4];
    let encoded = c.encode_utf8(&mut scratch);
    encoded.as_bytes()[0]
}
267+
268+
/// A representation of all the possible ways a word in the language
269+
/// of a regex could begin. ClassBytes has no way to express the empty
270+
/// string, so we add an extra flag to indicate if a FirstSet includes
271+
/// epsilon. Put in a more theoretical way all firstsets are subsets of
272+
/// SIGMA `union` { epsilon }.
273+
#[derive(Debug, PartialEq, Eq)]
274+
struct FirstSet {
275+
bytes: ClassBytes,
276+
pub accepts_empty: bool,
277+
}
278+
279+
impl FirstSet {
280+
fn empty() -> Self {
281+
FirstSet {
282+
bytes: ClassBytes::empty(),
283+
accepts_empty: false,
284+
}
285+
}
286+
287+
pub fn new<I>(ranges: I, accepts_empty: bool) -> Self
288+
where I: IntoIterator<Item=ClassBytesRange>
289+
{
290+
FirstSet {
291+
bytes: ClassBytes::new(ranges),
292+
accepts_empty: accepts_empty,
293+
}
294+
}
295+
296+
fn anychar() -> FirstSet {
297+
let mut f = FirstSet::empty();
298+
f.push_bytes(ClassBytesRange::new(b'\0', b'\xFF'));
299+
f
300+
}
301+
302+
fn epsilon() -> FirstSet {
303+
FirstSet {
304+
bytes: ClassBytes::empty(),
305+
accepts_empty: true,
306+
}
307+
}
308+
309+
fn push_bytes(&mut self, byte_range: ClassBytesRange) {
310+
self.bytes.push(byte_range);
311+
}
312+
313+
fn union(&mut self, other: &FirstSet) {
314+
self.bytes.union(&other.bytes);
315+
self.accepts_empty = self.accepts_empty || other.accepts_empty;
316+
}
317+
318+
fn intersect(&mut self, other: &FirstSet) {
319+
self.bytes.intersect(&other.bytes);
320+
self.accepts_empty = self.accepts_empty && other.accepts_empty;
321+
}
322+
323+
fn is_empty(&self) -> bool {
324+
self.bytes.is_empty() && !self.accepts_empty
325+
}
326+
327+
fn accepts_empty(&self) -> bool {
328+
self.accepts_empty
329+
}
330+
}
331+
332+
/// An iterator over a concatenation of expressions which
333+
/// drills down into other embedded concatenations.
334+
struct NestedConcat<'a>(Vec<(&'a [Hir], usize)>);
335+
impl<'a> NestedConcat<'a> {
336+
fn new(es: &'a [Hir]) -> Self {
337+
NestedConcat(vec![(es, 0)])
338+
}
339+
}
340+
impl<'a> Iterator for NestedConcat<'a> {
341+
type Item = &'a Hir;
342+
343+
fn next(&mut self) -> Option<&'a Hir> {
344+
if self.0.len() == 0 {
345+
return None;
346+
}
347+
348+
let tip = self.0.len() - 1;
349+
let (es, idx) = self.0[tip];
350+
351+
if idx >= es.len() {
352+
self.0.pop();
353+
return self.next();
354+
}
355+
356+
self.0[tip].1 += 1;
357+
358+
match es[idx].kind() {
359+
&HirKind::Concat(ref es) => {
360+
self.0.push((es, 0));
361+
self.next()
362+
}
363+
_ => Some(&es[idx]),
364+
}
365+
}
366+
}
367+
368+
#[cfg(test)]
mod tests {
    use syntax::Parser;
    use syntax::hir::Hir;
    use super::*;

    /// Parses `pattern` with a default parser, panicking on a syntax error.
    fn parse(pattern: &str) -> Hir {
        Parser::new().parse(pattern).unwrap()
    }

    /// True iff the first sets of the two expressions have a member in
    /// common.
    fn is_intersecting_fset(e1: &Hir, e2: &Hir) -> bool {
        let mut overlap = fset_of(e1);
        let other = fset_of(e2);
        overlap.intersect(&other);
        !overlap.is_empty()
    }

    //
    // First Set intersection smoke tests
    //

    #[test]
    fn fset_lit() {
        let (e1, e2, e3) = (parse("a"), parse("a"), parse("b"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_class() {
        let (e1, e2, e3) = (parse("[a]"), parse("[a]"), parse("[b]"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_class_n() {
        let (e1, e2, e3) =
            (parse("[xamn]"), parse("[rlwa]"), parse("[bcq]"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_alt() {
        let (e1, e2, e3) =
            (parse("ab|bc|ad"), parse("yyyy|am|zz"), parse("cc|ww"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_group() {
        let (e1, e2, e3) =
            (parse("(?:ab)"), parse("(?:aq)"), parse("(?:m)"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_concat() {
        let (e1, e2, e3) =
            (parse("aa(?:nb)"), parse("aa(?:rq)"), parse("bb(?:m)"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_word_boundary_dropped() {
        let (e1, e2, e3) = (parse(r"aa"), parse(r"\baa"), parse(r"\bbb"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_word_boundary_all() {
        let (e1, e2) = (parse(r"aa"), parse(r"\b"));

        assert!(is_intersecting_fset(&e1, &e2));
    }

    #[test]
    fn fset_not_word_boundary_dropped() {
        let (e1, e2, e3) = (parse(r"aa"), parse(r"\Baa"), parse(r"\Bbb"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_not_word_boundary_all() {
        let (e1, e2) = (parse(r"aa"), parse(r"\B"));

        assert!(is_intersecting_fset(&e1, &e2));
    }

    #[test]
    fn fset_start_anchor_dropped() {
        let (e1, e2, e3) = (parse(r"aa"), parse(r"^aa"), parse(r"^bb"));

        assert!(is_intersecting_fset(&e1, &e2));
        assert!(!is_intersecting_fset(&e1, &e3));
    }

    #[test]
    fn fset_terminal_emptylook_all() {
        let e = parse(r"a*\b");

        // Every byte plus epsilon.
        let mut total_accept = FirstSet::anychar();
        total_accept.accepts_empty = true;

        assert_eq!(total_accept, fset_of(&e));
    }

    #[test]
    fn fset_empty_alt() {
        let (e1, e2) = (parse(r"(?:a|())b"), parse(r"b"));

        assert!(is_intersecting_fset(&e1, &e2));
    }

    //
    // onepass smoke tests
    //

    // This test is pulled right from some of Russ Cox's
    // comments on onepass regex.
    //
    // Note that Russ Cox's other example of a onepass regex
    // (r"(\d+)-(\d+)") is actually not onepass for us because
    // there is byte-level nondeterminism in the \d character
    // class, and we care about things in the byte space rather
    // than the character space. If you do a onepass engine at
    // the character level, Cox's example is indeed onepass.
    #[test]
    fn is_onepass_smoke_test1() {
        let (e1, e2) = (parse(r"([^x]*)x(.*)"), parse(r"(.*)x(.*)"));

        assert!(is_onepass(&e1));
        assert!(!is_onepass(&e2));
    }

    #[test]
    fn is_onepass_empty_alt() {
        let (e1, e2) = (parse(r"(a|())b"), parse(r"(a|())a"));

        assert!(is_onepass(&e1));
        assert!(!is_onepass(&e2));
    }

    #[test]
    fn is_onepass_rep() {
        let (e1, e2) = (parse(r"a+a"), parse(r"a*a"));

        assert!(!is_onepass(&e1));
        assert!(!is_onepass(&e2));
    }
}

‎src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,6 +668,7 @@ mod re_unicode;
668668
mod sparse;
669669
#[cfg(any(regex_runtime_teddy_ssse3, regex_runtime_teddy_avx2))]
670670
mod vector;
671+
mod analysis;
671672

672673
/// The `internal` module exists to support suspicious activity, such as
673674
/// testing different matching engines and supporting the `regex-debug` CLI

0 commit comments

Comments
 (0)
Please sign in to comment.