From 9f1a5e13d60497041ccd04455b415cd4a8682ffd Mon Sep 17 00:00:00 2001 From: Robin Date: Wed, 5 Mar 2025 12:42:43 +0100 Subject: [PATCH 01/68] Add lookaround expressions to HIR This is the first step to supporting captureless lookbehind assertions --- regex-automata/src/meta/reverse_inner.rs | 2 + regex-automata/src/nfa/thompson/compiler.rs | 1 + regex-syntax/src/hir/literal.rs | 4 +- regex-syntax/src/hir/mod.rs | 56 +++++++++++++++++++++ regex-syntax/src/hir/print.rs | 9 +++- 5 files changed, 70 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/reverse_inner.rs b/regex-automata/src/meta/reverse_inner.rs index 3d78779f6..b236cf2e1 100644 --- a/regex-automata/src/meta/reverse_inner.rs +++ b/regex-automata/src/meta/reverse_inner.rs @@ -170,6 +170,7 @@ fn top_concat(mut hir: &Hir) -> Option> { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) + | HirKind::Lookaround(_) | HirKind::Repetition(_) | HirKind::Alternation(_) => return None, HirKind::Capture(hir::Capture { ref sub, .. }) => sub, @@ -206,6 +207,7 @@ fn flatten(hir: &Hir) -> Hir { HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), HirKind::Class(ref x) => Hir::class(x.clone()), HirKind::Look(ref x) => Hir::look(x.clone()), + HirKind::Lookaround(ref x) => Hir::lookaround(x.clone()), HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), // This is the interesting case. We just drop the group information // entirely and use the child HIR itself. 
diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 569f60acd..95f9b18af 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1003,6 +1003,7 @@ impl Compiler { Class(Class::Bytes(ref c)) => self.c_byte_class(c), Class(Class::Unicode(ref c)) => self.c_unicode_class(c), Look(ref look) => self.c_look(look), + Lookaround(_) => todo!("implement lookaround NFA compilation"), Repetition(ref rep) => self.c_repetition(rep), Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 2a6350e64..e5b9fd29b 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,7 +172,9 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Empty | Look(_) | Lookaround(_) => { + Seq::singleton(self::Literal::exact(vec![])) + } Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6d57fe3fd..597bdd12b 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -373,6 +373,13 @@ impl Hir { Hir { kind: HirKind::Look(look), props } } + /// Creates a look-around subexpression HIR expression. + #[inline] + pub fn lookaround(lookaround: Lookaround) -> Hir { + let props = Properties::lookaround(&lookaround); + Hir { kind: HirKind::Lookaround(lookaround), props } + } + /// Creates a repetition HIR expression. #[inline] pub fn repetition(mut rep: Repetition) -> Hir { @@ -728,6 +735,8 @@ pub enum HirKind { Class(Class), /// A look-around assertion. A look-around match always has zero length. 
Look(Look), + /// A look-around subexpression + Lookaround(Lookaround), /// A repetition operation applied to a sub-expression. Repetition(Repetition), /// A capturing group, which contains a sub-expression. @@ -761,6 +770,7 @@ impl HirKind { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => &[], + HirKind::Lookaround(ref lookaround) => from_ref(lookaround.sub()), HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), HirKind::Concat(ref subs) => subs, @@ -1786,6 +1796,37 @@ impl Look { } } +/// Represents a general lookaround assertion +/// +/// Currently, only lookbehind assertions are supported. +/// Furthermore, capture groups inside assertions are not supported. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Lookaround { + /// A positive lookbehind assertion + PositiveLookBehind(Box), + /// A negative lookbehind assertion + NegativeLookBehind(Box), +} + +impl Lookaround { + /// Returns a reference to the inner expression that must match for this + /// lookaround assertion to hold. + pub fn sub(&self) -> &Hir { + match self { + Lookaround::PositiveLookBehind(sub) + | Lookaround::NegativeLookBehind(sub) => sub, + } + } + + /// Returns a mutable reference to the inner expression + pub fn sub_mut(&mut self) -> &mut Hir { + match self { + Lookaround::PositiveLookBehind(sub) + | Lookaround::NegativeLookBehind(sub) => sub, + } + } +} + /// The high-level intermediate representation for a capturing group. /// /// A capturing group always has an index and a child expression. 
It may @@ -1935,6 +1976,9 @@ impl Drop for Hir { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {} + HirKind::Lookaround(ref mut x) => { + stack.push(mem::replace(x.sub_mut(), Hir::empty())); + } HirKind::Capture(ref mut x) => { stack.push(mem::replace(&mut x.sub, Hir::empty())); } @@ -2499,6 +2543,18 @@ impl Properties { Properties(Box::new(inner)) } + fn lookaround(lookaround: &Lookaround) -> Properties { + let sub_p = lookaround.sub().properties(); + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + literal: false, + alternation_literal: false, + ..*sub_p.0.clone() + }; + Properties(Box::new(inner)) + } + /// Create a new set of HIR properties for a repetition. fn repetition(rep: &Repetition) -> Properties { let p = rep.sub.properties(); diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index dfa6d4032..547e579e9 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -227,6 +227,12 @@ impl Visitor for Writer { self.wtr.write_str(r"\b{end-half}")?; } }, + HirKind::Lookaround(hir::Lookaround::PositiveLookBehind(_)) => { + self.wtr.write_str(r"(?<=)")?; + } + HirKind::Lookaround(hir::Lookaround::NegativeLookBehind(_)) => { + self.wtr.write_str(r"(? { self.wtr.write_str("(")?; if let Some(ref name) = *name { @@ -293,7 +299,8 @@ impl Visitor for Writer { } HirKind::Capture(_) | HirKind::Concat(_) - | HirKind::Alternation(_) => { + | HirKind::Alternation(_) + | HirKind::Lookaround(_) => { self.wtr.write_str(r")")?; } } From e7c2584275adecbfcf30a918bc5030ee8b9086b9 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 20:48:09 +0100 Subject: [PATCH 02/68] Change how flatten works on hir::Lookaround The lack of recursing into the inner expression of a lookaround is correct under the current assumption that lookarounds cannot have capture groups. But once the restriction is lifted, this wrong implementation can be very subtle to find. 
Instead, we can already do the filtering and accept it being a no-op for now. --- regex-automata/src/meta/reverse_inner.rs | 4 +++- regex-syntax/src/hir/mod.rs | 29 ++++++++++++++++++------ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/regex-automata/src/meta/reverse_inner.rs b/regex-automata/src/meta/reverse_inner.rs index b236cf2e1..8d9099600 100644 --- a/regex-automata/src/meta/reverse_inner.rs +++ b/regex-automata/src/meta/reverse_inner.rs @@ -207,7 +207,9 @@ fn flatten(hir: &Hir) -> Hir { HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), HirKind::Class(ref x) => Hir::class(x.clone()), HirKind::Look(ref x) => Hir::look(x.clone()), - HirKind::Lookaround(ref x) => Hir::lookaround(x.clone()), + HirKind::Lookaround(ref x) => { + Hir::lookaround(x.with(flatten(x.sub()))) + } HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), // This is the interesting case. We just drop the group information // entirely and use the child HIR itself. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 597bdd12b..8a029a501 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1796,15 +1796,15 @@ impl Look { } } -/// Represents a general lookaround assertion +/// Represents a general lookaround assertion. /// /// Currently, only lookbehind assertions are supported. /// Furthermore, capture groups inside assertions are not supported. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Lookaround { - /// A positive lookbehind assertion + /// A positive lookbehind assertion. PositiveLookBehind(Box), - /// A negative lookbehind assertion + /// A negative lookbehind assertion. NegativeLookBehind(Box), } @@ -1813,16 +1813,31 @@ impl Lookaround { /// lookaround assertion to hold. 
pub fn sub(&self) -> &Hir { match self { - Lookaround::PositiveLookBehind(sub) - | Lookaround::NegativeLookBehind(sub) => sub, + Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { + sub + } } } /// Returns a mutable reference to the inner expression pub fn sub_mut(&mut self) -> &mut Hir { match self { - Lookaround::PositiveLookBehind(sub) - | Lookaround::NegativeLookBehind(sub) => sub, + Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { + sub + } + } + } + + /// Returns a new lookaround of the same kind, but with its + /// sub-expression replaced with the one given. + pub fn with(&self, sub: Hir) -> Lookaround { + match self { + Self::PositiveLookBehind(_) => { + Self::PositiveLookBehind(Box::new(sub)) + } + Self::NegativeLookBehind(_) => { + Self::NegativeLookBehind(Box::new(sub)) + } } } } From 96548c1fd2617e0655dcf36b7e2bbf56b8ea4bb6 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:08:52 +0100 Subject: [PATCH 03/68] Add hir::Lookaround to the visitor --- regex-syntax/src/hir/visitor.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index f30f0a163..a946d9ddb 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -83,6 +83,9 @@ enum Frame<'a> { /// A stack frame allocated just before descending into a capture's child /// node. Capture(&'a hir::Capture), + /// A stack frame allocated just before descending into a look-around's + /// child node. + LookAround(&'a hir::Lookaround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. 
Concat { @@ -162,6 +165,7 @@ impl<'a> HeapVisitor<'a> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), HirKind::Capture(ref x) => Some(Frame::Capture(x)), + HirKind::Lookaround(ref x) => Some(Frame::LookAround(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) @@ -180,6 +184,7 @@ impl<'a> HeapVisitor<'a> { match induct { Frame::Repetition(_) => None, Frame::Capture(_) => None, + Frame::LookAround(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -208,6 +213,7 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.sub, Frame::Capture(capture) => &capture.sub, + Frame::LookAround(lookaround) => &lookaround.sub(), Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } From 23764ce1c2e6b0dd2c06bd0aa5904e9f20bcfc5e Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:09:12 +0100 Subject: [PATCH 04/68] Fix hir::Lookaround printing and add test --- regex-syntax/src/hir/print.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 547e579e9..8ff5c85e2 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -228,10 +228,10 @@ impl Visitor for Writer { } }, HirKind::Lookaround(hir::Lookaround::PositiveLookBehind(_)) => { - self.wtr.write_str(r"(?<=)")?; + self.wtr.write_str(r"(?<=")?; } HirKind::Lookaround(hir::Lookaround::NegativeLookBehind(_)) => { - self.wtr.write_str(r"(? { self.wtr.write_str("(")?; @@ -484,6 +484,18 @@ mod tests { roundtrip("((((a))))", "((((a))))"); } + #[test] + #[ignore = "Missing parser support for lookaround"] + fn print_look_around() { + roundtrip("(?<=)", "(?<=(?:))"); + roundtrip("(? 
Date: Sat, 8 Mar 2025 21:12:58 +0100 Subject: [PATCH 05/68] Remove useless ref --- regex-syntax/src/hir/visitor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index a946d9ddb..8ba304683 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -213,7 +213,7 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.sub, Frame::Capture(capture) => &capture.sub, - Frame::LookAround(lookaround) => &lookaround.sub(), + Frame::LookAround(lookaround) => lookaround.sub(), Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } From 258d36c00b94b331d868faadb501f82c1a1323d4 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:21:08 +0100 Subject: [PATCH 06/68] Add missing drop case for hir::Lookaround --- regex-syntax/src/hir/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 8a029a501..99d3fb3e2 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1976,6 +1976,9 @@ impl Drop for Hir { | HirKind::Class(_) | HirKind::Look(_) => return, HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Lookaround(ref x) if x.sub().kind.subs().is_empty() => { + return + } HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { return } From 07be9abe11a2739670f82566bd9d02d086e6948b Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:25:23 +0100 Subject: [PATCH 07/68] Rename Lookaround to LookAround This makes it consistent with parser's ErrorKind::UnsupportedLookAround. 
--- regex-automata/src/meta/reverse_inner.rs | 4 ++-- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-syntax/src/hir/literal.rs | 2 +- regex-syntax/src/hir/mod.rs | 20 ++++++++++---------- regex-syntax/src/hir/print.rs | 6 +++--- regex-syntax/src/hir/visitor.rs | 4 ++-- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/regex-automata/src/meta/reverse_inner.rs b/regex-automata/src/meta/reverse_inner.rs index 8d9099600..14e260a1e 100644 --- a/regex-automata/src/meta/reverse_inner.rs +++ b/regex-automata/src/meta/reverse_inner.rs @@ -170,7 +170,7 @@ fn top_concat(mut hir: &Hir) -> Option> { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) - | HirKind::Lookaround(_) + | HirKind::LookAround(_) | HirKind::Repetition(_) | HirKind::Alternation(_) => return None, HirKind::Capture(hir::Capture { ref sub, .. }) => sub, @@ -207,7 +207,7 @@ fn flatten(hir: &Hir) -> Hir { HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), HirKind::Class(ref x) => Hir::class(x.clone()), HirKind::Look(ref x) => Hir::look(x.clone()), - HirKind::Lookaround(ref x) => { + HirKind::LookAround(ref x) => { Hir::lookaround(x.with(flatten(x.sub()))) } HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 95f9b18af..9fe99be98 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1003,7 +1003,7 @@ impl Compiler { Class(Class::Bytes(ref c)) => self.c_byte_class(c), Class(Class::Unicode(ref c)) => self.c_unicode_class(c), Look(ref look) => self.c_look(look), - Lookaround(_) => todo!("implement lookaround NFA compilation"), + LookAround(_) => todo!("implement lookaround NFA compilation"), Repetition(ref rep) => self.c_repetition(rep), Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), diff --git 
a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index e5b9fd29b..d47330ea9 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,7 +172,7 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) | Lookaround(_) => { + Empty | Look(_) | LookAround(_) => { Seq::singleton(self::Literal::exact(vec![])) } Literal(hir::Literal(ref bytes)) => { diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 99d3fb3e2..c53620a07 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -375,9 +375,9 @@ impl Hir { /// Creates a look-around subexpression HIR expression. #[inline] - pub fn lookaround(lookaround: Lookaround) -> Hir { + pub fn lookaround(lookaround: LookAround) -> Hir { let props = Properties::lookaround(&lookaround); - Hir { kind: HirKind::Lookaround(lookaround), props } + Hir { kind: HirKind::LookAround(lookaround), props } } /// Creates a repetition HIR expression. @@ -736,7 +736,7 @@ pub enum HirKind { /// A look-around assertion. A look-around match always has zero length. Look(Look), /// A look-around subexpression - Lookaround(Lookaround), + LookAround(LookAround), /// A repetition operation applied to a sub-expression. Repetition(Repetition), /// A capturing group, which contains a sub-expression. @@ -770,7 +770,7 @@ impl HirKind { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => &[], - HirKind::Lookaround(ref lookaround) => from_ref(lookaround.sub()), + HirKind::LookAround(ref lookaround) => from_ref(lookaround.sub()), HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), HirKind::Concat(ref subs) => subs, @@ -1801,14 +1801,14 @@ impl Look { /// Currently, only lookbehind assertions are supported. /// Furthermore, capture groups inside assertions are not supported. 
#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Lookaround { +pub enum LookAround { /// A positive lookbehind assertion. PositiveLookBehind(Box), /// A negative lookbehind assertion. NegativeLookBehind(Box), } -impl Lookaround { +impl LookAround { /// Returns a reference to the inner expression that must match for this /// lookaround assertion to hold. pub fn sub(&self) -> &Hir { @@ -1830,7 +1830,7 @@ impl Lookaround { /// Returns a new lookaround of the same kind, but with its /// sub-expression replaced with the one given. - pub fn with(&self, sub: Hir) -> Lookaround { + pub fn with(&self, sub: Hir) -> LookAround { match self { Self::PositiveLookBehind(_) => { Self::PositiveLookBehind(Box::new(sub)) @@ -1976,7 +1976,7 @@ impl Drop for Hir { | HirKind::Class(_) | HirKind::Look(_) => return, HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, - HirKind::Lookaround(ref x) if x.sub().kind.subs().is_empty() => { + HirKind::LookAround(ref x) if x.sub().kind.subs().is_empty() => { return } HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { @@ -1994,7 +1994,7 @@ impl Drop for Hir { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {} - HirKind::Lookaround(ref mut x) => { + HirKind::LookAround(ref mut x) => { stack.push(mem::replace(x.sub_mut(), Hir::empty())); } HirKind::Capture(ref mut x) => { @@ -2561,7 +2561,7 @@ impl Properties { Properties(Box::new(inner)) } - fn lookaround(lookaround: &Lookaround) -> Properties { + fn lookaround(lookaround: &LookAround) -> Properties { let sub_p = lookaround.sub().properties(); let inner = PropertiesI { minimum_len: Some(0), diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 8ff5c85e2..86e0018c6 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -227,10 +227,10 @@ impl Visitor for Writer { self.wtr.write_str(r"\b{end-half}")?; } }, - HirKind::Lookaround(hir::Lookaround::PositiveLookBehind(_)) => { + 
HirKind::LookAround(hir::LookAround::PositiveLookBehind(_)) => { self.wtr.write_str(r"(?<=")?; } - HirKind::Lookaround(hir::Lookaround::NegativeLookBehind(_)) => { + HirKind::LookAround(hir::LookAround::NegativeLookBehind(_)) => { self.wtr.write_str(r"(? { @@ -300,7 +300,7 @@ impl Visitor for Writer { HirKind::Capture(_) | HirKind::Concat(_) | HirKind::Alternation(_) - | HirKind::Lookaround(_) => { + | HirKind::LookAround(_) => { self.wtr.write_str(r")")?; } } diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index 8ba304683..0af0aeca1 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -85,7 +85,7 @@ enum Frame<'a> { Capture(&'a hir::Capture), /// A stack frame allocated just before descending into a look-around's /// child node. - LookAround(&'a hir::Lookaround), + LookAround(&'a hir::LookAround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -165,7 +165,7 @@ impl<'a> HeapVisitor<'a> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), HirKind::Capture(ref x) => Some(Frame::Capture(x)), - HirKind::Lookaround(ref x) => Some(Frame::LookAround(x)), + HirKind::LookAround(ref x) => Some(Frame::LookAround(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) From d89e90a8d5e553fcf56e718f4029b4504f3f02c0 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 21:37:48 +0100 Subject: [PATCH 08/68] Fix properties of LookArounds --- regex-syntax/src/hir/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index c53620a07..6c7176c47 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -735,7 +735,7 @@ pub enum HirKind { Class(Class), /// A look-around assertion. A look-around match always has zero length. 
Look(Look), - /// A look-around subexpression + /// A look-around subexpression. LookAround(LookAround), /// A repetition operation applied to a sub-expression. Repetition(Repetition), @@ -2561,6 +2561,7 @@ impl Properties { Properties(Box::new(inner)) } + /// Create a new set of HIR properties for a look-around. fn lookaround(lookaround: &LookAround) -> Properties { let sub_p = lookaround.sub().properties(); let inner = PropertiesI { @@ -2568,6 +2569,8 @@ impl Properties { maximum_len: Some(0), literal: false, alternation_literal: false, + explicit_captures_len: sub_p.explicit_captures_len(), + static_explicit_captures_len: sub_p.static_explicit_captures_len(), ..*sub_p.0.clone() }; Properties(Box::new(inner)) From 31b09a5f3608d4e7d93066975df76b2985af42e2 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 22:32:43 +0100 Subject: [PATCH 09/68] Add missing literal lookaround test --- regex-syntax/src/hir/literal.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index d47330ea9..031ff0736 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2455,6 +2455,22 @@ mod tests { assert_eq!(expected, e(r"^aZ*b")); } + #[test] + #[ignore = "Missing parser support for lookaround"] + fn lookaround() { + assert_eq!(exact(["ab"]), e(r"a(?<=qwe)b")); + assert_eq!(exact(["ab"]), e(r"a(? 
Date: Tue, 11 Mar 2025 15:26:20 +0100 Subject: [PATCH 10/68] Fix literal test and useless property computation --- regex-syntax/src/hir/literal.rs | 4 ++-- regex-syntax/src/hir/mod.rs | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 031ff0736..923cb18ec 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2458,13 +2458,13 @@ mod tests { #[test] #[ignore = "Missing parser support for lookaround"] fn lookaround() { - assert_eq!(exact(["ab"]), e(r"a(?<=qwe)b")); + assert_eq!(exact(["ab"]), e(r"a(?<=qwa)b")); assert_eq!(exact(["ab"]), e(r"a(? Date: Sat, 8 Mar 2025 21:54:56 +0100 Subject: [PATCH 11/68] Adjust parsing errors for lookarounds --- regex-syntax/src/ast/mod.rs | 20 +++++++++++--------- regex-syntax/src/ast/parse.rs | 24 ++++++++++++++---------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index ce79a89ab..d217bf836 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -181,12 +181,15 @@ pub enum ErrorKind { /// escape is used. The octal escape is assumed to be an invocation of /// a backreference, which is the common case. UnsupportedBackreference, - /// When syntax similar to PCRE's look-around is used, this error is + /// When syntax similar to PCRE's look-ahead is used, this error is /// returned. Some example syntaxes that are rejected include, but are - /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and - /// `(? 
{ write!(f, "backreferences are not supported") } - UnsupportedLookAround => write!( - f, - "look-around, including look-ahead and look-behind, \ - is not supported" - ), + UnsupportedLookAhead => write!(f, "look-aheads are not supported"), + UsupportedCaptureInLookBehind => { + write!(f, "capture groups are not supported in look-behinds") + } } } } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 0c2a35265..fb6876f21 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1232,7 +1232,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { if self.is_lookaround_prefix() { return Err(self.error( Span::new(open_span.start, self.span().end), - ast::ErrorKind::UnsupportedLookAround, + ast::ErrorKind::UnsupportedLookAhead, )); } let inner_span = self.span(); @@ -3736,33 +3736,37 @@ bar } #[test] - fn parse_unsupported_lookaround() { + fn parse_unsupported_lookahead() { assert_eq!( parser(r"(?=a)").parse().unwrap_err(), TestError { span: span(0..3), - kind: ast::ErrorKind::UnsupportedLookAround, + kind: ast::ErrorKind::UnsupportedLookAhead, } ); assert_eq!( parser(r"(?!a)").parse().unwrap_err(), TestError { span: span(0..3), - kind: ast::ErrorKind::UnsupportedLookAround, + kind: ast::ErrorKind::UnsupportedLookAhead, } ); + } + + #[test] + fn parse_unsupported_capture_in_lookbehind() { assert_eq!( - parser(r"(?<=a)").parse().unwrap_err(), + parser(r"(?<=(?<=(a)))").parse().unwrap_err(), TestError { - span: span(0..4), - kind: ast::ErrorKind::UnsupportedLookAround, + span: span(8..10), + kind: ast::ErrorKind::UsupportedCaptureInLookBehind, } ); assert_eq!( - parser(r"(? 
Date: Sat, 8 Mar 2025 22:27:04 +0100 Subject: [PATCH 12/68] Add LookAround to Ast --- regex-syntax/src/ast/mod.rs | 35 +++++++++++++++++++++++++++++++ regex-syntax/src/ast/parse.rs | 7 +++++++ regex-syntax/src/ast/print.rs | 20 ++++++++++++++++++ regex-syntax/src/ast/visitor.rs | 6 ++++++ regex-syntax/src/hir/translate.rs | 6 ++++++ 5 files changed, 74 insertions(+) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index d217bf836..a2daef197 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -479,6 +479,8 @@ pub enum Ast { Dot(Box), /// A single zero-width assertion. Assertion(Box), + /// A single look-around regular expression. + LookAround(Box), /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. ClassUnicode(Box), /// A single perl character class, e.g., `\d` or `\W`. @@ -523,6 +525,11 @@ impl Ast { Ast::Assertion(Box::new(e)) } + /// Create a "look-around" AST item. + pub fn look_around(e: LookAround) -> Ast { + Ast::LookAround(Box::new(e)) + } + /// Create a "Unicode class" AST item. pub fn class_unicode(e: ClassUnicode) -> Ast { Ast::ClassUnicode(Box::new(e)) @@ -566,6 +573,7 @@ impl Ast { Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, + Ast::LookAround(ref x) => &x.span, Ast::ClassUnicode(ref x) => &x.span, Ast::ClassPerl(ref x) => &x.span, Ast::ClassBracketed(ref x) => &x.span, @@ -598,6 +606,7 @@ impl Ast { Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) + | Ast::LookAround(_) | Ast::Alternation(_) | Ast::Concat(_) => true, } @@ -1344,6 +1353,28 @@ pub enum AssertionKind { WordBoundaryEndHalf, } +/// A single zero-width look-around. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct LookAround { + /// The span of this look-around. + pub span: Span, + /// The look-around kind, e.g. negative/positive look-behind. 
+ pub kind: LookAroundKind, + /// The regular expression inside the look-around. + pub ast: Box, +} + +/// A look-around kind. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum LookAroundKind { + /// `(?<=...)` + PositiveLookBehind, + /// `(? return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::LookAround(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} @@ -1675,6 +1707,9 @@ impl Drop for Ast { Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } + Ast::LookAround(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index fb6876f21..13975919d 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2328,6 +2328,7 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, + Ast::LookAround(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, Ast::Concat(ref x) => &x.span, }; @@ -2349,6 +2350,7 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) + | Ast::LookAround(_) | Ast::Alternation(_) | Ast::Concat(_) => { self.decrement_depth(); @@ -3753,6 +3755,11 @@ bar ); } + #[test] + fn parse_lookbehinds() { + todo!() + } + #[test] fn parse_unsupported_capture_in_lookbehind() { assert_eq!( diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 1ceb3c7fa..69be55064 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,6 +80,7 @@ impl Visitor for Writer { fn visit_pre(&mut 
self, ast: &Ast) -> fmt::Result { match *ast { Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::LookAround(ref x) => self.fmt_lookaround_pre(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } @@ -92,6 +93,7 @@ impl Visitor for Writer { Ast::Literal(ref x) => self.fmt_literal(x), Ast::Dot(_) => self.wtr.write_str("."), Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::LookAround(ref x) => self.fmt_lookaround_post(x), Ast::ClassPerl(ref x) => self.fmt_class_perl(x), Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), @@ -174,6 +176,18 @@ impl Writer { self.wtr.write_str(")") } + fn fmt_lookaround_pre(&mut self, ast: &ast::LookAround) -> fmt::Result { + use crate::ast::LookAroundKind::*; + match ast.kind { + PositiveLookBehind => self.wtr.write_str("(?<="), + NegativeLookBehind => self.wtr.write_str("(? fmt::Result { + self.wtr.write_str(")") + } + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { use crate::ast::RepetitionKind::*; match ast.op.kind { @@ -511,6 +525,12 @@ mod tests { roundtrip("(a)"); } + #[test] + fn print_lookaround() { + roundtrip("(? { /// A stack frame allocated just before descending into a group's child /// node. Group(&'a ast::Group), + /// A stack frame allocated just before descending into a look-around's + /// child node. + LookAround(&'a ast::LookAround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -270,6 +273,7 @@ impl<'a> HeapVisitor<'a> { } Ast::Repetition(ref x) => Some(Frame::Repetition(x)), Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::LookAround(ref x) => Some(Frame::LookAround(x)), Ast::Concat(ref x) if x.asts.is_empty() => None, Ast::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] 
}) @@ -289,6 +293,7 @@ impl<'a> HeapVisitor<'a> { match induct { Frame::Repetition(_) => None, Frame::Group(_) => None, + Frame::LookAround(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -444,6 +449,7 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.ast, Frame::Group(group) => &group.ast, + Frame::LookAround(look) => &look.ast, Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index c210f1a26..f3bc2100c 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -354,6 +354,9 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } + Ast::LookAround(ref x) => { + todo!("translation from AST to HIR"); + } Ast::Concat(_) => { self.push(HirFrame::Concat); } @@ -446,6 +449,9 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } + Ast::LookAround(_) => { + todo!("translation from AST to HIR"); + } Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { From ffdb9429ebfed9aadcf03818e82c1b45f4b32397 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sat, 8 Mar 2025 22:42:58 +0100 Subject: [PATCH 13/68] Disable failing tests --- regex-syntax/src/ast/parse.rs | 4 +++- regex-syntax/src/ast/print.rs | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 13975919d..6090485a7 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -3756,11 +3756,13 @@ bar } #[test] + #[ignore = "Missing parser support for lookaround"] fn parse_lookbehinds() { - todo!() + todo!("write tests for lookbehinds"); } #[test] + #[ignore = "Missing parser support for lookaround"] fn parse_unsupported_capture_in_lookbehind() { 
assert_eq!( parser(r"(?<=(?<=(a)))").parse().unwrap_err(), diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 69be55064..112c0bda1 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -526,6 +526,7 @@ mod tests { } #[test] + #[ignore = "Missing parser support for lookaround"] fn print_lookaround() { roundtrip("(? Date: Sat, 8 Mar 2025 22:44:24 +0100 Subject: [PATCH 14/68] Fix UnsupportedCaptureInLookBehind typo --- regex-syntax/src/ast/mod.rs | 4 ++-- regex-syntax/src/ast/parse.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index a2daef197..0eca1d4db 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -189,7 +189,7 @@ pub enum ErrorKind { UnsupportedLookAhead, /// When a capture group is used in a look-behind assertion, this error is /// returned. Look-behind assertions do not support capturing groups. - UsupportedCaptureInLookBehind, + UnsupportedCaptureInLookBehind, } #[cfg(feature = "std")] @@ -305,7 +305,7 @@ impl core::fmt::Display for ErrorKind { write!(f, "backreferences are not supported") } UnsupportedLookAhead => write!(f, "look-aheads are not supported"), - UsupportedCaptureInLookBehind => { + UnsupportedCaptureInLookBehind => { write!(f, "capture groups are not supported in look-behinds") } } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 6090485a7..012185537 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -3768,14 +3768,14 @@ bar parser(r"(?<=(?<=(a)))").parse().unwrap_err(), TestError { span: span(8..10), - kind: ast::ErrorKind::UsupportedCaptureInLookBehind, + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); assert_eq!( parser(r"(? 
Date: Sun, 9 Mar 2025 10:51:09 +0100 Subject: [PATCH 15/68] Add unclosed lookaround error --- regex-syntax/src/ast/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 0eca1d4db..25f3b9280 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -144,6 +144,10 @@ pub enum ErrorKind { /// /// The span of this error corresponds to the unclosed parenthesis. GroupUnclosed, + /// An unclosed look-around, e.g., `(? write!(f, "invalid capture group character"), GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), GroupUnclosed => write!(f, "unclosed group"), + LookAroundUnclosed => write!(f, "unclosed look-around"), GroupUnopened => write!(f, "unopened group"), NestLimitExceeded(limit) => write!( f, @@ -526,7 +531,7 @@ impl Ast { } /// Create a "look-around" AST item. - pub fn look_around(e: LookAround) -> Ast { + pub fn lookaround(e: LookAround) -> Ast { Ast::LookAround(Box::new(e)) } From 830cabef7da1e1a32a2f4f586682fa447397f9d6 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sun, 9 Mar 2025 10:53:32 +0100 Subject: [PATCH 16/68] Rename lookaround to look-around --- regex-syntax/src/hir/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index fcc75c1a3..28c660031 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1796,7 +1796,7 @@ impl Look { } } -/// Represents a general lookaround assertion. +/// Represents a general look-around assertion. /// /// Currently, only lookbehind assertions are supported. /// Furthermore, capture groups inside assertions are not supported. @@ -1810,7 +1810,7 @@ pub enum LookAround { impl LookAround { /// Returns a reference to the inner expression that must match for this - /// lookaround assertion to hold. + /// look-around assertion to hold. 
pub fn sub(&self) -> &Hir { match self { Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { @@ -1828,7 +1828,7 @@ impl LookAround { } } - /// Returns a new lookaround of the same kind, but with its + /// Returns a new look-around of the same kind, but with its /// sub-expression replaced with the one given. pub fn with(&self, sub: Hir) -> LookAround { match self { From c353b35a2b462da779a7033d2194789e355f1a55 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sun, 9 Mar 2025 11:00:13 +0100 Subject: [PATCH 17/68] Support parsing of look-behinds --- regex-syntax/src/ast/parse.rs | 204 +++++++++++++++++++++++----------- regex-syntax/src/ast/print.rs | 3 +- 2 files changed, 141 insertions(+), 66 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 012185537..026e26883 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -16,7 +16,7 @@ use alloc::{ }; use crate::{ - ast::{self, Ast, Position, Span}, + ast::{self, Ast, LookAroundKind, Position, Span}, either::Either, is_escapeable_character, is_meta_character, }; @@ -299,9 +299,9 @@ struct ParserI<'s, P> { pattern: &'s str, } -/// GroupState represents a single stack frame while parsing nested groups -/// and alternations. Each frame records the state up to an opening parenthesis -/// or a alternating bracket `|`. +/// GroupState represents a single stack frame while parsing nested groups, +/// look-arounds and alternations. Each frame records the state up to an opening +/// parenthesis or a alternating bracket `|`. #[derive(Clone, Debug)] enum GroupState { /// This state is pushed whenever an opening group is found. @@ -313,6 +313,13 @@ enum GroupState { /// Whether this group has the `x` flag enabled or not. ignore_whitespace: bool, }, + /// This state is pushed whenever an opening look-around is found. + LookAround { + /// The concatenation immediately preceding the opening look-around. 
+ concat: ast::Concat, + /// The look-around that has been opened. Its sub-AST is always empty. + lookaround: ast::LookAround, + }, /// This state is pushed whenever a new alternation branch is found. If /// an alternation branch is found and this state is at the top of the /// stack, then this state should be modified to include the new @@ -521,18 +528,15 @@ impl<'s, P: Borrow> ParserI<'s, P> { } } - /// Returns true if and only if the parser is positioned at a look-around + /// Returns true if and only if the parser is positioned at a look-ahead /// prefix. The conditions under which this returns true must always /// correspond to a regular expression that would otherwise be consider /// invalid. /// /// This should only be called immediately after parsing the opening of /// a group or a set of flags. - fn is_lookaround_prefix(&self) -> bool { - self.bump_if("?=") - || self.bump_if("?!") - || self.bump_if("?<=") - || self.bump_if("? bool { + self.bump_if("?=") || self.bump_if("?!") } /// Bump the parser, and if the `x` flag is enabled, bump through any @@ -686,9 +690,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { })); } - /// Parse and push a group AST (and its parent concatenation) on to the - /// parser's internal stack. Return a fresh concatenation corresponding - /// to the group's sub-AST. + /// Parse and push a group or look-around AST (and its parent + /// concatenation) on to the parser's internal stack. Return a fresh + /// concatenation corresponding to the grouping's sub-AST. /// /// If a set of flags was found (with no group), then the concatenation /// is returned with that set of flags added. @@ -697,12 +701,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// parenthesis. It advances the parser to the character at the start /// of the sub-expression (or adjoining expression). /// - /// If there was a problem parsing the start of the group, then an error - /// is returned. 
+ /// If there was a problem parsing the start of the grouping, then an + /// error is returned. #[inline(never)] - fn push_group(&self, mut concat: ast::Concat) -> Result { + fn push_grouping(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '('); - match self.parse_group()? { + match self.parse_grouping()? { Either::Left(set) => { let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); if let Some(v) = ignore { @@ -712,7 +716,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { concat.asts.push(Ast::flags(set)); Ok(concat) } - Either::Right(group) => { + Either::Right(Either::Left(group)) => { let old_ignore_whitespace = self.ignore_whitespace(); let new_ignore_whitespace = group .flags() @@ -728,61 +732,105 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(new_ignore_whitespace); Ok(ast::Concat { span: self.span(), asts: vec![] }) } + Either::Right(Either::Right(lookaround)) => { + self.parser() + .stack_group + .borrow_mut() + .push(GroupState::LookAround { concat, lookaround }); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } } } - /// Pop a group AST from the parser's internal stack and set the group's - /// AST to the given concatenation. Return the concatenation containing - /// the group. + /// Pop a group or look-around AST from the parser's internal stack and + /// set the grouping's AST to the given concatenation. Return the + /// concatenation containing the grouping. /// /// This assumes that the parser is currently positioned on the closing /// parenthesis and advances the parser to the character following the `)`. /// - /// If no such group could be popped, then an unopened group error is + /// If no such grouping could be popped, then an unopened group error is /// returned. 
#[inline(never)] - fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + fn pop_grouping( + &self, + mut grouping_concat: ast::Concat, + ) -> Result { use self::GroupState::*; assert_eq!(self.char(), ')'); let mut stack = self.parser().stack_group.borrow_mut(); - let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack - .pop() - { - Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, None) - } - Some(Alternation(alt)) => match stack.pop() { + let (mut prior_concat, mut grouping, ignore_whitespace, alt) = + match stack.pop() { Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, Some(alt)) + (concat, Either::Left(group), ignore_whitespace, None) } - None | Some(Alternation(_)) => { + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.parser().ignore_whitespace.get(), + None, + ), + Some(Alternation(alt)) => match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => ( + concat, + Either::Left(group), + ignore_whitespace, + Some(alt), + ), + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.parser().ignore_whitespace.get(), + Some(alt), + ), + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }, + None => { return Err(self.error( self.span_char(), ast::ErrorKind::GroupUnopened, )); } - }, - None => { - return Err(self - .error(self.span_char(), ast::ErrorKind::GroupUnopened)); - } - }; + }; self.parser().ignore_whitespace.set(ignore_whitespace); - group_concat.span.end = self.pos(); + grouping_concat.span.end = self.pos(); self.bump(); - group.span.end = self.pos(); + match &mut grouping { + Either::Left(group) => group.span.end = self.pos(), + Either::Right(lookaround) => lookaround.span.end = self.pos(), + } match alt { Some(mut alt) => { - alt.span.end = group_concat.span.end; - 
alt.asts.push(group_concat.into_ast()); - group.ast = Box::new(alt.into_ast()); - } - None => { - group.ast = Box::new(group_concat.into_ast()); + alt.span.end = grouping_concat.span.end; + alt.asts.push(grouping_concat.into_ast()); + match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(alt.into_ast()) + } + Either::Right(lookaround) => { + lookaround.ast = Box::new(alt.into_ast()) + } + } } + None => match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(grouping_concat.into_ast()) + } + Either::Right(lookaround) => { + lookaround.ast = Box::new(grouping_concat.into_ast()) + } + }, } - prior_concat.asts.push(Ast::group(group)); + prior_concat.asts.push(match grouping { + Either::Left(group) => Ast::group(group), + Either::Right(lookaround) => Ast::lookaround(lookaround), + }); Ok(prior_concat) } @@ -793,7 +841,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// This assumes that the parser has advanced to the end. #[inline(never)] - fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + fn pop_grouping_end(&self, mut concat: ast::Concat) -> Result { concat.span.end = self.pos(); let mut stack = self.parser().stack_group.borrow_mut(); let ast = match stack.pop() { @@ -808,6 +856,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(group.span, ast::ErrorKind::GroupUnclosed) ); } + Some(GroupState::LookAround { lookaround, .. }) => { + return Err(self.error( + lookaround.span, + ast::ErrorKind::LookAroundUnclosed, + )); + } }; // If we try to pop again, there should be nothing. match stack.pop() { @@ -824,6 +878,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Group { group, .. }) => { Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) } + Some(GroupState::LookAround { lookaround, .. 
}) => Err(self + .error(lookaround.span, ast::ErrorKind::LookAroundUnclosed)), } } @@ -989,8 +1045,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { break; } match self.char() { - '(' => concat = self.push_group(concat)?, - ')' => concat = self.pop_group(concat)?, + '(' => concat = self.push_grouping(concat)?, + ')' => concat = self.pop_grouping(concat)?, '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; @@ -1020,7 +1076,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { _ => concat.asts.push(self.parse_primitive()?.into_ast()), } } - let ast = self.pop_group_end(concat)?; + let ast = self.pop_grouping_end(concat)?; NestLimiter::new(self).check(&ast)?; Ok(ast::WithComments { ast, @@ -1205,16 +1261,17 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(concat) } - /// Parse a group (which contains a sub-expression) or a set of flags. + /// Parse a group or look-around (which contain a sub-expression), or a + /// set of flags. /// - /// If a group was found, then it is returned with an empty AST. If a set - /// of flags is found, then that set is returned. + /// If a group or look-around was found, then it is returned with an + /// empty AST. If a set of flags is found, then that set is returned. /// /// The parser should be positioned at the opening parenthesis. /// /// This advances the parser to the character before the start of the - /// sub-expression (in the case of a group) or to the closing parenthesis - /// immediately following the set of flags. + /// sub-expression (in the case of a group or look-around) or to the + /// closing parenthesis immediately following the set of flags. /// /// # Errors /// @@ -1223,19 +1280,38 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If a capture name is given and it is incorrectly specified, then a /// corresponding error is returned. + /// + /// If a look-ahead is given (which is currently unsupported), then an + /// error is returned. 
#[inline(never)] - fn parse_group(&self) -> Result> { + fn parse_grouping( + &self, + ) -> Result>> + { assert_eq!(self.char(), '('); let open_span = self.span_char(); self.bump(); self.bump_space(); - if self.is_lookaround_prefix() { + if self.is_lookahead_prefix() { return Err(self.error( Span::new(open_span.start, self.span().end), ast::ErrorKind::UnsupportedLookAhead, )); } let inner_span = self.span(); + + let mut lookaround_kind = LookAroundKind::PositiveLookBehind; + if self.bump_if("?<=") || { + lookaround_kind = LookAroundKind::NegativeLookBehind; + self.bump_if("?> ParserI<'s, P> { } { let capture_index = self.next_capture_index(open_span)?; let name = self.parse_capture_name(capture_index)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::empty(self.span())), - })) + }))) } else if self.bump_if("?") { if self.is_eof() { return Err( @@ -1272,19 +1348,19 @@ impl<'s, P: Borrow> ParserI<'s, P> { })) } else { assert_eq!(char_end, ':'); - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), ast: Box::new(Ast::empty(self.span())), - })) + }))) } } else { let capture_index = self.next_capture_index(open_span)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), ast: Box::new(Ast::empty(self.span())), - })) + }))) } } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 112c0bda1..0e87599d2 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -526,9 +526,8 @@ mod tests { } #[test] - #[ignore = "Missing parser support for lookaround"] fn print_lookaround() { - roundtrip("(? 
Date: Sun, 9 Mar 2025 11:40:51 +0100 Subject: [PATCH 18/68] Reject lookbehinds with capture groups --- regex-syntax/src/ast/parse.rs | 76 +++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 026e26883..e8c65eadb 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -751,6 +751,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If no such grouping could be popped, then an unopened group error is /// returned. + /// + /// If a look-behind contains a capture group, then an error is returned. #[inline(never)] fn pop_grouping( &self, @@ -829,7 +831,16 @@ impl<'s, P: Borrow> ParserI<'s, P> { } prior_concat.asts.push(match grouping { Either::Left(group) => Ast::group(group), - Either::Right(lookaround) => Ast::lookaround(lookaround), + Either::Right(lookaround) => { + if let Some(span) = first_capture_group_span(&lookaround.ast) { + return Err(self.error( + span, + ast::ErrorKind::UnsupportedCaptureInLookBehind, + )); + } + + Ast::lookaround(lookaround) + } }); Ok(prior_concat) } @@ -2511,6 +2522,29 @@ fn specialize_err( } } +/// Returns the span of the first capture group found. Returns None in case there are no capture groups. 
+fn first_capture_group_span(ast: &Ast) -> Option { + struct CaptureGroupSearcher; + + impl ast::Visitor for CaptureGroupSearcher { + type Output = (); + type Err = Span; + + fn finish(self) -> core::result::Result { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> std::result::Result<(), Span> { + match ast { + Ast::Group(group) => Err(group.span), + _ => Ok(()), + } + } + } + + ast::visit(ast, CaptureGroupSearcher).err() +} + #[cfg(test)] mod tests { use core::ops::Range; @@ -3838,19 +3872,53 @@ bar } #[test] - #[ignore = "Missing parser support for lookaround"] fn parse_unsupported_capture_in_lookbehind() { assert_eq!( parser(r"(?<=(?<=(a)))").parse().unwrap_err(), TestError { - span: span(8..10), + span: span(8..11), kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); assert_eq!( parser(r"(?a))").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, + } + ); + assert_eq!( + parser(r"(?a)|b)").parse().unwrap_err(), + TestError { + span: span(6..16), kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); From 8352a9e37d719c8d8d2c884b622616e86d46c034 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sun, 9 Mar 2025 11:56:22 +0100 Subject: [PATCH 19/68] Add tests for parsing lookbehinds --- regex-syntax/src/ast/parse.rs | 83 ++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index e8c65eadb..0713ce948 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -3866,9 +3866,88 @@ bar } #[test] - #[ignore = "Missing parser support for lookaround"] fn parse_lookbehinds() { - todo!("write tests for lookbehinds"); + assert_eq!( + parser(r"(?<=)").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..5), + ast: Box::new(Ast::empty(span(4..4))), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?<=a)").parse(), + 
Ok(Ast::lookaround(ast::LookAround { + span: span(0..6), + ast: Box::new(lit('a', 4)), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(? Date: Sun, 9 Mar 2025 12:35:32 +0100 Subject: [PATCH 20/68] Add AST -> HIR translation for lookarounds --- regex-syntax/src/hir/literal.rs | 1 - regex-syntax/src/hir/print.rs | 26 ++++++++++++++++++++------ regex-syntax/src/hir/translate.rs | 15 ++++++++++----- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 923cb18ec..9b21abf94 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2456,7 +2456,6 @@ mod tests { } #[test] - #[ignore = "Missing parser support for lookaround"] fn lookaround() { assert_eq!(exact(["ab"]), e(r"a(?<=qwa)b")); assert_eq!(exact(["ab"]), e(r"a(? Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::LookAround(ref x) => { - todo!("translation from AST to HIR"); - } Ast::Concat(_) => { self.push(HirFrame::Concat); } @@ -449,8 +446,16 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - Ast::LookAround(_) => { - todo!("translation from AST to HIR"); + Ast::LookAround(ref x) => { + let expr = Box::new(self.pop().unwrap().unwrap_expr()); + self.push(HirFrame::Expr(Hir::lookaround(match x.kind { + ast::LookAroundKind::PositiveLookBehind => { + hir::LookAround::PositiveLookBehind(expr) + } + ast::LookAroundKind::NegativeLookBehind => { + hir::LookAround::NegativeLookBehind(expr) + } + }))); } Ast::Concat(_) => { let mut exprs = vec![]; From d35e65a05db00a20bb5a577b3630b2fb03440a32 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Sun, 9 Mar 2025 12:38:26 +0100 Subject: [PATCH 21/68] Fix typo --- regex-syntax/src/hir/print.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 90587a605..e32e222c6 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -486,8 +486,8 @@ mod tests { #[test] fn print_look_around() { - // we do not want to do a roundtrip: printed lookarounds are not - // can contain capture groups which are unsupported by the parser. + // we do not want to do a roundtrip: printed lookarounds can + // contain capture groups which are unsupported by the parser. // TODO(shilangyu): is this a problem that some regexes are not // roundtrippable? fn test(given: &str, expected: &str) { From ae33591bf2d3cd890158fcaa0ddc6179f2cff41d Mon Sep 17 00:00:00 2001 From: shilangyu Date: Tue, 11 Mar 2025 21:46:06 +0100 Subject: [PATCH 22/68] Allow for non-capturing groups in lookbehinds --- regex-syntax/src/ast/parse.rs | 38 ++++++++++++++++++++++++++++++++++- regex-syntax/src/hir/print.rs | 25 +++++------------------ 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 0713ce948..171f01683 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2536,7 +2536,14 @@ fn first_capture_group_span(ast: &Ast) -> Option { fn visit_pre(&mut self, ast: &Ast) -> std::result::Result<(), Span> { match ast { - Ast::Group(group) => Err(group.span), + Ast::Group(group) + if !matches!( + group.kind, + ast::GroupKind::NonCapturing(_) + ) => + { + Err(group.span) + } _ => Ok(()), } } @@ -3883,6 +3890,21 @@ bar kind: ast::LookAroundKind::PositiveLookBehind })) ); + assert_eq!( + parser(r"(?<=(?:a))").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::group(ast::Group { + span: span(4..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span(6..6), + items: vec![], + }), + ast: Box::new(lit('a', 7)), + })), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); assert_eq!( parser(r"(? 
Date: Tue, 11 Mar 2025 23:18:07 +0100 Subject: [PATCH 23/68] Fix missing LookAround in regex-cli --- regex-cli/cmd/generate/fowler.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index 404c47721..052d59ef8 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -412,6 +412,9 @@ fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize { let this = if group.is_capturing() { 1 } else { 0 }; this + count_capturing_groups_ast(&*group.ast) } + Ast::LookAround(ref lookaround) => { + count_capturing_groups_ast(&lookaround.ast) + } Ast::Alternation(ref alt) => { alt.asts.iter().map(count_capturing_groups_ast).sum() } From cd070f82441b6dd72add6d39e23a17e9e5ff050c Mon Sep 17 00:00:00 2001 From: shilangyu Date: Tue, 11 Mar 2025 23:27:33 +0100 Subject: [PATCH 24/68] Detect capture groups in lookarounds for cheaper --- regex-syntax/src/ast/parse.rs | 78 +++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 171f01683..138be2905 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -159,6 +159,7 @@ impl ParserBuilder { stack_class: RefCell::new(vec![]), capture_names: RefCell::new(vec![]), scratch: RefCell::new(String::new()), + lookaround_depth: Cell::new(0), } } @@ -280,6 +281,9 @@ pub struct Parser { /// A scratch buffer used in various places. Mostly this is used to /// accumulate relevant characters from parts of a pattern. scratch: RefCell, + /// Whether the parser is currently in a look-around. This is used to + /// detect capture groups within look-arounds, which are not supported. + lookaround_depth: Cell, } /// ParserI is the internal parser implementation. 
@@ -392,6 +396,7 @@ impl Parser { self.comments.borrow_mut().clear(); self.stack_group.borrow_mut().clear(); self.stack_class.borrow_mut().clear(); + self.lookaround_depth.set(0); } } @@ -477,6 +482,11 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.get() } + /// Return whether the parser is currently in a look-around. + fn in_lookaround(&self) -> bool { + self.parser().lookaround_depth.get() != 0 + } + /// Return the character at the current position of the parser. /// /// This panics if the current position does not point to a valid char. @@ -737,6 +747,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { .stack_group .borrow_mut() .push(GroupState::LookAround { concat, lookaround }); + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() + 1); Ok(ast::Concat { span: self.span(), asts: vec![] }) } } @@ -770,7 +783,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(LookAround { concat, lookaround }) => ( concat, Either::Right(lookaround), - self.parser().ignore_whitespace.get(), + self.ignore_whitespace(), None, ), Some(Alternation(alt)) => match stack.pop() { @@ -783,7 +796,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(LookAround { concat, lookaround }) => ( concat, Either::Right(lookaround), - self.parser().ignore_whitespace.get(), + self.ignore_whitespace(), Some(alt), ), None | Some(Alternation(_)) => { @@ -830,15 +843,20 @@ impl<'s, P: Borrow> ParserI<'s, P> { }, } prior_concat.asts.push(match grouping { - Either::Left(group) => Ast::group(group), - Either::Right(lookaround) => { - if let Some(span) = first_capture_group_span(&lookaround.ast) { + Either::Left(group) => { + if group.is_capturing() && self.in_lookaround() { return Err(self.error( - span, + group.span, ast::ErrorKind::UnsupportedCaptureInLookBehind, )); } + Ast::group(group) + } + Either::Right(lookaround) => { + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() - 1); Ast::lookaround(lookaround) } }); @@ -2522,36 +2540,6 @@ fn 
specialize_err( } } -/// Returns the span of the first capture group found. Returns None in case there are no capture groups. -fn first_capture_group_span(ast: &Ast) -> Option { - struct CaptureGroupSearcher; - - impl ast::Visitor for CaptureGroupSearcher { - type Output = (); - type Err = Span; - - fn finish(self) -> core::result::Result { - Ok(()) - } - - fn visit_pre(&mut self, ast: &Ast) -> std::result::Result<(), Span> { - match ast { - Ast::Group(group) - if !matches!( - group.kind, - ast::GroupKind::NonCapturing(_) - ) => - { - Err(group.span) - } - _ => Ok(()), - } - } - } - - ast::visit(ast, CaptureGroupSearcher).err() -} - #[cfg(test)] mod tests { use core::ops::Range; @@ -3882,6 +3870,24 @@ bar kind: ast::LookAroundKind::PositiveLookBehind })) ); + assert_eq!( + parser(r"(?<=(?<=))(a)").parse(), + Ok(concat( + 0..13, + vec![ + Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::lookaround(ast::LookAround { + span: span(4..9), + ast: Box::new(Ast::empty(span(8..8))), + kind: ast::LookAroundKind::PositiveLookBehind + })), + kind: ast::LookAroundKind::PositiveLookBehind + }), + group(10..13, 1, lit('a', 11)), + ] + )) + ); assert_eq!( parser(r"(?<=a)").parse(), Ok(Ast::lookaround(ast::LookAround { From bdc9bbdad6116f30cc1473b7facd5e5e011c91c5 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Tue, 11 Mar 2025 23:35:27 +0100 Subject: [PATCH 25/68] Remove accidental import --- regex-syntax/src/ast/parse.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 138be2905..5883a0dd4 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -16,7 +16,7 @@ use alloc::{ }; use crate::{ - ast::{self, Ast, LookAroundKind, Position, Span}, + ast::{self, Ast, Position, Span}, either::Either, is_escapeable_character, is_meta_character, }; @@ -1329,9 +1329,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { } let inner_span = self.span(); - let mut 
lookaround_kind = LookAroundKind::PositiveLookBehind; + let mut lookaround_kind = ast::LookAroundKind::PositiveLookBehind; if self.bump_if("?<=") || { - lookaround_kind = LookAroundKind::NegativeLookBehind; + lookaround_kind = ast::LookAroundKind::NegativeLookBehind; self.bump_if("? Date: Thu, 6 Mar 2025 09:21:38 +0100 Subject: [PATCH 26/68] Add new instructions to NFA We require two vm instructions 'CheckLookaround' and 'WriteLookaround' to be able to track the state of lookaround expressions at the current position in the haystack. Both instructions access a new 'lookaround' vector of booleans, which contains one entry per lookaround expression in the regex. --- regex-automata/src/dfa/onepass.rs | 4 + regex-automata/src/nfa/thompson/backtrack.rs | 4 + regex-automata/src/nfa/thompson/nfa.rs | 66 ++++++++++- regex-automata/src/nfa/thompson/pikevm.rs | 113 ++++++++++++++++--- regex-automata/src/util/determinize/mod.rs | 12 ++ 5 files changed, 179 insertions(+), 20 deletions(-) diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index e06d37cf4..3a175bce3 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -638,6 +638,10 @@ impl<'a> InternalBuilder<'a> { self.stack_push(nfa_id, Epsilons::empty())?; while let Some((id, epsilons)) = self.stack.pop() { match *self.nfa.state(id) { + thompson::State::WriteLookaround { .. } + | thompson::State::CheckLookaround { .. } => { + todo!("check how to handle") + } thompson::State::ByteRange { ref trans } => { self.compile_transition(dfa_id, trans, epsilons)?; } diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index df99e456d..b63a47fd5 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -1519,6 +1519,10 @@ impl BoundedBacktracker { } sid = next; } + State::WriteLookaround { .. } + | State::CheckLookaround { .. 
} => { + todo!("check how to handle") + } State::Union { ref alternates } => { sid = match alternates.get(0) { None => return None, diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 59a62f4ed..e6c81345d 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1100,6 +1100,12 @@ impl NFA { self.0.look_set_prefix_any } + /// Returns how many lookaround sub-expressions this nfa contains + #[inline] + pub fn look_count(&self) -> usize { + self.0.look_count + } + // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it. Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 @@ -1260,6 +1266,7 @@ pub(super) struct Inner { /// zero-length prefix for any of the patterns in this NFA. look_set_prefix_all: LookSet, */ + look_count: usize, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1288,7 +1295,11 @@ impl Inner { match self.states[sid] { State::ByteRange { .. } | State::Dense { .. } - | State::Fail => continue, + | State::Fail + | State::WriteLookaround { .. } => continue, + State::CheckLookaround { next, .. } => { + stack.push(next); + } State::Sparse(_) => { // This snippet below will rewrite this sparse state // as a dense state. By doing it here, we apply this @@ -1371,6 +1382,10 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } + State::CheckLookaround { look_idx, .. } + | State::WriteLookaround { look_idx } => { + self.look_count = self.look_count.max(look_idx); + } State::Union { .. } | State::BinaryUnion { .. } | State::Fail @@ -1545,6 +1560,25 @@ pub enum State { /// satisfied. 
next: StateID, }, + /// This is like a match state but for a lookaround expression + /// executing this state will write a `true` into the lookaround oracle at + /// index `look_idx` + WriteLookaround { + /// The index of the lookaround expression that matches + look_idx: usize, + }, + /// This indicates that we need to check whether lookaround expression with + /// index `look_idx` holds at the current position in the haystack + /// If `positive` is false, then the lookaround expression is negative and + /// hence must NOT hold. + CheckLookaround { + /// The index of the lookaround expression that must be satisfied + look_idx: usize, + /// Whether this is a positive lookaround expression + positive: bool, + /// The next state to transition if the lookaround assertion is satisfied + next: StateID, + }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. @@ -1658,11 +1692,13 @@ impl State { | State::Sparse { .. } | State::Dense { .. } | State::Fail - | State::Match { .. } => false, + | State::Match { .. } + | State::WriteLookaround { .. } => false, State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } - | State::Capture { .. } => true, + | State::Capture { .. } + | State::CheckLookaround { .. } => true, } } @@ -1674,7 +1710,9 @@ impl State { | State::BinaryUnion { .. } | State::Capture { .. } | State::Match { .. } - | State::Fail => 0, + | State::Fail + | State::WriteLookaround { .. } + | State::CheckLookaround { .. } => 0, State::Sparse(SparseTransitions { ref transitions }) => { transitions.len() * mem::size_of::() } @@ -1707,6 +1745,9 @@ impl State { } } State::Look { ref mut next, .. } => *next = remap[*next], + State::CheckLookaround { ref mut next, .. 
} => { + *next = remap[*next] + } State::Union { ref mut alternates } => { for alt in alternates.iter_mut() { *alt = remap[*alt]; @@ -1717,8 +1758,9 @@ impl State { *alt2 = remap[*alt2]; } State::Capture { ref mut next, .. } => *next = remap[*next], - State::Fail => {} - State::Match { .. } => {} + State::Fail + | State::Match { .. } + | State::WriteLookaround { .. } => {} } } } @@ -1748,6 +1790,18 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } + State::WriteLookaround { look_idx } => { + write!(f, "Write Lookaround: {}", look_idx) + } + State::CheckLookaround { look_idx, positive, next } => { + write!( + f, + "Check Lookaround {} is {} => {}", + look_idx, + positive, + next.as_usize() + ) + } State::Union { ref alternates } => { let alts = alternates .iter() diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 4eb47c85c..ccdd124a0 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1216,6 +1216,10 @@ impl PikeVM { } impl PikeVM { + fn look_count(&self) -> usize { + self.nfa.look_count() + } + /// The implementation of standard leftmost search. /// /// Capturing group spans are written to `slots`, but only if requested. @@ -1254,7 +1258,12 @@ impl PikeVM { let pre = if anchored { None } else { self.get_config().get_prefilter() }; - let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let Cache { + ref mut stack, + ref mut curr, + ref mut next, + ref mut lookaround, + } = cache; let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like @@ -1361,9 +1370,12 @@ impl PikeVM { // transitions, and thus must be able to write offsets to the // slots given which are later copied to slot values in 'curr'. 
let slots = next.slot_table.all_absent(); - self.epsilon_closure(stack, slots, curr, input, at, start_id); + self.epsilon_closure( + stack, slots, curr, lookaround, input, at, start_id, + ); } - if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + if let Some(pid) = + self.nexts(stack, curr, next, lookaround, input, at, slots) { hm = Some(HalfMatch::new(pid, at)); } @@ -1425,7 +1437,12 @@ impl PikeVM { Some(config) => config, }; - let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let Cache { + ref mut stack, + ref mut curr, + ref mut next, + ref mut lookaround, + } = cache; for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { @@ -1438,9 +1455,13 @@ impl PikeVM { } if !any_matches || allmatches { let slots = &mut []; - self.epsilon_closure(stack, slots, curr, input, at, start_id); + self.epsilon_closure( + stack, slots, curr, lookaround, input, at, start_id, + ); } - self.nexts_overlapping(stack, curr, next, input, at, patset); + self.nexts_overlapping( + stack, curr, next, lookaround, input, at, patset, + ); // If we found a match and filled our set, then there is no more // additional info that we can provide. Thus, we can quit. 
We also // quit if the caller asked us to stop at the earliest point that @@ -1469,6 +1490,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, slots: &mut [Option], @@ -1477,7 +1499,15 @@ impl PikeVM { let mut pid = None; let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { - pid = match self.next(stack, slot_table, next, input, at, sid) { + pid = match self.next( + stack, + slot_table, + next, + lookarounds, + input, + at, + sid, + ) { None => continue, Some(pid) => Some(pid), }; @@ -1497,6 +1527,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, patset: &mut PatternSet, @@ -1505,8 +1536,15 @@ impl PikeVM { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { - let pid = match self.next(stack, slot_table, next, input, at, sid) - { + let pid = match self.next( + stack, + slot_table, + next, + lookarounds, + input, + at, + sid, + ) { None => continue, Some(pid) => pid, }; @@ -1543,6 +1581,7 @@ impl PikeVM { stack: &mut Vec, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, sid: StateID, @@ -1553,7 +1592,9 @@ impl PikeVM { | State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } - | State::Capture { .. } => None, + | State::Capture { .. } + | State::WriteLookaround { .. } + | State::CheckLookaround { .. } => None, State::ByteRange { ref trans } => { if trans.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); @@ -1561,7 +1602,13 @@ impl PikeVM { // adding 1 will never wrap. 
let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, trans.next, + stack, + slots, + next, + lookarounds, + input, + at, + trans.next, ); } None @@ -1573,7 +1620,13 @@ impl PikeVM { // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, next_sid, + stack, + slots, + next, + lookarounds, + input, + at, + next_sid, ); } None @@ -1585,7 +1638,13 @@ impl PikeVM { // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, next_sid, + stack, + slots, + next, + lookarounds, + input, + at, + next_sid, ); } None @@ -1613,6 +1672,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, sid: StateID, @@ -1629,7 +1689,13 @@ impl PikeVM { } FollowEpsilon::Explore(sid) => { self.epsilon_closure_explore( - stack, curr_slots, next, input, at, sid, + stack, + curr_slots, + next, + lookarounds, + input, + at, + sid, ); } } @@ -1666,6 +1732,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, + lookarounds: &mut Vec, input: &Input<'_>, at: usize, mut sid: StateID, @@ -1705,6 +1772,16 @@ impl PikeVM { } sid = next; } + State::WriteLookaround { look_idx } => { + lookarounds[look_idx] = true; + return; + } + State::CheckLookaround { look_idx, positive, next } => { + if lookarounds[look_idx] != positive { + return; + } + sid = next; + } State::Union { ref alternates } => { sid = match alternates.get(0) { None => return, @@ -1886,6 +1963,9 @@ pub struct Cache { /// The next set of states we're building that will be explored for the /// next byte in the haystack. 
next: ActiveStates, + /// This answers the question: "Does lookaround assertion x hold at the + /// current position in the haystack" + lookaround: Vec, } impl Cache { @@ -1902,6 +1982,11 @@ impl Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), + lookaround: { + let mut res = Vec::new(); + res.resize(re.look_count(), false); + res + }, } } diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 22e38c94c..08839ceaf 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -251,6 +251,10 @@ pub(crate) fn next( | thompson::State::Fail | thompson::State::Look { .. } | thompson::State::Capture { .. } => {} + thompson::State::CheckLookaround { .. } + | thompson::State::WriteLookaround { .. } => { + todo!("check how to handle") + } thompson::State::Match { pattern_id } => { // Notice here that we are calling the NEW state a match // state if the OLD state we are transitioning from @@ -399,6 +403,10 @@ pub(crate) fn epsilon_closure( | thompson::State::Dense { .. } | thompson::State::Fail | thompson::State::Match { .. } => break, + thompson::State::WriteLookaround { .. } + | thompson::State::CheckLookaround { .. } => { + todo!("check how to handle") + } thompson::State::Look { look, next } => { if !look_have.contains(look) { break; @@ -465,6 +473,10 @@ pub(crate) fn add_nfa_states( builder.add_nfa_state_id(nfa_id); builder.set_look_need(|need| need.insert(look)); } + thompson::State::CheckLookaround { .. } + | thompson::State::WriteLookaround { .. } => { + todo!("check how to handle") + } thompson::State::Union { .. } | thompson::State::BinaryUnion { .. 
} => { // Pure epsilon transitions don't need to be tracked as part From f97aa92d810dff201fa66d70aa922341c049e49b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 6 Mar 2025 16:38:13 +0100 Subject: [PATCH 27/68] Implement lookaround compilation These changes implement the compilation of lookaround assertions from HIR to NFA. Subexpressions of lookaround assertions are patched to a top level reverse union. This is necessary so that the NFA will explore the innermost subexpression first and thereby make sure that all subexpression results are available when they need to be checked. I.e. any `WriteLookaround` state must be visited before any `CheckLookaround` state with the same index. --- regex-automata/src/nfa/thompson/builder.rs | 81 +++++++++++++++++++-- regex-automata/src/nfa/thompson/compiler.rs | 63 +++++++++++++++- regex-cli/cmd/generate/fowler.rs | 1 + 3 files changed, 134 insertions(+), 11 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 6b69e8784..8c6eb0e85 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -41,7 +41,9 @@ enum State { }, /// A state that only transitions to another state if the current input /// byte is in a particular range of bytes. - ByteRange { trans: Transition }, + ByteRange { + trans: Transition, + }, /// A state with possibly many transitions, represented in a sparse /// fashion. Transitions must be ordered lexicographically by input range /// and be non-overlapping. As such, this may only be used when every @@ -55,10 +57,15 @@ enum State { /// that `Sparse` is used for via `Union`. But this creates a more bloated /// NFA with more epsilon transitions than is necessary in the special case /// of character classes. - Sparse { transitions: Vec }, + Sparse { + transitions: Vec, + }, /// A conditional epsilon transition satisfied via some sort of /// look-around. 
- Look { look: Look, next: StateID }, + Look { + look: Look, + next: StateID, + }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to /// record position information for a capture group when using the NFA for @@ -91,10 +98,20 @@ enum State { /// The next state that this state should transition to. next: StateID, }, + WriteLookaround { + lookaround_index: usize, + }, + CheckLookaround { + lookaround_index: usize, + positive: bool, + next: StateID, + }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. - Union { alternates: Vec }, + Union { + alternates: Vec, + }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via later transitions are /// preferred over earlier transitions. @@ -110,7 +127,9 @@ enum State { /// to be amortized constant time. But if we used a `Union`, we'd need to /// prepend the state, which takes O(n) time. There are other approaches we /// could use to solve this, but this seems simple enough. - UnionReverse { alternates: Vec }, + UnionReverse { + alternates: Vec, + }, /// A state that cannot be transitioned out of. This is useful for cases /// where you want to prevent matching from occurring. For example, if your /// regex parser permits empty character classes, then one could choose a @@ -124,7 +143,9 @@ enum State { /// /// `pattern_id` refers to the ID of the pattern itself, which corresponds /// to the pattern's index (starting at 0). - Match { pattern_id: PatternID }, + Match { + pattern_id: PatternID, + }, } impl State { @@ -154,7 +175,9 @@ impl State { | State::CaptureStart { .. } | State::CaptureEnd { .. } | State::Fail - | State::Match { .. } => 0, + | State::Match { .. } + | State::CheckLookaround { .. 
} + | State::WriteLookaround { .. } => 0, State::Sparse { ref transitions } => { transitions.len() * mem::size_of::() } @@ -470,6 +493,22 @@ impl Builder { State::Look { look, next } => { remap[sid] = nfa.add(nfa::State::Look { look, next }); } + State::WriteLookaround { lookaround_index } => { + remap[sid] = nfa.add(nfa::State::WriteLookaround { + look_idx: lookaround_index, + }); + } + State::CheckLookaround { + lookaround_index, + positive, + next, + } => { + remap[sid] = nfa.add(nfa::State::CheckLookaround { + look_idx: lookaround_index, + positive, + next, + }); + } State::CaptureStart { pattern_id, group_index, next } => { // We can't remove this empty state because of the side // effect of capturing an offset for this capture slot. @@ -693,6 +732,30 @@ impl Builder { self.add(State::Empty { next: StateID::ZERO }) } + /// Add a state which will record that the lookaround with the given index + /// is satisfied at the current position. + pub fn add_write_lookaround( + &mut self, + index: usize, + ) -> Result { + self.add(State::WriteLookaround { lookaround_index: index }) + } + + /// Add a state which will check whether the lookaround with the given + /// index is satisfied at the current position. + pub fn add_check_lookaround( + &mut self, + index: usize, + positive: bool, + next: StateID, + ) -> Result { + self.add(State::CheckLookaround { + lookaround_index: index, + positive, + next, + }) + } + /// Add a "union" NFA state. /// /// A "union" NFA state that contains zero or more unconditional epsilon @@ -1159,6 +1222,9 @@ impl Builder { State::Look { ref mut next, .. } => { *next = to; } + State::CheckLookaround { ref mut next, .. } => { + *next = to; + } State::Union { ref mut alternates } => { alternates.push(to); self.memory_states += mem::size_of::(); @@ -1173,6 +1239,7 @@ impl Builder { State::CaptureEnd { ref mut next, .. } => { *next = to; } + State::WriteLookaround { .. } => {} State::Fail => {} State::Match { .. 
} => {} } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 9fe99be98..5f8229a9d 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -3,7 +3,7 @@ use core::{borrow::Borrow, cell::RefCell}; use alloc::{sync::Arc, vec, vec::Vec}; use regex_syntax::{ - hir::{self, Hir}, + hir::{self, Hir, LookAround}, utf8::{Utf8Range, Utf8Sequences}, ParserBuilder, }; @@ -711,6 +711,7 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, + lookaround_alt: RefCell>, } impl Compiler { @@ -723,6 +724,7 @@ impl Compiler { utf8_state: RefCell::new(Utf8State::new()), trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), + lookaround_alt: RefCell::new(None), } } @@ -977,11 +979,20 @@ impl Compiler { let compiled = self.c_alt_iter(exprs.iter().map(|e| { let _ = self.start_pattern()?; + let lookaround_prefix = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + let lookaround_alt = self.add_union_reverse()?; + self.patch(lookaround_prefix.end, lookaround_alt)?; + let top_level_alt = self.add_union()?; + self.patch(top_level_alt, lookaround_prefix.start)?; + self.lookaround_alt.borrow_mut().replace(lookaround_alt); let one = self.c_cap(0, None, e.borrow())?; let match_state_id = self.add_match()?; self.patch(one.end, match_state_id)?; - let _ = self.finish_pattern(one.start)?; - Ok(ThompsonRef { start: one.start, end: match_state_id }) + self.patch(top_level_alt, one.start)?; + let _ = self.finish_pattern(top_level_alt)?; + self.lookaround_alt.borrow_mut().take(); + Ok(ThompsonRef { start: top_level_alt, end: match_state_id }) }))?; self.patch(unanchored_prefix.end, compiled.start)?; let nfa = self @@ -1003,7 +1014,7 @@ impl Compiler { Class(Class::Bytes(ref c)) => self.c_byte_class(c), Class(Class::Unicode(ref c)) => 
self.c_unicode_class(c), Look(ref look) => self.c_look(look), - LookAround(_) => todo!("implement lookaround NFA compilation"), + LookAround(ref lookaround) => self.c_lookaround(lookaround), Repetition(ref rep) => self.c_repetition(rep), Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), @@ -1011,6 +1022,31 @@ impl Compiler { } } + fn c_lookaround( + &self, + lookaround: &LookAround, + ) -> Result { + let sub = match lookaround { + LookAround::NegativeLookBehind(ref sub) + | LookAround::PositiveLookBehind(ref sub) => self.c(sub)?, + }; + let pos = match lookaround { + LookAround::NegativeLookBehind(_) => false, + LookAround::PositiveLookBehind(_) => true, + }; + let idx = todo!("get index"); + let check = self.add_check_lookaround(idx, pos)?; + let write = self.add_write_lookaround(idx)?; + self.patch(sub.end, write)?; + self.patch( + self.lookaround_alt + .borrow() + .expect("Cannot compile lookaround outside pattern"), + sub.start, + )?; + Ok(ThompsonRef { start: check, end: check }) + } + /// Compile a concatenation of the sub-expressions yielded by the given /// iterator. If the iterator yields no elements, then this compiles down /// to an "empty" state that always matches. 
@@ -1631,6 +1667,25 @@ impl Compiler { self.builder.borrow_mut().add_empty() } + fn add_write_lookaround( + &self, + index: usize, + ) -> Result { + self.builder.borrow_mut().add_write_lookaround(index) + } + + fn add_check_lookaround( + &self, + index: usize, + positive: bool, + ) -> Result { + self.builder.borrow_mut().add_check_lookaround( + index, + positive, + StateID::ZERO, + ) + } + fn add_range(&self, start: u8, end: u8) -> Result { self.builder.borrow_mut().add_range(Transition { start, diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index 052d59ef8..70db71fb0 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -421,5 +421,6 @@ fn count_capturing_groups_ast(ast: &regex_syntax::ast::Ast) -> usize { Ast::Concat(ref concat) => { concat.asts.iter().map(count_capturing_groups_ast).sum() } + Ast::LookAround(_) => todo!(), } } From ccdab18dedda21da45d01ee67f72de2750140e89 Mon Sep 17 00:00:00 2001 From: Robin Date: Tue, 11 Mar 2025 17:31:52 +0100 Subject: [PATCH 28/68] Restore compilation behaviour for regexes without lookarounds The machinery necessary to perform the parallel lookbehind checking should only be compiled in when there is actually a lookbehind expression in the regex. This restores compilation to the expected outputs for regexes without lookbehind expressions. 
--- regex-automata/src/nfa/thompson/compiler.rs | 28 +++++++++++++++------ regex-syntax/src/hir/mod.rs | 26 +++++++++++++++++++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 5f8229a9d..6e369f976 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -979,17 +979,29 @@ impl Compiler { let compiled = self.c_alt_iter(exprs.iter().map(|e| { let _ = self.start_pattern()?; - let lookaround_prefix = - self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let lookaround_alt = self.add_union_reverse()?; - self.patch(lookaround_prefix.end, lookaround_alt)?; - let top_level_alt = self.add_union()?; - self.patch(top_level_alt, lookaround_prefix.start)?; - self.lookaround_alt.borrow_mut().replace(lookaround_alt); + let has_lookarounds = + (e.borrow() as &Hir).properties().contains_lookaround_expr(); + let mut top_level_alt = if has_lookarounds { + self.add_union()? 
+ } else { + StateID::ZERO + }; + if has_lookarounds { + let lookaround_prefix = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + let lookaround_alt = self.add_union_reverse()?; + self.patch(lookaround_prefix.end, lookaround_alt)?; + self.patch(top_level_alt, lookaround_prefix.start)?; + self.lookaround_alt.borrow_mut().replace(lookaround_alt); + } let one = self.c_cap(0, None, e.borrow())?; let match_state_id = self.add_match()?; self.patch(one.end, match_state_id)?; - self.patch(top_level_alt, one.start)?; + if has_lookarounds { + self.patch(top_level_alt, one.start)?; + } else { + top_level_alt = one.start; + } let _ = self.finish_pattern(top_level_alt)?; self.lookaround_alt.borrow_mut().take(); Ok(ThompsonRef { start: top_level_alt, end: match_state_id }) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 28c660031..41311371d 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2041,6 +2041,7 @@ struct PropertiesI { look_set_suffix: LookSet, look_set_prefix_any: LookSet, look_set_suffix_any: LookSet, + contains_lookaround_expr: bool, utf8: bool, explicit_captures_len: usize, static_explicit_captures_len: Option, @@ -2134,6 +2135,17 @@ impl Properties { self.0.look_set_suffix_any } + /// Returns whether there are any look-around expressions in this HIR value. + /// + /// Only returns true for [`HirKind::LookAround`] and not for + /// [`HirKind::Look`], which can be queried by [`look_set`] instead. + /// Currently, only lookbehind assertions without capture groups are + /// supported. + #[inline] + pub fn contains_lookaround_expr(&self) -> bool { + self.0.contains_lookaround_expr + } + /// Return true if and only if the corresponding HIR will always match /// valid UTF-8. 
/// @@ -2403,6 +2415,7 @@ impl Properties { look_set_suffix: fix, look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: true, explicit_captures_len: 0, static_explicit_captures_len, @@ -2418,6 +2431,8 @@ impl Properties { props.look_set_suffix.set_intersect(p.look_set_suffix()); props.look_set_prefix_any.set_union(p.look_set_prefix_any()); props.look_set_suffix_any.set_union(p.look_set_suffix_any()); + props.contains_lookaround_expr = + props.contains_lookaround_expr || p.contains_lookaround_expr(); props.utf8 = props.utf8 && p.is_utf8(); props.explicit_captures_len = props .explicit_captures_len @@ -2465,6 +2480,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, // It is debatable whether an empty regex always matches at valid // UTF-8 boundaries. Strictly speaking, at a byte oriented view, // it is clearly false. There are, for example, many empty strings @@ -2501,6 +2517,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: core::str::from_utf8(&lit.0).is_ok(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2520,6 +2537,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: class.is_utf8(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2539,6 +2557,9 @@ impl Properties { look_set_suffix: LookSet::singleton(look), look_set_prefix_any: LookSet::singleton(look), look_set_suffix_any: LookSet::singleton(look), + // Note, this field represents _general_ lookarounds (ones using + // LookAround) and not simple ones (using Look). + contains_lookaround_expr: false, // This requires a little explanation. 
Basically, we don't consider // matching an empty string to be equivalent to matching invalid // UTF-8, even though technically matching every empty string will @@ -2569,6 +2590,7 @@ impl Properties { maximum_len: Some(0), literal: false, alternation_literal: false, + contains_lookaround_expr: true, ..*sub_p.0.clone() }; Properties(Box::new(inner)) @@ -2595,6 +2617,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: p.look_set_prefix_any(), look_set_suffix_any: p.look_set_suffix_any(), + contains_lookaround_expr: p.contains_lookaround_expr(), utf8: p.is_utf8(), explicit_captures_len: p.explicit_captures_len(), static_explicit_captures_len: p.static_explicit_captures_len(), @@ -2656,6 +2679,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: true, explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2667,6 +2691,8 @@ impl Properties { let p = x.properties(); props.look_set.set_union(p.look_set()); props.utf8 = props.utf8 && p.is_utf8(); + props.contains_lookaround_expr = + props.contains_lookaround_expr || p.contains_lookaround_expr(); props.explicit_captures_len = props .explicit_captures_len .saturating_add(p.explicit_captures_len()); From 519d13dc3638640c95bd0feeb2b7435b81e40dab Mon Sep 17 00:00:00 2001 From: Robin Date: Tue, 11 Mar 2025 18:06:11 +0100 Subject: [PATCH 29/68] Address review comments --- regex-automata/src/nfa/thompson/builder.rs | 40 ++++++++------------- regex-automata/src/nfa/thompson/compiler.rs | 6 ++-- regex-automata/src/nfa/thompson/nfa.rs | 19 +++++----- regex-automata/src/nfa/thompson/pikevm.rs | 6 ++-- 4 files changed, 32 insertions(+), 39 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 8c6eb0e85..f9119f537 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ 
b/regex-automata/src/nfa/thompson/builder.rs @@ -41,9 +41,7 @@ enum State { }, /// A state that only transitions to another state if the current input /// byte is in a particular range of bytes. - ByteRange { - trans: Transition, - }, + ByteRange { trans: Transition }, /// A state with possibly many transitions, represented in a sparse /// fashion. Transitions must be ordered lexicographically by input range /// and be non-overlapping. As such, this may only be used when every @@ -57,15 +55,10 @@ enum State { /// that `Sparse` is used for via `Union`. But this creates a more bloated /// NFA with more epsilon transitions than is necessary in the special case /// of character classes. - Sparse { - transitions: Vec, - }, + Sparse { transitions: Vec }, /// A conditional epsilon transition satisfied via some sort of /// look-around. - Look { - look: Look, - next: StateID, - }, + Look { look: Look, next: StateID }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to /// record position information for a capture group when using the NFA for @@ -98,20 +91,21 @@ enum State { /// The next state that this state should transition to. next: StateID, }, - WriteLookaround { - lookaround_index: usize, - }, + /// An empty state that behaves analogously to a `Match` state but for + /// the look-around sub-expression with the given index. + WriteLookaround { lookaround_index: SmallIndex }, + /// A conditional epsilon transition that will only be taken if the + /// look-around sub-expression with the given index evaluates to `positive` + /// at the current position in the haystack. CheckLookaround { - lookaround_index: usize, + lookaround_index: SmallIndex, positive: bool, next: StateID, }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. 
- Union { - alternates: Vec, - }, + Union { alternates: Vec }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via later transitions are /// preferred over earlier transitions. @@ -127,9 +121,7 @@ enum State { /// to be amortized constant time. But if we used a `Union`, we'd need to /// prepend the state, which takes O(n) time. There are other approaches we /// could use to solve this, but this seems simple enough. - UnionReverse { - alternates: Vec, - }, + UnionReverse { alternates: Vec }, /// A state that cannot be transitioned out of. This is useful for cases /// where you want to prevent matching from occurring. For example, if your /// regex parser permits empty character classes, then one could choose a @@ -143,9 +135,7 @@ enum State { /// /// `pattern_id` refers to the ID of the pattern itself, which corresponds /// to the pattern's index (starting at 0). - Match { - pattern_id: PatternID, - }, + Match { pattern_id: PatternID }, } impl State { @@ -736,7 +726,7 @@ impl Builder { /// is satisfied at the current position. pub fn add_write_lookaround( &mut self, - index: usize, + index: SmallIndex, ) -> Result { self.add(State::WriteLookaround { lookaround_index: index }) } @@ -745,7 +735,7 @@ impl Builder { /// index is satisfied at the current position. 
pub fn add_check_lookaround( &mut self, - index: usize, + index: SmallIndex, positive: bool, next: StateID, ) -> Result { diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 6e369f976..ae1105280 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -19,7 +19,7 @@ use crate::{ }, util::{ look::{Look, LookMatcher}, - primitives::{PatternID, StateID}, + primitives::{PatternID, SmallIndex, StateID}, }, }; @@ -1681,14 +1681,14 @@ impl Compiler { fn add_write_lookaround( &self, - index: usize, + index: SmallIndex, ) -> Result { self.builder.borrow_mut().add_write_lookaround(index) } fn add_check_lookaround( &self, - index: usize, + index: SmallIndex, positive: bool, ) -> Result { self.builder.borrow_mut().add_check_lookaround( diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index e6c81345d..8e38168df 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1102,8 +1102,8 @@ impl NFA { /// Returns how many lookaround sub-expressions this nfa contains #[inline] - pub fn look_count(&self) -> usize { - self.0.look_count + pub fn lookaround_count(&self) -> SmallIndex { + self.0.lookaround_count } // FIXME: The `look_set_prefix_all` computation was not correct, and it @@ -1266,7 +1266,10 @@ pub(super) struct Inner { /// zero-length prefix for any of the patterns in this NFA. look_set_prefix_all: LookSet, */ - look_count: usize, + /// How many look-around expression this NFA contains. + /// This is needed to initialize the table for storing the result of + /// look-around evaluation + lookaround_count: SmallIndex, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). 
Since each state /// might use a different amount of heap, we need to keep track of this @@ -1384,7 +1387,7 @@ impl Inner { } State::CheckLookaround { look_idx, .. } | State::WriteLookaround { look_idx } => { - self.look_count = self.look_count.max(look_idx); + self.lookaround_count = self.lookaround_count.max(look_idx); } State::Union { .. } | State::BinaryUnion { .. } @@ -1565,7 +1568,7 @@ pub enum State { /// index `look_idx` WriteLookaround { /// The index of the lookaround expression that matches - look_idx: usize, + look_idx: SmallIndex, }, /// This indicates that we need to check whether lookaround expression with /// index `look_idx` holds at the current position in the haystack @@ -1573,7 +1576,7 @@ pub enum State { /// hence must NOT hold. CheckLookaround { /// The index of the lookaround expression that must be satisfied - look_idx: usize, + look_idx: SmallIndex, /// Whether this is a positive lookaround expression positive: bool, /// The next state to transition if the lookaround assertion is satisfied @@ -1791,13 +1794,13 @@ impl fmt::Debug for State { write!(f, "{:?} => {:?}", look, next.as_usize()) } State::WriteLookaround { look_idx } => { - write!(f, "Write Lookaround: {}", look_idx) + write!(f, "Write Lookaround: {}", look_idx.as_u32()) } State::CheckLookaround { look_idx, positive, next } => { write!( f, "Check Lookaround {} is {} => {}", - look_idx, + look_idx.as_u32(), positive, next.as_usize() ) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index ccdd124a0..bc5c3accb 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1216,8 +1216,8 @@ impl PikeVM { } impl PikeVM { - fn look_count(&self) -> usize { - self.nfa.look_count() + fn lookaround_count(&self) -> SmallIndex { + self.nfa.lookaround_count() } /// The implementation of standard leftmost search. 
@@ -1984,7 +1984,7 @@ impl Cache { next: ActiveStates::new(re), lookaround: { let mut res = Vec::new(); - res.resize(re.look_count(), false); + res.resize(re.lookaround_count().as_usize(), false); res }, } From cccfc23c62bfba4e4637e4fd9a5dc59ad189b14c Mon Sep 17 00:00:00 2001 From: Robin Date: Tue, 11 Mar 2025 18:34:17 +0100 Subject: [PATCH 30/68] Implement look-around index generation --- regex-automata/src/nfa/thompson/compiler.rs | 13 +++++++++++- regex-automata/src/nfa/thompson/error.rs | 23 ++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index ae1105280..493a0df59 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -711,7 +711,13 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, + /// Top level alternation state which is used to run all look-around + /// assertion checks in lockstep with the main expression. Each look-around + /// expression is compiled to a set of states that is patched into this + /// state, and this state is updated on each new pattern being compiled. lookaround_alt: RefCell>, + /// The next index to use for a look-around expression. 
+ lookaround_index: RefCell, } impl Compiler { @@ -725,6 +731,7 @@ impl Compiler { trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), lookaround_alt: RefCell::new(None), + lookaround_index: RefCell::new(SmallIndex::ZERO), } } @@ -1046,7 +1053,11 @@ impl Compiler { LookAround::NegativeLookBehind(_) => false, LookAround::PositiveLookBehind(_) => true, }; - let idx = todo!("get index"); + let idx = *self.lookaround_index.borrow(); + *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) + .map_err(|e| { + BuildError::too_many_lookarounds(e.attempted() as usize) + })?; let check = self.add_check_lookaround(idx, pos)?; let write = self.add_write_lookaround(idx)?; self.patch(sub.end, write)?; diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs index e29006586..fa00eb7dd 100644 --- a/regex-automata/src/nfa/thompson/error.rs +++ b/regex-automata/src/nfa/thompson/error.rs @@ -1,6 +1,6 @@ use crate::util::{ captures, look, - primitives::{PatternID, StateID}, + primitives::{PatternID, SmallIndex, StateID}, }; /// An error that can occurred during the construction of a thompson NFA. @@ -55,6 +55,14 @@ enum BuildErrorKind { /// The limit on the number of states. limit: usize, }, + /// An error that occurs if too many indices need to be generated for + /// look-around sub-expressions while building an NFA. + TooManyLookArounds { + /// The number of sub-expressions that exceeded the limit. + given: usize, + /// The limit on the number of sub-expressions. + limit: usize, + }, /// An error that occurs when NFA compilation exceeds a configured heap /// limit. 
ExceededSizeLimit { @@ -115,6 +123,13 @@ impl BuildError { BuildError { kind: BuildErrorKind::TooManyStates { given, limit } } } + pub(crate) fn too_many_lookarounds(given: usize) -> BuildError { + let limit = SmallIndex::LIMIT; + BuildError { + kind: BuildErrorKind::TooManyLookArounds { given, limit }, + } + } + pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError { BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } } @@ -164,6 +179,12 @@ impl core::fmt::Display for BuildError { which exceeds the limit of {}", given, limit, ), + BuildErrorKind::TooManyLookArounds { given, limit } => write!( + f, + "attempted to compile {} look-around expressions, \ + which exceeds the limit of {}", + given, limit, + ), BuildErrorKind::ExceededSizeLimit { limit } => write!( f, "heap usage during NFA compilation exceeded limit of {}", From 9c33cca41af22828f794ac1ca68c392d64313f61 Mon Sep 17 00:00:00 2001 From: Robin Date: Tue, 11 Mar 2025 18:52:59 +0100 Subject: [PATCH 31/68] Change tracking of look-around state to index This makes it so we don't need to reset the lookaround state on each character advancement. 
--- regex-automata/src/nfa/thompson/pikevm.rs | 28 ++++++++++++++--------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index bc5c3accb..a8406daa2 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1490,7 +1490,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, slots: &mut [Option], @@ -1527,7 +1527,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, patset: &mut PatternSet, @@ -1581,7 +1581,7 @@ impl PikeVM { stack: &mut Vec, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, sid: StateID, @@ -1672,7 +1672,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, sid: StateID, @@ -1732,7 +1732,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, - lookarounds: &mut Vec, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, mut sid: StateID, @@ -1773,11 +1773,16 @@ impl PikeVM { sid = next; } State::WriteLookaround { look_idx } => { - lookarounds[look_idx] = true; + // This is ok since `at` is always less than `usize::MAX`. 
+ lookarounds[look_idx] = NonMaxUsize::new(at); return; } State::CheckLookaround { look_idx, positive, next } => { - if lookarounds[look_idx] != positive { + let state = match lookarounds[look_idx] { + None => usize::MAX, + Some(pos) => pos.get(), + }; + if (state == at) != positive { return; } sid = next; @@ -1963,9 +1968,10 @@ pub struct Cache { /// The next set of states we're building that will be explored for the /// next byte in the haystack. next: ActiveStates, - /// This answers the question: "Does lookaround assertion x hold at the - /// current position in the haystack" - lookaround: Vec<bool>, + /// This answers the question: "What is the maximum position in the + /// haystack at which lookaround assertion x holds and which is <= to the + /// current position" + lookaround: Vec<Option<NonMaxUsize>>, } impl Cache { @@ -1984,7 +1990,7 @@ impl Cache { next: ActiveStates::new(re), lookaround: { let mut res = Vec::new(); - res.resize(re.lookaround_count().as_usize(), false); + res.resize(re.lookaround_count().as_usize(), None); res }, } From e94394cf67ba90bf2ea6b186d405bc671ec10649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 13 Mar 2025 11:19:21 +0100 Subject: [PATCH 32/68] Fix cli tool and AST->HIR translation --- regex-cli/cmd/generate/fowler.rs | 1 - regex-syntax/src/hir/translate.rs | 74 +++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index 70db71fb0..052d59ef8 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -421,6 +421,5 @@ fn count_capturing_groups_ast(ast: &regex_syntax::ast::Ast) -> usize { Ast::Concat(ref concat) => { concat.asts.iter().map(count_capturing_groups_ast).sum() } - Ast::LookAround(_) => todo!(), } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 1c7a48947..4fd450560 100644 --- a/regex-syntax/src/hir/translate.rs +++
b/regex-syntax/src/hir/translate.rs @@ -212,6 +212,13 @@ enum HirFrame { /// This sentinel only exists to stop other things (like flattening /// literals) from reaching across repetition operators. Repetition, + /// This is pushed whenever a look-around expression is observed. After + /// visiting the sub-expression in the look-around, the translator's stack + /// is expected to have this sentinel at the top. + /// + /// This sentinel only exists to stop other things (like flattening + /// literals) from reaching across look-around operators. + LookAround, /// This is pushed on to the stack upon first seeing any kind of capture, /// indicated by parentheses (including non-capturing groups). It is popped /// upon leaving a group. @@ -298,6 +305,18 @@ impl HirFrame { } } + fn unwrap_lookaround(self) { + match self { + HirFrame::LookAround => {} + _ => { + panic!( + "tried to unwrap look-around from HirFrame, got: {:?}", + self + ) + } + } + } + /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). 
@@ -363,6 +382,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::AlternationBranch); } } + Ast::LookAround(_) => self.push(HirFrame::LookAround), _ => {} } Ok(()) @@ -448,6 +468,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::LookAround(ref x) => { let expr = Box::new(self.pop().unwrap().unwrap_expr()); + self.pop().unwrap().unwrap_lookaround(); self.push(HirFrame::Expr(Hir::lookaround(match x.kind { ast::LookAroundKind::PositiveLookBehind => { hir::LookAround::PositiveLookBehind(expr) @@ -770,6 +791,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { HirFrame::AlternationBranch => { unreachable!("expected expr or concat, got alt branch marker") } + HirFrame::LookAround => { + unreachable!("expected expr or concat, got look-around") + } } } @@ -801,6 +825,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { HirFrame::AlternationBranch => { unreachable!("expected expr or alt, got alt branch marker") } + HirFrame::LookAround => { + unreachable!("expected expr or alt, got look-around") + } } } @@ -1612,6 +1639,15 @@ mod tests { Hir::look(look) } + fn hir_lookbehind(expr: Hir, positive: bool) -> Hir { + let lookaround = if positive { + hir::LookAround::PositiveLookBehind(Box::new(expr)) + } else { + hir::LookAround::NegativeLookBehind(Box::new(expr)) + }; + Hir::lookaround(lookaround) + } + #[test] fn empty() { assert_eq!(t(""), Hir::empty()); @@ -1835,6 +1871,44 @@ mod tests { assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); } + #[test] + fn lookarounds() { + assert_eq!(t("(?<=a)"), hir_lookbehind(hir_lit("a"), true)); + assert_eq!(t("(? 
Date: Thu, 13 Mar 2025 11:52:48 +0100 Subject: [PATCH 33/68] Fix lookaround union order --- regex-automata/src/nfa/thompson/compiler.rs | 50 ++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 493a0df59..f16cbce7d 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -996,7 +996,7 @@ impl Compiler { if has_lookarounds { let lookaround_prefix = self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let lookaround_alt = self.add_union_reverse()?; + let lookaround_alt = self.add_union()?; self.patch(lookaround_prefix.end, lookaround_alt)?; self.patch(top_level_alt, lookaround_prefix.start)?; self.lookaround_alt.borrow_mut().replace(lookaround_alt); @@ -2037,6 +2037,22 @@ mod tests { } } + fn s_write_lookaround(id: usize) -> State { + State::WriteLookaround { + look_idx: SmallIndex::new(id) + .expect("look-around index too large"), + } + } + + fn s_check_lookaround(id: usize, positive: bool, next: usize) -> State { + State::CheckLookaround { + look_idx: SmallIndex::new(id) + .expect("look-around index too large"), + positive, + next: sid(next), + } + } + fn s_fail() -> State { State::Fail } @@ -2262,6 +2278,38 @@ mod tests { ); } + #[test] + fn compile_lookbehind() { + assert_eq!( + build(r"(?<=a)").states(), + &[ + s_bin_union(1, 4), + s_bin_union(3, 2), + s_range(b'\x00', b'\xFF', 1), + s_byte(b'a', 5), + s_check_lookaround(0, true, 6), + s_write_lookaround(0), + s_match(0) + ] + ); + assert_eq!( + build(r"(?<=a(? Date: Tue, 18 Mar 2025 13:17:59 +0100 Subject: [PATCH 34/68] Address review comments Rename certain enums to be consistent with rest of codebase. 
--- regex-automata/src/dfa/onepass.rs | 4 +- regex-automata/src/nfa/thompson/backtrack.rs | 4 +- regex-automata/src/nfa/thompson/builder.rs | 28 ++++----- regex-automata/src/nfa/thompson/compiler.rs | 15 ++--- regex-automata/src/nfa/thompson/nfa.rs | 61 +++++++++++--------- regex-automata/src/nfa/thompson/pikevm.rs | 18 +++--- regex-automata/src/util/determinize/mod.rs | 12 ++-- regex-syntax/src/hir/mod.rs | 2 - 8 files changed, 72 insertions(+), 72 deletions(-) diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 3a175bce3..4dbb9b50e 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -638,8 +638,8 @@ impl<'a> InternalBuilder<'a> { self.stack_push(nfa_id, Epsilons::empty())?; while let Some((id, epsilons)) = self.stack.pop() { match *self.nfa.state(id) { - thompson::State::WriteLookaround { .. } - | thompson::State::CheckLookaround { .. } => { + thompson::State::WriteLookAround { .. } + | thompson::State::CheckLookAround { .. } => { todo!("check how to handle") } thompson::State::ByteRange { ref trans } => { diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index b63a47fd5..be0cbcfbd 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -1519,8 +1519,8 @@ impl BoundedBacktracker { } sid = next; } - State::WriteLookaround { .. } - | State::CheckLookaround { .. } => { + State::WriteLookAround { .. } + | State::CheckLookAround { .. 
} => { todo!("check how to handle") } State::Union { ref alternates } => { diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index f9119f537..748d1d01c 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -93,11 +93,11 @@ enum State { }, /// An empty state that behaves analogously to a `Match` state but for /// the look-around sub-expression with the given index. - WriteLookaround { lookaround_index: SmallIndex }, + WriteLookAround { lookaround_index: SmallIndex }, /// A conditional epsilon transition that will only be taken if the /// look-around sub-expression with the given index evaluates to `positive` /// at the current position in the haystack. - CheckLookaround { + CheckLookAround { lookaround_index: SmallIndex, positive: bool, next: StateID, @@ -166,8 +166,8 @@ impl State { | State::CaptureEnd { .. } | State::Fail | State::Match { .. } - | State::CheckLookaround { .. } - | State::WriteLookaround { .. } => 0, + | State::CheckLookAround { .. } + | State::WriteLookAround { .. 
} => 0, State::Sparse { ref transitions } => { transitions.len() * mem::size_of::() } @@ -483,18 +483,18 @@ impl Builder { State::Look { look, next } => { remap[sid] = nfa.add(nfa::State::Look { look, next }); } - State::WriteLookaround { lookaround_index } => { - remap[sid] = nfa.add(nfa::State::WriteLookaround { - look_idx: lookaround_index, + State::WriteLookAround { lookaround_index } => { + remap[sid] = nfa.add(nfa::State::WriteLookAround { + lookaround_idx: lookaround_index, }); } - State::CheckLookaround { + State::CheckLookAround { lookaround_index, positive, next, } => { - remap[sid] = nfa.add(nfa::State::CheckLookaround { - look_idx: lookaround_index, + remap[sid] = nfa.add(nfa::State::CheckLookAround { + lookaround_idx: lookaround_index, positive, next, }); @@ -728,7 +728,7 @@ impl Builder { &mut self, index: SmallIndex, ) -> Result { - self.add(State::WriteLookaround { lookaround_index: index }) + self.add(State::WriteLookAround { lookaround_index: index }) } /// Add a state which will check whether the lookaround with the given @@ -739,7 +739,7 @@ impl Builder { positive: bool, next: StateID, ) -> Result { - self.add(State::CheckLookaround { + self.add(State::CheckLookAround { lookaround_index: index, positive, next, @@ -1212,7 +1212,7 @@ impl Builder { State::Look { ref mut next, .. } => { *next = to; } - State::CheckLookaround { ref mut next, .. } => { + State::CheckLookAround { ref mut next, .. } => { *next = to; } State::Union { ref mut alternates } => { @@ -1229,7 +1229,7 @@ impl Builder { State::CaptureEnd { ref mut next, .. } => { *next = to; } - State::WriteLookaround { .. } => {} + State::WriteLookAround { .. } => {} State::Fail => {} State::Match { .. 
} => {} } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index f16cbce7d..334f86a12 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1045,10 +1045,7 @@ impl Compiler { &self, lookaround: &LookAround, ) -> Result { - let sub = match lookaround { - LookAround::NegativeLookBehind(ref sub) - | LookAround::PositiveLookBehind(ref sub) => self.c(sub)?, - }; + let sub = self.c(lookaround.sub()); let pos = match lookaround { LookAround::NegativeLookBehind(_) => false, LookAround::PositiveLookBehind(_) => true, @@ -1064,7 +1061,7 @@ impl Compiler { self.patch( self.lookaround_alt .borrow() - .expect("Cannot compile lookaround outside pattern"), + .expect("Cannot compile look-around outside pattern"), sub.start, )?; Ok(ThompsonRef { start: check, end: check }) @@ -2038,15 +2035,15 @@ mod tests { } fn s_write_lookaround(id: usize) -> State { - State::WriteLookaround { - look_idx: SmallIndex::new(id) + State::WriteLookAround { + lookaround_idx: SmallIndex::new(id) .expect("look-around index too large"), } } fn s_check_lookaround(id: usize, positive: bool, next: usize) -> State { - State::CheckLookaround { - look_idx: SmallIndex::new(id) + State::CheckLookAround { + lookaround_idx: SmallIndex::new(id) .expect("look-around index too large"), positive, next: sid(next), diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 8e38168df..0833fdf33 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1100,7 +1100,7 @@ impl NFA { self.0.look_set_prefix_any } - /// Returns how many lookaround sub-expressions this nfa contains + /// Returns how many look-around sub-expressions this nfa contains #[inline] pub fn lookaround_count(&self) -> SmallIndex { self.0.lookaround_count @@ -1299,8 +1299,8 @@ impl Inner { State::ByteRange { .. } | State::Dense { .. 
} | State::Fail - | State::WriteLookaround { .. } => continue, - State::CheckLookaround { next, .. } => { + | State::WriteLookAround { .. } => continue, + State::CheckLookAround { next, .. } => { stack.push(next); } State::Sparse(_) => { @@ -1385,8 +1385,8 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } - State::CheckLookaround { look_idx, .. } - | State::WriteLookaround { look_idx } => { + State::CheckLookAround { lookaround_idx: look_idx, .. } + | State::WriteLookAround { lookaround_idx: look_idx } => { self.lookaround_count = self.lookaround_count.max(look_idx); } State::Union { .. } @@ -1563,23 +1563,24 @@ pub enum State { /// satisfied. next: StateID, }, - /// This is like a match state but for a lookaround expression - /// executing this state will write a `true` into the lookaround oracle at - /// index `look_idx` - WriteLookaround { - /// The index of the lookaround expression that matches - look_idx: SmallIndex, + /// This is like a match state but for a look-around expression. + /// Executing this state will write the current haystack offset into the + /// look-around oracle at index `lookaround_idx`. + WriteLookAround { + /// The index of the look-around expression that matches. + lookaround_idx: SmallIndex, }, /// This indicates that we need to check whether lookaround expression with - /// index `look_idx` holds at the current position in the haystack + /// index `lookaround_idx` holds at the current position in the haystack /// If `positive` is false, then the lookaround expression is negative and /// hence must NOT hold. - CheckLookaround { - /// The index of the lookaround expression that must be satisfied - look_idx: SmallIndex, - /// Whether this is a positive lookaround expression + CheckLookAround { + /// The index of the look-around expression that must be satisfied. + lookaround_idx: SmallIndex, + /// Whether this is a positive lookaround expression. 
positive: bool, - /// The next state to transition if the lookaround assertion is satisfied + /// The next state to transition if the look-around assertion is + /// satisfied. next: StateID, }, /// An alternation such that there exists an epsilon transition to all @@ -1696,12 +1697,12 @@ impl State { | State::Dense { .. } | State::Fail | State::Match { .. } - | State::WriteLookaround { .. } => false, + | State::WriteLookAround { .. } => false, State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } | State::Capture { .. } - | State::CheckLookaround { .. } => true, + | State::CheckLookAround { .. } => true, } } @@ -1714,8 +1715,8 @@ impl State { | State::Capture { .. } | State::Match { .. } | State::Fail - | State::WriteLookaround { .. } - | State::CheckLookaround { .. } => 0, + | State::WriteLookAround { .. } + | State::CheckLookAround { .. } => 0, State::Sparse(SparseTransitions { ref transitions }) => { transitions.len() * mem::size_of::() } @@ -1748,7 +1749,7 @@ impl State { } } State::Look { ref mut next, .. } => *next = remap[*next], - State::CheckLookaround { ref mut next, .. } => { + State::CheckLookAround { ref mut next, .. } => { *next = remap[*next] } State::Union { ref mut alternates } => { @@ -1763,7 +1764,7 @@ impl State { State::Capture { ref mut next, .. } => *next = remap[*next], State::Fail | State::Match { .. } - | State::WriteLookaround { .. } => {} + | State::WriteLookAround { .. 
} => {} } } } @@ -1793,15 +1794,19 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } - State::WriteLookaround { look_idx } => { - write!(f, "Write Lookaround: {}", look_idx.as_u32()) + State::WriteLookAround { lookaround_idx: look_idx } => { + write!(f, "write-look-around({})", look_idx.as_u32()) } - State::CheckLookaround { look_idx, positive, next } => { + State::CheckLookAround { + lookaround_idx: look_idx, + positive, + next, + } => { write!( f, - "Check Lookaround {} is {} => {}", + "check-look-around({} is {}) => {}", look_idx.as_u32(), - positive, + if positive { "matched" } else { "not matched" }, next.as_usize() ) } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index a8406daa2..ceb6602bc 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1593,8 +1593,8 @@ impl PikeVM { | State::Union { .. } | State::BinaryUnion { .. } | State::Capture { .. } - | State::WriteLookaround { .. } - | State::CheckLookaround { .. } => None, + | State::WriteLookAround { .. } + | State::CheckLookAround { .. } => None, State::ByteRange { ref trans } => { if trans.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); @@ -1772,12 +1772,16 @@ impl PikeVM { } sid = next; } - State::WriteLookaround { look_idx } => { + State::WriteLookAround { lookaround_idx: look_idx } => { // This is ok since `at` is always less than `usize::MAX`. 
lookarounds[look_idx] = NonMaxUsize::new(at); return; } - State::CheckLookaround { look_idx, positive, next } => { + State::CheckLookAround { + lookaround_idx: look_idx, + positive, + next, + } => { let state = match lookarounds[look_idx] { None => usize::MAX, Some(pos) => pos.get(), @@ -1988,11 +1992,7 @@ impl Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), - lookaround: { - let mut res = Vec::new(); - res.resize(re.lookaround_count().as_usize(), None); - res - }, + lookaround: vec![None; re.lookaround_count().as_usize()], } } diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 08839ceaf..729d293ed 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -251,8 +251,8 @@ pub(crate) fn next( | thompson::State::Fail | thompson::State::Look { .. } | thompson::State::Capture { .. } => {} - thompson::State::CheckLookaround { .. } - | thompson::State::WriteLookaround { .. } => { + thompson::State::CheckLookAround { .. } + | thompson::State::WriteLookAround { .. } => { todo!("check how to handle") } thompson::State::Match { pattern_id } => { @@ -403,8 +403,8 @@ pub(crate) fn epsilon_closure( | thompson::State::Dense { .. } | thompson::State::Fail | thompson::State::Match { .. } => break, - thompson::State::WriteLookaround { .. } - | thompson::State::CheckLookaround { .. } => { + thompson::State::WriteLookAround { .. } + | thompson::State::CheckLookAround { .. } => { todo!("check how to handle") } thompson::State::Look { look, next } => { @@ -473,8 +473,8 @@ pub(crate) fn add_nfa_states( builder.add_nfa_state_id(nfa_id); builder.set_look_need(|need| need.insert(look)); } - thompson::State::CheckLookaround { .. } - | thompson::State::WriteLookaround { .. } => { + thompson::State::CheckLookAround { .. } + | thompson::State::WriteLookAround { .. } => { todo!("check how to handle") } thompson::State::Union { .. 
} diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 41311371d..debc5dcb4 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2139,8 +2139,6 @@ impl Properties { /// /// Only returns true for [`HirKind::LookAround`] and not for /// [`HirKind::Look`], which can be queried by [`look_set`] instead. - /// Currently, only lookbehind assertions without capture groups are - /// supported. #[inline] pub fn contains_lookaround_expr(&self) -> bool { self.0.contains_lookaround_expr From c9814a35f0398b95bc51f45b4a46a2e9c5cd7429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Tue, 18 Mar 2025 15:48:44 +0100 Subject: [PATCH 35/68] Fix look-around indexing --- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 7 ++++--- regex-automata/src/nfa/thompson/pikevm.rs | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 334f86a12..5d033dd1d 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1045,7 +1045,7 @@ impl Compiler { &self, lookaround: &LookAround, ) -> Result { - let sub = self.c(lookaround.sub()); + let sub = self.c(lookaround.sub())?; let pos = match lookaround { LookAround::NegativeLookBehind(_) => false, LookAround::PositiveLookBehind(_) => true, diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 0833fdf33..a7bd02d96 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1102,7 +1102,7 @@ impl NFA { /// Returns how many look-around sub-expressions this nfa contains #[inline] - pub fn lookaround_count(&self) -> SmallIndex { + pub fn lookaround_count(&self) -> usize { self.0.lookaround_count } @@ -1269,7 +1269,7 @@ pub(super) struct Inner { /// How many look-around expression this NFA contains. 
/// This is needed to initialize the table for storing the result of /// look-around evaluation - lookaround_count: SmallIndex, + lookaround_count: usize, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1387,7 +1387,8 @@ impl Inner { } State::CheckLookAround { lookaround_idx: look_idx, .. } | State::WriteLookAround { lookaround_idx: look_idx } => { - self.lookaround_count = self.lookaround_count.max(look_idx); + self.lookaround_count = + self.lookaround_count.max(look_idx.as_usize() + 1); } State::Union { .. } | State::BinaryUnion { .. } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index ceb6602bc..c75a3fff5 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1216,7 +1216,7 @@ impl PikeVM { } impl PikeVM { - fn lookaround_count(&self) -> SmallIndex { + fn lookaround_count(&self) -> usize { self.nfa.lookaround_count() } @@ -1992,7 +1992,7 @@ impl Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), - lookaround: vec![None; re.lookaround_count().as_usize()], + lookaround: vec![None; re.lookaround_count()], } } From d98cfbf8fc554e21ed250b34eb33d7d7747e137a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Tue, 18 Mar 2025 18:55:52 +0100 Subject: [PATCH 36/68] Add error messages and fix pre-filter We need to disable pre-filters when a regex contains lookarounds. This is because the relevant information for a lookbehind can be before the start of the match. 
--- regex-automata/src/dfa/dense.rs | 6 ++++++ regex-automata/src/dfa/determinize.rs | 4 ++++ regex-automata/src/hybrid/dfa.rs | 3 +++ regex-automata/src/hybrid/error.rs | 6 ++++++ regex-automata/src/meta/wrappers.rs | 2 ++ regex-syntax/src/hir/literal.rs | 5 ++--- 6 files changed, 23 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 056213b28..ce0b5e07a 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -5096,6 +5096,12 @@ impl BuildError { BuildError { kind: BuildErrorKind::Unsupported(msg) } } + pub(crate) fn unsupported_lookaround() -> BuildError { + let msg = "cannot build DFAs for regexes with look-around\ + sub-expressions; use a different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } + pub(crate) fn too_many_states() -> BuildError { BuildError { kind: BuildErrorKind::TooManyStates } } diff --git a/regex-automata/src/dfa/determinize.rs b/regex-automata/src/dfa/determinize.rs index 7a49c2453..1256087f6 100644 --- a/regex-automata/src/dfa/determinize.rs +++ b/regex-automata/src/dfa/determinize.rs @@ -219,6 +219,10 @@ impl<'a> Runner<'a> { return Err(BuildError::unsupported_dfa_word_boundary_unicode()); } + if self.nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } + // A sequence of "representative" bytes drawn from each equivalence // class. These representative bytes are fed to the NFA to compute // state transitions. 
This allows us to avoid re-computing state diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 92956911f..b933a307e 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -4056,6 +4056,9 @@ impl Builder { &self, nfa: thompson::NFA, ) -> Result { + if nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } let quitset = self.config.quit_set_from_nfa(&nfa)?; let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); // Check that we can fit at least a few states into our cache, diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index d134e7ec9..ae3ae6c53 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -61,6 +61,12 @@ impl BuildError { different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } + + pub(crate) fn unsupported_lookaround() -> BuildError { + let msg = "cannot build DFAs for regexes with look-around\ + sub-expressions; use a different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } } #[cfg(feature = "std")] diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 95d0e07b1..f2f0ec5b7 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -204,6 +204,8 @@ impl BoundedBacktrackerEngine { { if !info.config().get_backtrack() || info.config().get_match_kind() != MatchKind::LeftmostFirst + // TODO: remove once look-around support is added. 
+ || nfa.lookaround_count() > 0 { return Ok(None); } diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 9b21abf94..2517e4601 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,9 +172,8 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) | LookAround(_) => { - Seq::singleton(self::Literal::exact(vec![])) - } + Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + LookAround(_) => Seq::infinite(), Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); From 5284761cdcc872324b787a675a109c6326665dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Tue, 18 Mar 2025 18:58:11 +0100 Subject: [PATCH 37/68] Add unit tests for look-behind assertions --- regex-automata/tests/dfa/onepass/suite.rs | 5 +- regex-automata/tests/dfa/suite.rs | 12 +++- regex-automata/tests/hybrid/suite.rs | 11 +++- regex-automata/tests/lib.rs | 1 + .../tests/nfa/thompson/backtrack/suite.rs | 8 +++ testdata/lookaround.toml | 59 +++++++++++++++++++ 6 files changed, 93 insertions(+), 3 deletions(-) create mode 100644 testdata/lookaround.toml diff --git a/regex-automata/tests/dfa/onepass/suite.rs b/regex-automata/tests/dfa/onepass/suite.rs index 20bd6965c..4c7682f7f 100644 --- a/regex-automata/tests/dfa/onepass/suite.rs +++ b/regex-automata/tests/dfa/onepass/suite.rs @@ -79,7 +79,10 @@ fn compiler( // Since our error types are all generally opaque, we just // look for an error string. Not great, but not the end of the // world. 
- if test.compiles() && msg.contains("not one-pass") { + if test.compiles() + && (msg.contains("not one-pass") + || msg.contains("look-around")) + { return Ok(CompiledRegex::skip()); } return Err(err.into()); diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index 8ed6dd007..febded611 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -292,7 +292,17 @@ fn compiler( if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - create_matcher(&builder, pre, builder.build_many(®exes)?) + let re = match builder.build_many(regexes) { + Ok(re) => re, + Err(err) + if test.compiles() + && format!("{err}").contains("look-around") => + { + return Ok(CompiledRegex::skip()); + } + Err(err) => return Err(err.into()), + }; + create_matcher(&builder, pre, re) } } diff --git a/regex-automata/tests/hybrid/suite.rs b/regex-automata/tests/hybrid/suite.rs index 4aaca6698..ee81aca8d 100644 --- a/regex-automata/tests/hybrid/suite.rs +++ b/regex-automata/tests/hybrid/suite.rs @@ -183,7 +183,16 @@ fn compiler( if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = builder.build_many(®exes)?; + let re = match builder.build_many(regexes) { + Ok(re) => re, + Err(err) + if test.compiles() + && format!("{err}").contains("look-around") => + { + return Ok(CompiledRegex::skip()); + } + Err(err) => return Err(err.into()), + }; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) diff --git a/regex-automata/tests/lib.rs b/regex-automata/tests/lib.rs index 67c979aa8..1ba08fe87 100644 --- a/regex-automata/tests/lib.rs +++ b/regex-automata/tests/lib.rs @@ -65,6 +65,7 @@ fn suite() -> anyhow::Result { load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); + load!("lookaround"); Ok(tests) } diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs 
b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index bce0eef40..674ce5039 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -74,6 +74,10 @@ fn min_visited_capacity() -> Result<()> { .configure(config_thompson(test)) .syntax(config_syntax(test)) .build_many(®exes)?; + // TODO: remove once look-around is supported. + if nfa.lookaround_count() > 0 { + return Ok(CompiledRegex::skip()); + } let mut builder = BoundedBacktracker::builder(); if !configure_backtrack_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); @@ -105,6 +109,10 @@ fn compiler( return Ok(CompiledRegex::skip()); } let re = builder.build_many(®exes)?; + // TODO: remove once look-around is supported. + if re.get_nfa().lookaround_count() > 0 { + return Ok(CompiledRegex::skip()); + } let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml new file mode 100644 index 000000000..ecbd76d48 --- /dev/null +++ b/testdata/lookaround.toml @@ -0,0 +1,59 @@ +[[test]] +name = "basic lookbehind positive" +regex = "(?<=b)a" +haystack = "ba" +matches = [[1, 2]] + +[[test]] +name = "basic lookbehind negative" +regex = "(? Date: Tue, 25 Mar 2025 11:17:49 +0100 Subject: [PATCH 38/68] Bump version numbers --- Cargo.toml | 6 +++--- regex-automata/Cargo.toml | 4 ++-- regex-cli/Cargo.toml | 4 ++-- regex-syntax/Cargo.toml | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index be69acf12..8904fea69 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.11.2" #:version +version = "1.12.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" @@ -176,14 +176,14 @@ default-features = false # For the actual regex engines. 
[dependencies.regex-automata] path = "regex-automata" -version = "0.4.8" +version = "0.5.0" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.8.5" +version = "0.9.0" default-features = false [dev-dependencies] diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 6d3eb7a90..08631ae0f 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.10" #:version +version = "0.5.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" @@ -87,7 +87,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.6.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.8.5", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.9.0", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index 6c4ad7d8c..c436c0a07 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -30,8 +30,8 @@ lexopt = "0.3.0" log = { version = "0.4.17", features = ["std"] } memmap2 = "0.9.4" regex = { version = "1.9.0", path = ".." 
} -regex-automata = { version = "0.4.8", path = "../regex-automata", features = ["logging"] } +regex-automata = { version = "0.5.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } -regex-syntax = { version = "0.8.5", path = "../regex-syntax" } +regex-syntax = { version = "0.9.0", path = "../regex-syntax" } tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } textwrap = { version = "0.16.0", default-features = false } diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index eb5ab9389..d3e0bca68 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.6" #:version +version = "0.9.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" homepage = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From cff77ec0adad45f77f8d9ab111bb7b7f6c165186 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 2 Apr 2025 22:17:07 +0200 Subject: [PATCH 39/68] Adjust some docs --- regex-automata/src/nfa/thompson/backtrack.rs | 2 +- regex-automata/src/nfa/thompson/builder.rs | 13 +++++----- regex-automata/src/nfa/thompson/compiler.rs | 4 +-- regex-automata/src/nfa/thompson/nfa.rs | 26 ++++++++++---------- regex-automata/src/nfa/thompson/pikevm.rs | 8 +++--- regex-syntax/src/hir/mod.rs | 4 +-- 6 files changed, 28 insertions(+), 29 deletions(-) diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index be0cbcfbd..98a5b5c1e 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -1453,7 +1453,7 @@ impl BoundedBacktracker { /// Execute a "step" in the backtracing algorithm. /// /// A "step" is somewhat of a misnomer, because this routine keeps going - /// until it either runs out of things to try or fins a match. 
In the + /// until it either runs out of things to try or finds a match. In the /// former case, it may have pushed some things on to the backtracking /// stack, in which case, those will be tried next as part of the /// 'backtrack' routine above. diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 748d1d01c..c769fda23 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -92,7 +92,7 @@ enum State { next: StateID, }, /// An empty state that behaves analogously to a `Match` state but for - /// the look-around sub-expression with the given index. + /// the look-around sub-expression with the given look-around index. WriteLookAround { lookaround_index: SmallIndex }, /// A conditional epsilon transition that will only be taken if the /// look-around sub-expression with the given index evaluates to `positive` @@ -484,9 +484,8 @@ impl Builder { remap[sid] = nfa.add(nfa::State::Look { look, next }); } State::WriteLookAround { lookaround_index } => { - remap[sid] = nfa.add(nfa::State::WriteLookAround { - lookaround_idx: lookaround_index, - }); + remap[sid] = nfa + .add(nfa::State::WriteLookAround { lookaround_index }); } State::CheckLookAround { lookaround_index, @@ -494,7 +493,7 @@ impl Builder { next, } => { remap[sid] = nfa.add(nfa::State::CheckLookAround { - lookaround_idx: lookaround_index, + lookaround_index, positive, next, }); @@ -722,7 +721,7 @@ impl Builder { self.add(State::Empty { next: StateID::ZERO }) } - /// Add a state which will record that the lookaround with the given index + /// Add a state which will record that the look-around with the given index /// is satisfied at the current position. 
pub fn add_write_lookaround( &mut self, @@ -731,7 +730,7 @@ impl Builder { self.add(State::WriteLookAround { lookaround_index: index }) } - /// Add a state which will check whether the lookaround with the given + /// Add a state which will check whether the look-around with the given /// index is satisfied at the current position. pub fn add_check_lookaround( &mut self, diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 5d033dd1d..69f6e72ef 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -2036,14 +2036,14 @@ mod tests { fn s_write_lookaround(id: usize) -> State { State::WriteLookAround { - lookaround_idx: SmallIndex::new(id) + lookaround_index: SmallIndex::new(id) .expect("look-around index too large"), } } fn s_check_lookaround(id: usize, positive: bool, next: usize) -> State { State::CheckLookAround { - lookaround_idx: SmallIndex::new(id) + lookaround_index: SmallIndex::new(id) .expect("look-around index too large"), positive, next: sid(next), diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index a7bd02d96..51d6b4138 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1100,7 +1100,7 @@ impl NFA { self.0.look_set_prefix_any } - /// Returns how many look-around sub-expressions this nfa contains + /// Returns how many look-around sub-expressions this nfa contains. #[inline] pub fn lookaround_count(&self) -> usize { self.0.lookaround_count @@ -1268,7 +1268,7 @@ pub(super) struct Inner { */ /// How many look-around expression this NFA contains. /// This is needed to initialize the table for storing the result of - /// look-around evaluation + /// look-around evaluation. lookaround_count: usize, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). 
Since each state @@ -1385,8 +1385,8 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } - State::CheckLookAround { lookaround_idx: look_idx, .. } - | State::WriteLookAround { lookaround_idx: look_idx } => { + State::CheckLookAround { lookaround_index: look_idx, .. } + | State::WriteLookAround { lookaround_index: look_idx } => { self.lookaround_count = self.lookaround_count.max(look_idx.as_usize() + 1); } @@ -1566,19 +1566,19 @@ pub enum State { }, /// This is like a match state but for a look-around expression. /// Executing this state will write the current haystack offset into the - /// look-around oracle at index `lookaround_idx`. + /// look-around oracle at index `lookaround_index`. WriteLookAround { /// The index of the look-around expression that matches. - lookaround_idx: SmallIndex, + lookaround_index: SmallIndex, }, - /// This indicates that we need to check whether lookaround expression with - /// index `lookaround_idx` holds at the current position in the haystack - /// If `positive` is false, then the lookaround expression is negative and + /// This indicates that we need to check whether look-around expression with + /// index `lookaround_index` holds at the current position in the haystack. + /// If `positive` is false, then the look-around expression is negative and /// hence must NOT hold. CheckLookAround { /// The index of the look-around expression that must be satisfied. - lookaround_idx: SmallIndex, - /// Whether this is a positive lookaround expression. + lookaround_index: SmallIndex, + /// Whether this is a positive look-around expression. positive: bool, /// The next state to transition if the look-around assertion is /// satisfied. 
@@ -1795,11 +1795,11 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } - State::WriteLookAround { lookaround_idx: look_idx } => { + State::WriteLookAround { lookaround_index: look_idx } => { write!(f, "write-look-around({})", look_idx.as_u32()) } State::CheckLookAround { - lookaround_idx: look_idx, + lookaround_index: look_idx, positive, next, } => { diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index c75a3fff5..f6c5ce5cf 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1772,13 +1772,13 @@ impl PikeVM { } sid = next; } - State::WriteLookAround { lookaround_idx: look_idx } => { + State::WriteLookAround { lookaround_index: look_idx } => { // This is ok since `at` is always less than `usize::MAX`. lookarounds[look_idx] = NonMaxUsize::new(at); return; } State::CheckLookAround { - lookaround_idx: look_idx, + lookaround_index: look_idx, positive, next, } => { @@ -1973,8 +1973,8 @@ pub struct Cache { /// next byte in the haystack. next: ActiveStates, /// This answers the question: "What is the maximum position in the - /// haystack at which lookaround assertion x holds and which is <= to the - /// current position" + /// haystack at which look-around indexed x holds and which is <= to the + /// current position". lookaround: Vec>, } diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index debc5dcb4..dfdf8ed7e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1819,7 +1819,7 @@ impl LookAround { } } - /// Returns a mutable reference to the inner expression + /// Returns a mutable reference to the inner expression. 
pub fn sub_mut(&mut self) -> &mut Hir { match self { Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { @@ -2556,7 +2556,7 @@ impl Properties { look_set_prefix_any: LookSet::singleton(look), look_set_suffix_any: LookSet::singleton(look), // Note, this field represents _general_ lookarounds (ones using - // LookAround) and not simple ones (using Look). + // LookAround) and not assertions (using Look). contains_lookaround_expr: false, // This requires a little explanation. Basically, we don't consider // matching an empty string to be equivalent to matching invalid From 4a87b58367c19ca03b149c19228f753719c898d3 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 3 Apr 2025 20:53:53 +0200 Subject: [PATCH 40/68] Add lookbehind with capture group test --- testdata/lookaround.toml | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index ecbd76d48..9b34be47b 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -26,34 +26,43 @@ matches = [] name = "lookbehind in quantifier non-repeating" regex = "(?:(?<=c)a)+" haystack = "badacacaea" -matches = [[5,6], [7,8]] +matches = [[5, 6], [7, 8]] [[test]] name = "lookbehind in quantifier repeating" regex = "(?:(?<=a)a)+" haystack = "babaabaaabaaaac" -matches = [[4,5], [7,9], [11,14]] +matches = [[4, 5], [7, 9], [11, 14]] [[test]] name = "lookbehind with quantifier" regex = "(?<=cb+)a" haystack = "acabacbacbbaea" -matches = [[7,8], [11,12]] +matches = [[7, 8], [11, 12]] [[test]] name = "nested lookbehind" regex = "(?<=c[def]+(? 
Date: Thu, 3 Apr 2025 20:59:27 +0200 Subject: [PATCH 41/68] Change how test suite filters tests --- regex-automata/tests/dfa/suite.rs | 18 +++++++----------- regex-automata/tests/hybrid/suite.rs | 17 +++++++---------- .../tests/nfa/thompson/backtrack/suite.rs | 4 ++-- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index febded611..aa43cc7e6 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -289,20 +289,16 @@ fn compiler( } } } + // Or look-around expressions. + for hir in hirs.iter() { + if hir.properties().contains_lookaround_expr() { + return Ok(CompiledRegex::skip()); + } + } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = match builder.build_many(regexes) { - Ok(re) => re, - Err(err) - if test.compiles() - && format!("{err}").contains("look-around") => - { - return Ok(CompiledRegex::skip()); - } - Err(err) => return Err(err.into()), - }; - create_matcher(&builder, pre, re) + create_matcher(&builder, pre, builder.build_many(regexes)?) } } diff --git a/regex-automata/tests/hybrid/suite.rs b/regex-automata/tests/hybrid/suite.rs index ee81aca8d..65769f001 100644 --- a/regex-automata/tests/hybrid/suite.rs +++ b/regex-automata/tests/hybrid/suite.rs @@ -180,19 +180,16 @@ fn compiler( } } } + // Or look-around expressions. 
+ for hir in hirs.iter() { + if hir.properties().contains_lookaround_expr() { + return Ok(CompiledRegex::skip()); + } + } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = match builder.build_many(regexes) { - Ok(re) => re, - Err(err) - if test.compiles() - && format!("{err}").contains("look-around") => - { - return Ok(CompiledRegex::skip()); - } - Err(err) => return Err(err.into()), - }; + let re = builder.build_many(®exes)?; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index 674ce5039..2dd9d1f1b 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -74,7 +74,7 @@ fn min_visited_capacity() -> Result<()> { .configure(config_thompson(test)) .syntax(config_syntax(test)) .build_many(®exes)?; - // TODO: remove once look-around is supported. + // The backtracker doesn't support lookarounds, so skip if there are any. if nfa.lookaround_count() > 0 { return Ok(CompiledRegex::skip()); } @@ -109,7 +109,7 @@ fn compiler( return Ok(CompiledRegex::skip()); } let re = builder.build_many(®exes)?; - // TODO: remove once look-around is supported. + // The backtracker doesn't support lookarounds, so skip if there are any. 
if re.get_nfa().lookaround_count() > 0 { return Ok(CompiledRegex::skip()); } From 6192349ad610bff0acbffe50de00a74e531d5865 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Fri, 4 Apr 2025 21:43:09 +0200 Subject: [PATCH 42/68] Change engine fallbacks --- regex-automata/src/dfa/dense.rs | 2 +- regex-automata/src/dfa/onepass.rs | 13 ++- regex-automata/src/hybrid/error.rs | 2 +- regex-automata/src/meta/strategy.rs | 85 ++++++++++--------- regex-automata/src/nfa/thompson/backtrack.rs | 7 +- regex-automata/src/nfa/thompson/compiler.rs | 7 ++ regex-automata/src/nfa/thompson/error.rs | 16 ++++ regex-automata/src/util/determinize/mod.rs | 6 +- .../tests/nfa/thompson/backtrack/suite.rs | 12 ++- 9 files changed, 101 insertions(+), 49 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index ce0b5e07a..aabc6d14d 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -5097,7 +5097,7 @@ impl BuildError { } pub(crate) fn unsupported_lookaround() -> BuildError { - let msg = "cannot build DFAs for regexes with look-around\ + let msg = "cannot build DFAs for regexes with look-around \ sub-expressions; use a different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 4dbb9b50e..3ca3fde09 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -602,6 +602,9 @@ impl<'a> InternalBuilder<'a> { )); } assert_eq!(DEAD, self.add_empty_state()?); + if self.nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } // This is where the explicit slots start. We care about this because // we only need to track explicit slots. The implicit slots---two for @@ -640,7 +643,7 @@ impl<'a> InternalBuilder<'a> { match *self.nfa.state(id) { thompson::State::WriteLookAround { .. } | thompson::State::CheckLookAround { .. 
} => { - todo!("check how to handle") + return Err(BuildError::unsupported_lookaround()); } thompson::State::ByteRange { ref trans } => { self.compile_transition(dfa_id, trans, epsilons)?; @@ -3000,6 +3003,7 @@ enum BuildErrorKind { UnsupportedLook { look: Look }, ExceededSizeLimit { limit: usize }, NotOnePass { msg: &'static str }, + UnsupportedLookAround, } impl BuildError { @@ -3030,6 +3034,10 @@ impl BuildError { fn not_one_pass(msg: &'static str) -> BuildError { BuildError { kind: BuildErrorKind::NotOnePass { msg } } } + + fn unsupported_lookaround() -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedLookAround } + } } #[cfg(feature = "std")] @@ -3078,6 +3086,9 @@ impl core::fmt::Display for BuildError { pattern is not one-pass: {}", msg, ), + UnsupportedLookAround => { + write!(f, "one-pass DFA does not support look-arounds") + } } } } diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index ae3ae6c53..062b9ac62 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -63,7 +63,7 @@ impl BuildError { } pub(crate) fn unsupported_lookaround() -> BuildError { - let msg = "cannot build DFAs for regexes with look-around\ + let msg = "cannot build DFAs for regexes with look-around \ sub-expressions; use a different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 04f2ba3c3..0ac830b9d 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -490,49 +490,52 @@ impl Core { // we know we aren't going to use the lazy DFA. So we do a config check // up front, which is in practice the only way we won't try to use the // DFA. 
- let (nfarev, hybrid, dfa) = - if !info.config().get_hybrid() && !info.config().get_dfa() { - (None, wrappers::Hybrid::none(), wrappers::DFA::none()) + let (nfarev, hybrid, dfa) = if !info.config().get_hybrid() + && !info.config().get_dfa() + // With look-arounds, the lazy DFA and dense DFA would fail to build + || nfa.lookaround_count() > 0 + { + (None, wrappers::Hybrid::none(), wrappers::DFA::none()) + } else { + // FIXME: Technically, we don't quite yet KNOW that we need + // a reverse NFA. It's possible for the DFAs below to both + // fail to build just based on the forward NFA. In which case, + // building the reverse NFA was totally wasted work. But... + // fixing this requires breaking DFA construction apart into + // two pieces: one for the forward part and another for the + // reverse part. Quite annoying. Making it worse, when building + // both DFAs fails, it's quite likely that the NFA is large and + // that it will take quite some time to build the reverse NFA + // too. So... it's really probably worth it to do this! + let nfarev = thompson::Compiler::new() + // Currently, reverse NFAs don't support capturing groups, + // so we MUST disable them. But even if we didn't have to, + // we would, because nothing in this crate does anything + // useful with capturing groups in reverse. And of course, + // the lazy DFA ignores capturing groups in all cases. + .configure( + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), + ) + .build_many_from_hir(hirs) + .map_err(BuildError::nfa)?; + let dfa = if !info.config().get_dfa() { + wrappers::DFA::none() } else { - // FIXME: Technically, we don't quite yet KNOW that we need - // a reverse NFA. It's possible for the DFAs below to both - // fail to build just based on the forward NFA. In which case, - // building the reverse NFA was totally wasted work. But... 
- // fixing this requires breaking DFA construction apart into - // two pieces: one for the forward part and another for the - // reverse part. Quite annoying. Making it worse, when building - // both DFAs fails, it's quite likely that the NFA is large and - // that it will take quite some time to build the reverse NFA - // too. So... it's really probably worth it to do this! - let nfarev = thompson::Compiler::new() - // Currently, reverse NFAs don't support capturing groups, - // so we MUST disable them. But even if we didn't have to, - // we would, because nothing in this crate does anything - // useful with capturing groups in reverse. And of course, - // the lazy DFA ignores capturing groups in all cases. - .configure( - thompson_config - .clone() - .which_captures(WhichCaptures::None) - .reverse(true), - ) - .build_many_from_hir(hirs) - .map_err(BuildError::nfa)?; - let dfa = if !info.config().get_dfa() { - wrappers::DFA::none() - } else { - wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) - }; - let hybrid = if !info.config().get_hybrid() { - wrappers::Hybrid::none() - } else if dfa.is_some() { - debug!("skipping lazy DFA because we have a full DFA"); - wrappers::Hybrid::none() - } else { - wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) - }; - (Some(nfarev), hybrid, dfa) + wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) }; + let hybrid = if !info.config().get_hybrid() { + wrappers::Hybrid::none() + } else if dfa.is_some() { + debug!("skipping lazy DFA because we have a full DFA"); + wrappers::Hybrid::none() + } else { + wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) + }; + (Some(nfarev), hybrid, dfa) + }; Ok(Core { info, pre, diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index 98a5b5c1e..eb36d1829 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -301,6 +301,9 @@ impl Builder { nfa: NFA, ) -> Result { 
nfa.look_set_any().available().map_err(BuildError::word)?; + if nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookarounds()); + } Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } @@ -1521,7 +1524,9 @@ impl BoundedBacktracker { } State::WriteLookAround { .. } | State::CheckLookAround { .. } => { - todo!("check how to handle") + unimplemented!( + "backtracking engine does not support look-arounds" + ); } State::Union { ref alternates } => { sid = match alternates.get(0) { diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 69f6e72ef..867b316dd 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -954,6 +954,13 @@ impl Compiler { { return Err(BuildError::unsupported_captures()); } + if self.config.get_reverse() + && exprs.iter().any(|e| { + (e.borrow() as &Hir).properties().contains_lookaround_expr() + }) + { + return Err(BuildError::unsupported_lookarounds()); + } self.builder.borrow_mut().clear(); self.builder.borrow_mut().set_utf8(self.config.get_utf8()); diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs index fa00eb7dd..ebb9a7ca5 100644 --- a/regex-automata/src/nfa/thompson/error.rs +++ b/regex-automata/src/nfa/thompson/error.rs @@ -81,6 +81,13 @@ enum BuildErrorKind { /// should support it at some point. #[cfg(feature = "syntax")] UnsupportedCaptures, + /// An error that occurs when one tries to build a reverse NFA with + /// look-around sub-expressions. Currently, this isn't supported, but we + /// probably should support it at some point. + /// + /// This is also emitted by the backtracking engine which does not + /// support look-around sub-expressions. 
+ UnsupportedLookArounds, } impl BuildError { @@ -142,6 +149,10 @@ impl BuildError { pub(crate) fn unsupported_captures() -> BuildError { BuildError { kind: BuildErrorKind::UnsupportedCaptures } } + + pub(crate) fn unsupported_lookarounds() -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedLookArounds } + } } #[cfg(feature = "std")] @@ -201,6 +212,11 @@ impl core::fmt::Display for BuildError { "currently captures must be disabled when compiling \ a reverse NFA", ), + BuildErrorKind::UnsupportedLookArounds => write!( + f, + "currently look-around sub-expressions cannot be in the pattern \ + when compiling a reverse NFA or using the backtracking engine", + ), } } } diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 729d293ed..616326ee8 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -253,7 +253,7 @@ pub(crate) fn next( | thompson::State::Capture { .. } => {} thompson::State::CheckLookAround { .. } | thompson::State::WriteLookAround { .. } => { - todo!("check how to handle") + unimplemented!("look-around support in DFA") } thompson::State::Match { pattern_id } => { // Notice here that we are calling the NEW state a match @@ -405,7 +405,7 @@ pub(crate) fn epsilon_closure( | thompson::State::Match { .. } => break, thompson::State::WriteLookAround { .. } | thompson::State::CheckLookAround { .. } => { - todo!("check how to handle") + unimplemented!("look-around support in DFA") } thompson::State::Look { look, next } => { if !look_have.contains(look) { @@ -475,7 +475,7 @@ pub(crate) fn add_nfa_states( } thompson::State::CheckLookAround { .. } | thompson::State::WriteLookAround { .. } => { - todo!("check how to handle") + unimplemented!("look-around support in DFA") } thompson::State::Union { .. } | thompson::State::BinaryUnion { .. 
} => { diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index 2dd9d1f1b..7be175f04 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -108,7 +108,17 @@ fn compiler( if !configure_backtrack_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = builder.build_many(&regexes)?; + let re = match builder.build_many(&regexes) { + Ok(re) => re, + // Due to errors being opaque, we need to check the error message to skip tests with look-arounds + Err(err) => { + if test.compiles() && err.to_string().contains("look-around") { + return Ok(CompiledRegex::skip()); + } + + return Err(err.into()); + } + }; // The backtracker doesn't support lookarounds, so skip if there are any. if re.get_nfa().lookaround_count() > 0 { return Ok(CompiledRegex::skip()); From 07668e6f72a05f917aa2d1c8000f84a4c4e667ae Mon Sep 17 00:00:00 2001 From: shilangyu Date: Fri, 4 Apr 2025 21:46:39 +0200 Subject: [PATCH 43/68] Rename lookaround_index --- regex-automata/src/nfa/thompson/nfa.rs | 18 +++++++----------- regex-automata/src/nfa/thompson/pikevm.rs | 8 ++++---- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 51d6b4138..950a86ce7 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1385,10 +1385,10 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } - State::CheckLookAround { lookaround_index: look_idx, .. } - | State::WriteLookAround { lookaround_index: look_idx } => { + State::CheckLookAround { lookaround_index, .. } + | State::WriteLookAround { lookaround_index } => { self.lookaround_count = - self.lookaround_count.max(look_idx.as_usize() + 1); + self.lookaround_count.max(lookaround_index.as_usize() + 1); } State::Union { .. } | State::BinaryUnion { .. 
} @@ -1795,18 +1795,14 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } - State::WriteLookAround { lookaround_index: look_idx } => { - write!(f, "write-look-around({})", look_idx.as_u32()) + State::WriteLookAround { lookaround_index } => { + write!(f, "write-look-around({})", lookaround_index.as_u32()) } - State::CheckLookAround { - lookaround_index: look_idx, - positive, - next, - } => { + State::CheckLookAround { lookaround_index, positive, next } => { write!( f, "check-look-around({} is {}) => {}", - look_idx.as_u32(), + lookaround_index.as_u32(), if positive { "matched" } else { "not matched" }, next.as_usize() ) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index f6c5ce5cf..d7bb07397 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1772,17 +1772,17 @@ impl PikeVM { } sid = next; } - State::WriteLookAround { lookaround_index: look_idx } => { + State::WriteLookAround { lookaround_index } => { // This is ok since `at` is always less than `usize::MAX`. 
- lookarounds[look_idx] = NonMaxUsize::new(at); + lookarounds[lookaround_index] = NonMaxUsize::new(at); return; } State::CheckLookAround { - lookaround_index: look_idx, + lookaround_index, positive, next, } => { - let state = match lookarounds[look_idx] { + let state = match lookarounds[lookaround_index] { None => usize::MAX, Some(pos) => pos.get(), }; From 93759000a3f5564d45980240bf71922053a50208 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Fri, 4 Apr 2025 21:56:46 +0200 Subject: [PATCH 44/68] Fix literals tests --- regex-syntax/src/hir/literal.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 2517e4601..7859327df 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2456,16 +2456,16 @@ mod tests { #[test] fn lookaround() { - assert_eq!(exact(["ab"]), e(r"a(?<=qwa)b")); - assert_eq!(exact(["ab"]), e(r"a(? Date: Sat, 5 Apr 2025 08:00:55 +0200 Subject: [PATCH 45/68] Fix anchors in lookarounds --- regex-automata/src/nfa/thompson/compiler.rs | 23 +++++++++++++++++++++ regex-syntax/src/hir/mod.rs | 2 ++ testdata/lookaround.toml | 6 ++++++ 3 files changed, 31 insertions(+) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 867b316dd..9d72ad82c 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -2158,6 +2158,29 @@ mod tests { ); } + #[test] + fn compile_yes_unanchored_prefix_with_start_anchor_in_lookaround() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"(?<=^)a") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_bin_union(3, 6), + s_bin_union(5, 4), + s_range(0, 255, 3), + s_look(Look::Start, 7), + s_check_lookaround(0, true, 8), + s_write_lookaround(0), + s_byte(b'a', 9), + s_match(0) + ] + ); + } + #[test] fn 
compile_empty() { assert_eq!(build("").states(), &[s_match(0),]); diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index dfdf8ed7e..67b54c8e1 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2589,6 +2589,8 @@ impl Properties { literal: false, alternation_literal: false, contains_lookaround_expr: true, + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), ..*sub_p.0.clone() }; Properties(Box::new(inner)) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 9b34be47b..8818a8f1a 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -58,6 +58,12 @@ regex = "(?<=c+)a|(?<=d+)a" haystack = "aabacadaccaddaea" matches = [[5, 6], [7, 8], [10, 11], [13, 14]] +[[test]] +name = "lookbehind with anchor" +regex = "(?<=^c)a" +haystack = "cacacaasdacabasdqwe" +matches = [[1, 2]] + [[test]] name = "lookbehind next to capture group" regex = "(?<=c)(a|b)(b|a)" From 9c7c558a754282cd0b7c44481a0685b1e06ad942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 10 Apr 2025 16:00:08 +0200 Subject: [PATCH 46/68] Fix broken doc link --- regex-syntax/src/hir/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 67b54c8e1..176599bcf 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2138,7 +2138,7 @@ impl Properties { /// Returns whether there are any look-around expressions in this HIR value. /// /// Only returns true for [`HirKind::LookAround`] and not for - /// [`HirKind::Look`], which can be queried by [`look_set`] instead. + /// [`HirKind::Look`], which can be queried by [`look_set`](Properties::look_set) instead. 
#[inline] pub fn contains_lookaround_expr(&self) -> bool { self.0.contains_lookaround_expr From 0917a1ddec018a3dd7801dbdcda1566f3476c342 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 17 Apr 2025 10:47:46 +0200 Subject: [PATCH 47/68] Remove unneeded if condition --- regex-automata/tests/nfa/thompson/backtrack/suite.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index 7be175f04..b0aa0fc6c 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -119,10 +119,6 @@ fn compiler( return Err(err.into()); } }; - // The backtracker doesn't support lookarounds, so skip if there are any. - if re.get_nfa().lookaround_count() > 0 { - return Ok(CompiledRegex::skip()); - } let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) From 88df919598ede52d027b497c4c857c08dd8b38dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 11:43:13 +0200 Subject: [PATCH 48/68] Explain use of empty look-set --- regex-syntax/src/hir/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 176599bcf..a847e2486 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -2589,6 +2589,8 @@ impl Properties { literal: false, alternation_literal: false, contains_lookaround_expr: true, + // We do not want look-around subexpressions to influence matching + // of the main expression when they contain anchors, so we clear the set. 
look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), ..*sub_p.0.clone() From 1e136453f0f902fc8f9559b44c79770dc14dd9ad Mon Sep 17 00:00:00 2001 From: Robin Date: Wed, 16 Apr 2025 11:42:58 +0200 Subject: [PATCH 49/68] Add regression tests --- testdata/lookaround.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 8818a8f1a..14a303d7c 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -46,6 +46,18 @@ regex = "(?<=c[def]+(? Date: Wed, 16 Apr 2025 16:52:11 +0200 Subject: [PATCH 50/68] Change compilation to disconnected components --- regex-automata/src/nfa/thompson/builder.rs | 9 +++ regex-automata/src/nfa/thompson/compiler.rs | 79 +++++++-------------- regex-automata/src/nfa/thompson/nfa.rs | 9 +++ 3 files changed, 43 insertions(+), 54 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index c769fda23..e4b6ff665 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -340,6 +340,8 @@ pub struct Builder { /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, + /// The starting states for each individual look-behind sub-expression. + start_look_behind: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -449,6 +451,7 @@ impl Builder { remap.resize(self.states.len(), StateID::ZERO); nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); + nfa.set_look_behind_starts(self.start_look_behind.as_slice()); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. 
The only real complexity here is the process of converting @@ -706,6 +709,12 @@ impl Builder { self.start_pattern.len() } + /// Adds the [`start_id`] to the set of starting states that is used when + /// running look-behind expressions. + pub fn start_look_behind(&mut self, start_id: StateID) { + self.start_look_behind.push(start_id); + } + /// Add an "empty" NFA state. /// /// An "empty" NFA state is a state with a single unconditional epsilon diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 9d72ad82c..a57f4fd75 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -711,11 +711,6 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, - /// Top level alternation state which is used to run all look-around - /// assertion checks in lockstep with the main expression. Each look-around - /// expression is compiled to a set of states that is patched into this - /// state, and this state is updated on each new pattern being compiled. - lookaround_alt: RefCell>, /// The next index to use for a look-around expression. lookaround_index: RefCell, } @@ -730,7 +725,6 @@ impl Compiler { utf8_state: RefCell::new(Utf8State::new()), trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), - lookaround_alt: RefCell::new(None), lookaround_index: RefCell::new(SmallIndex::ZERO), } } @@ -993,32 +987,11 @@ impl Compiler { let compiled = self.c_alt_iter(exprs.iter().map(|e| { let _ = self.start_pattern()?; - let has_lookarounds = - (e.borrow() as &Hir).properties().contains_lookaround_expr(); - let mut top_level_alt = if has_lookarounds { - self.add_union()? 
- } else { - StateID::ZERO - }; - if has_lookarounds { - let lookaround_prefix = - self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let lookaround_alt = self.add_union()?; - self.patch(lookaround_prefix.end, lookaround_alt)?; - self.patch(top_level_alt, lookaround_prefix.start)?; - self.lookaround_alt.borrow_mut().replace(lookaround_alt); - } let one = self.c_cap(0, None, e.borrow())?; let match_state_id = self.add_match()?; self.patch(one.end, match_state_id)?; - if has_lookarounds { - self.patch(top_level_alt, one.start)?; - } else { - top_level_alt = one.start; - } - let _ = self.finish_pattern(top_level_alt)?; - self.lookaround_alt.borrow_mut().take(); - Ok(ThompsonRef { start: top_level_alt, end: match_state_id }) + let _ = self.finish_pattern(one.start)?; + Ok(ThompsonRef { start: one.start, end: match_state_id }) }))?; self.patch(unanchored_prefix.end, compiled.start)?; let nfa = self @@ -1052,25 +1025,25 @@ impl Compiler { &self, lookaround: &LookAround, ) -> Result { - let sub = self.c(lookaround.sub())?; - let pos = match lookaround { - LookAround::NegativeLookBehind(_) => false, - LookAround::PositiveLookBehind(_) => true, - }; let idx = *self.lookaround_index.borrow(); *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) .map_err(|e| { BuildError::too_many_lookarounds(e.attempted() as usize) })?; + let pos = match lookaround { + LookAround::NegativeLookBehind(_) => false, + LookAround::PositiveLookBehind(_) => true, + }; let check = self.add_check_lookaround(idx, pos)?; + + let unanchored = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + + let sub = self.c(lookaround.sub())?; let write = self.add_write_lookaround(idx)?; + self.patch(unanchored.end, sub.start)?; self.patch(sub.end, write)?; - self.patch( - self.lookaround_alt - .borrow() - .expect("Cannot compile look-around outside pattern"), - sub.start, - )?; + self.builder.borrow_mut().start_look_behind(unanchored.start); Ok(ThompsonRef { start: check, end: 
check }) } @@ -2169,13 +2142,12 @@ mod tests { &[ s_bin_union(2, 1), s_range(0, 255, 0), - s_bin_union(3, 6), + s_check_lookaround(0, true, 7), s_bin_union(5, 4), s_range(0, 255, 3), - s_look(Look::Start, 7), - s_check_lookaround(0, true, 8), + s_look(Look::Start, 6), s_write_lookaround(0), - s_byte(b'a', 9), + s_byte(b'a', 8), s_match(0) ] ); @@ -2310,11 +2282,10 @@ mod tests { assert_eq!( build(r"(?<=a)").states(), &[ - s_bin_union(1, 4), + s_check_lookaround(0, true, 5), s_bin_union(3, 2), s_range(b'\x00', b'\xFF', 1), - s_byte(b'a', 5), - s_check_lookaround(0, true, 6), + s_byte(b'a', 4), s_write_lookaround(0), s_match(0) ] @@ -2322,16 +2293,16 @@ mod tests { assert_eq!( build(r"(?<=a(?, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1419,6 +1421,13 @@ impl Inner { self.start_pattern = start_pattern.to_vec(); } + pub(super) fn set_look_behind_starts( + &mut self, + look_behind_starts: &[StateID], + ) { + self.start_look_behind = look_behind_starts.to_vec(); + } + /// Sets the UTF-8 mode of this NFA. 
pub(super) fn set_utf8(&mut self, yes: bool) { self.utf8 = yes; From 5cc52ea36bdd6958a21b936fcbb913f64dca75cb Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 14:53:49 +0200 Subject: [PATCH 51/68] Implement look-behind state processing --- regex-automata/src/nfa/thompson/builder.rs | 1 + regex-automata/src/nfa/thompson/nfa.rs | 9 ++ regex-automata/src/nfa/thompson/pikevm.rs | 103 +++++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index e4b6ff665..e2f8bf2ad 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -387,6 +387,7 @@ impl Builder { self.pattern_id = None; self.states.clear(); self.start_pattern.clear(); + self.start_look_behind.clear(); self.captures.clear(); self.memory_states = 0; } diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 782fdfd7c..01ce221c7 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1106,6 +1106,12 @@ impl NFA { self.0.lookaround_count } + /// Returns the starting states for initializing look-behind evaluation + #[inline] + pub fn look_behind_starts(&self) -> &Vec { + &self.0.start_look_behind + } + // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it. 
Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 @@ -1481,6 +1487,9 @@ impl Inner { for id in self.start_pattern.iter_mut() { *id = old_to_new[*id]; } + for id in self.start_look_behind.iter_mut() { + *id = old_to_new[*id]; + } } } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d7bb07397..cb96bad22 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1263,7 +1263,46 @@ impl PikeVM { ref mut curr, ref mut next, ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, } = cache; + + // This initializes the look-behind threads from the start of the input + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. We need to add the start states + // in reverse because nested look-behinds have a higher index but must + // be executed first. + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + + // This brings the look-behind threads into the state they must be for + // starting at input.start() instead of the beginning. This is + // necessary for lookbehinds to be able to match outside of the input + // span. + for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. 
This // is necessary because matches are delayed by one byte, just like @@ -1374,6 +1413,17 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } + // The lookbehind states must be processed first, since their + // result must be available for the processing of the main states. + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); if let Some(pid) = self.nexts(stack, curr, next, lookaround, input, at, slots) { @@ -1387,7 +1437,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); at += 1; } instrument!(|c| c.eprint(&self.nfa)); @@ -1442,7 +1494,34 @@ impl PikeVM { ref mut curr, ref mut next, ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, } = cache; + + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { @@ -1459,6 +1538,15 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); self.nexts_overlapping( stack, curr, next, lookaround, input, at, patset, ); @@ -1470,7 +1558,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); } instrument!(|c| c.eprint(&self.nfa)); } @@ -1976,6 +2066,10 @@ pub struct Cache { /// haystack at which look-around indexed x holds and which is <= to the /// 
current position". lookaround: Vec>, + /// The current active states for look-behind subexpressions + curr_lookaround: ActiveStates, + /// The next set of states to be explored for look-behind subexpressions + next_lookaround: ActiveStates, } impl Cache { @@ -1993,6 +2087,8 @@ impl Cache { curr: ActiveStates::new(re), next: ActiveStates::new(re), lookaround: vec![None; re.lookaround_count()], + curr_lookaround: ActiveStates::new(re), + next_lookaround: ActiveStates::new(re), } } @@ -2036,6 +2132,9 @@ impl Cache { pub fn reset(&mut self, re: &PikeVM) { self.curr.reset(re); self.next.reset(re); + self.curr_lookaround.reset(re); + self.next_lookaround.reset(re); + self.lookaround = vec![None; re.lookaround_count()]; } /// Returns the heap memory usage, in bytes, of this cache. @@ -2063,6 +2162,10 @@ impl Cache { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); + // capture groups are not allowed inside look-arounds, so we + // set the slot-length to zero. 
+ self.curr_lookaround.setup_search(0); + self.next_lookaround.setup_search(0); } } From 1ec885a738e95ef98ff12a9885c10e3d75361a81 Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 14:56:11 +0200 Subject: [PATCH 52/68] Show look-behind starts in nfa debug print --- regex-automata/src/nfa/thompson/nfa.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 01ce221c7..a1cb26efc 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1501,6 +1501,8 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' + } else if self.start_look_behind.contains(&sid) { + '<' } else { ' ' }; From df0ebca19db7506ab02c99177c471c14edf34134 Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 15:44:24 +0200 Subject: [PATCH 53/68] Fix doc-link --- regex-automata/src/nfa/thompson/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index e2f8bf2ad..4f2f9af79 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -710,7 +710,7 @@ impl Builder { self.start_pattern.len() } - /// Adds the [`start_id`] to the set of starting states that is used when + /// Adds the `start_id` to the set of starting states that is used when /// running look-behind expressions. 
pub fn start_look_behind(&mut self, start_id: StateID) { self.start_look_behind.push(start_id); From 7b9e339038b8ab584c06f849bf8bb90a3936f597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Wed, 23 Apr 2025 10:27:11 +0200 Subject: [PATCH 54/68] Fix memory usage calculation --- regex-automata/src/nfa/thompson/pikevm.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index cb96bad22..d05a907a0 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -2146,6 +2146,8 @@ impl Cache { (self.stack.len() * size_of::()) + self.curr.memory_usage() + self.next.memory_usage() + + self.curr_lookaround.memory_usage() + + self.next_lookaround.memory_usage() } /// Clears this cache. This should be called at the start of every search From 985a66209646b2cd3f28bb661546c28485d6e3e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 11:52:23 +0200 Subject: [PATCH 55/68] Fix spelling --- regex-automata/src/nfa/thompson/nfa.rs | 4 ++-- regex-automata/src/nfa/thompson/pikevm.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index a1cb26efc..8a73fa5df 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1106,7 +1106,7 @@ impl NFA { self.0.lookaround_count } - /// Returns the starting states for initializing look-behind evaluation + /// Returns the starting states for initializing look-behind evaluation. #[inline] pub fn look_behind_starts(&self) -> &Vec { &self.0.start_look_behind @@ -1276,7 +1276,7 @@ pub(super) struct Inner { /// This is needed to initialize the table for storing the result of /// look-around evaluation. 
lookaround_count: usize, - /// Contains the start states for each of the look-behind subexpressions + /// Contains the start states for each of the look-behind subexpressions. start_look_behind: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d05a907a0..dc629abdb 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1287,7 +1287,7 @@ impl PikeVM { // This brings the look-behind threads into the state they must be for // starting at input.start() instead of the beginning. This is - // necessary for lookbehinds to be able to match outside of the input + // necessary for look-behinds to be able to match outside of the input // span. for lb_at in 0..input.start() { self.nexts( @@ -1413,7 +1413,7 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } - // The lookbehind states must be processed first, since their + // The look-behind states must be processed first, since their // result must be available for the processing of the main states. self.nexts( stack, @@ -2066,9 +2066,9 @@ pub struct Cache { /// haystack at which look-around indexed x holds and which is <= to the /// current position". lookaround: Vec>, - /// The current active states for look-behind subexpressions + /// The current active states for look-behind subexpressions. curr_lookaround: ActiveStates, - /// The next set of states to be explored for look-behind subexpressions + /// The next set of states to be explored for look-behind subexpressions. 
next_lookaround: ActiveStates, } From be4a9788221d130fa2bc5957e99ee2bcf896cf51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Wed, 23 Apr 2025 10:27:48 +0200 Subject: [PATCH 56/68] Implement matchall performance improvement --- regex-automata/src/nfa/thompson/pikevm.rs | 121 +++++++++++++++------- 1 file changed, 86 insertions(+), 35 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index dc629abdb..adba3386d 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -891,6 +891,7 @@ impl PikeVM { cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h> { + cache.keep_lookaround_state(true); let caps = Captures::matches(self.get_nfa().group_info().clone()); let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, caps, it } @@ -934,6 +935,7 @@ impl PikeVM { cache: &'c mut Cache, input: I, ) -> CapturesMatches<'r, 'c, 'h> { + cache.keep_lookaround_state(true); let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); CapturesMatches { re: self, cache, caps, it } @@ -1265,42 +1267,48 @@ impl PikeVM { ref mut lookaround, ref mut curr_lookaround, ref mut next_lookaround, + ref mut match_lookaround, + ref keep_lookaround_state, } = cache; - // This initializes the look-behind threads from the start of the input - // Note: since capture groups are not allowed inside look-behinds, - // there won't be any Capture epsilon transitions and hence it is ok to - // use &mut [] for the slots parameter. We need to add the start states - // in reverse because nested look-behinds have a higher index but must - // be executed first. 
- for look_behind_start in self.nfa.look_behind_starts() { - self.epsilon_closure( - stack, - &mut [], - curr_lookaround, - lookaround, - input, - 0, - *look_behind_start, - ); - } + if let Some(active) = match_lookaround { + *curr_lookaround = active.clone(); + } else { + // This initializes the look-behind threads from the start of the input + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. We need to add the start states + // in reverse because nested look-behinds have a higher index but must + // be executed first. + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } - // This brings the look-behind threads into the state they must be for - // starting at input.start() instead of the beginning. This is - // necessary for look-behinds to be able to match outside of the input - // span. - for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); + // This brings the look-behind threads into the state they must be for + // starting at input.start() instead of the beginning. This is + // necessary for lookbehinds to be able to match outside of the input + // span. 
+ for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } } let mut hm = None; @@ -1428,6 +1436,9 @@ impl PikeVM { self.nexts(stack, curr, next, lookaround, input, at, slots) { hm = Some(HalfMatch::new(pid, at)); + if *keep_lookaround_state { + *match_lookaround = Some(curr_lookaround.clone()); + } } // Unless the caller asked us to return early, we need to mush on // to see if we can extend our match. (But note that 'nexts' will @@ -1496,6 +1507,10 @@ impl PikeVM { ref mut lookaround, ref mut curr_lookaround, ref mut next_lookaround, + // It makes no sense to keep any look-behind state for this version of + // the search, since the caller receives no information about + // where the search ended. + .. } = cache; for look_behind_start in self.nfa.look_behind_starts() { @@ -1989,10 +2004,14 @@ impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { *self; // 'advance' converts errors into panics, which is OK here because // the PikeVM can never return an error. - it.advance(|input| { + let result = it.advance(|input| { re.search(cache, input, caps); Ok(caps.get_match()) - }) + }); + if result.is_none() { + cache.keep_lookaround_state(false); + } + result } } @@ -2034,6 +2053,7 @@ impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { if caps.is_match() { Some(caps.clone()) } else { + cache.keep_lookaround_state(false); None } } @@ -2070,6 +2090,12 @@ pub struct Cache { curr_lookaround: ActiveStates, /// The next set of states to be explored for look-behind subexpressions. next_lookaround: ActiveStates, + /// The active set of states when a match was found. This is needed + /// to resume a search without recomputing look-behind subexpressions. 
+ match_lookaround: Option, + /// When true, use the states of `match_lookaround` to initialize a search, + /// otherwise recompute from the beginning of the haystack. + keep_lookaround_state: bool, } impl Cache { @@ -2089,6 +2115,8 @@ impl Cache { lookaround: vec![None; re.lookaround_count()], curr_lookaround: ActiveStates::new(re), next_lookaround: ActiveStates::new(re), + match_lookaround: None, + keep_lookaround_state: false, } } @@ -2135,6 +2163,24 @@ impl Cache { self.curr_lookaround.reset(re); self.next_lookaround.reset(re); self.lookaround = vec![None; re.lookaround_count()]; + self.match_lookaround = None; + self.keep_lookaround_state = false; + } + + /// Set this cache to keep the state of look-behind assertions upon a + /// match being found. + /// + /// This must only be called with a value of `true` when a new search is + /// started at the end of a previously found match, otherwise the result + /// of any search after this call will most likely be wrong. + /// + /// Calling this function with a value of `false` will clear any previously + /// stored look-behind state. + pub fn keep_lookaround_state(&mut self, keep: bool) { + self.keep_lookaround_state = keep; + if !keep { + self.match_lookaround = None; + } } /// Returns the heap memory usage, in bytes, of this cache. @@ -2143,11 +2189,16 @@ impl Cache { /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { use core::mem::size_of; + let match_lookaround_memory = match &self.match_lookaround { + Some(ml) => ml.memory_usage(), + None => 0, + }; (self.stack.len() * size_of::()) + self.curr.memory_usage() + self.next.memory_usage() + self.curr_lookaround.memory_usage() + self.next_lookaround.memory_usage() + + match_lookaround_memory } /// Clears this cache. 
This should be called at the start of every search From 80f26074dac6180dc17ef833559ca0750f7b1b00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Wed, 23 Apr 2025 11:36:17 +0200 Subject: [PATCH 57/68] Implement matchall speedup for meta-engine --- regex-automata/src/meta/regex.rs | 26 +++++++++++++++++++++++--- regex-automata/src/meta/wrappers.rs | 6 ++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 8cfdecbec..6bc4bdc71 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -611,7 +611,8 @@ impl Regex { &'r self, input: I, ) -> FindMatches<'r, 'h> { - let cache = self.pool.get(); + let mut cache = self.pool.get(); + cache.keep_lookaround_state(true); let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, it } } @@ -652,7 +653,8 @@ impl Regex { &'r self, input: I, ) -> CapturesMatches<'r, 'h> { - let cache = self.pool.get(); + let mut cache = self.pool.get(); + cache.keep_lookaround_state(true); let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); CapturesMatches { re: self, cache, caps, it } @@ -2076,7 +2078,11 @@ impl<'r, 'h> Iterator for FindMatches<'r, 'h> { #[inline] fn next(&mut self) -> Option { let FindMatches { re, ref mut cache, ref mut it } = *self; - it.advance(|input| Ok(re.search_with(cache, input))) + let result = it.advance(|input| Ok(re.search_with(cache, input))); + if result.is_none() { + cache.keep_lookaround_state(false); + } + result } #[inline] @@ -2149,6 +2155,7 @@ impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { if caps.is_match() { Some(caps.clone()) } else { + cache.keep_lookaround_state(false); None } } @@ -2385,6 +2392,19 @@ impl Cache { re.imp.strat.reset_cache(self) } + /// Set this cache to keep the state of look-behind assertions upon a + /// match being found. 
+ /// + /// This must only be called with a value of `true` when a new search is + /// started at the end of a previously found match, otherwise the result + /// of any search after this call will most likely be wrong. + /// + /// Calling this function with a value of `false` will clear any previously + /// stored look-behind state. + pub fn keep_lookaround_state(&mut self, keep: bool) { + self.pikevm.keep_lookaround_state(keep); + } + /// Returns the heap memory usage, in bytes, of this cache. /// /// This does **not** include the stack size used up by this cache. To diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index f2f0ec5b7..aa432b7d7 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -133,6 +133,12 @@ impl PikeVMCache { PikeVMCache(Some(builder.get().0.create_cache())) } + pub(crate) fn keep_lookaround_state(&mut self, keep: bool) { + if let Some(cache) = self.0.as_mut() { + cache.keep_lookaround_state(keep); + } + } + pub(crate) fn reset(&mut self, builder: &PikeVM) { self.0.as_mut().unwrap().reset(&builder.get().0); } From 869cf0c930381a258388bee960ecc45e2dffcd3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 11:58:03 +0200 Subject: [PATCH 58/68] Replace catchall with explicit ignore --- regex-automata/src/nfa/thompson/pikevm.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index adba3386d..4c707bd15 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1510,7 +1510,8 @@ impl PikeVM { // It makes no sense to keep any look-behind state for this version of // the search, since the caller receives no information about // where the search ended. - .. 
+ keep_lookaround_state: _, + match_lookaround: _, } = cache; for look_behind_start in self.nfa.look_behind_starts() { From 0ce83566fb110b9619197cb4735ebedde9ce022d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 12:16:29 +0200 Subject: [PATCH 59/68] Rephrase doc and fix lb start state order --- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/pikevm.rs | 25 ++++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index a57f4fd75..9eeb60865 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1038,12 +1038,12 @@ impl Compiler { let unanchored = self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + self.builder.borrow_mut().start_look_behind(unanchored.start); let sub = self.c(lookaround.sub())?; let write = self.add_write_lookaround(idx)?; self.patch(unanchored.end, sub.start)?; self.patch(sub.end, write)?; - self.builder.borrow_mut().start_look_behind(unanchored.start); Ok(ThompsonRef { start: check, end: check }) } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 4c707bd15..df358a924 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1278,9 +1278,11 @@ impl PikeVM { // Note: since capture groups are not allowed inside look-behinds, // there won't be any Capture epsilon transitions and hence it is ok to // use &mut [] for the slots parameter. We need to add the start states - // in reverse because nested look-behinds have a higher index but must - // be executed first. - for look_behind_start in self.nfa.look_behind_starts() { + // in reverse because more deeply nested look-behinds have a higher index + // but must be executed first, so that the result is available for the + // outer expression. 
+ for look_behind_start in self.nfa.look_behind_starts().iter().rev() + { self.epsilon_closure( stack, &mut [], @@ -2091,8 +2093,10 @@ pub struct Cache { curr_lookaround: ActiveStates, /// The next set of states to be explored for look-behind subexpressions. next_lookaround: ActiveStates, - /// The active set of states when a match was found. This is needed - /// to resume a search without recomputing look-behind subexpressions. + /// The set of active threads, belonging to look-behind expressions, + /// when a match was found. This is needed to resume a search after a match + /// was found (to look for further matches), without having to re-scan the + /// beginning of the haystack. match_lookaround: Option, /// When true, use the states of `match_lookaround` to initialize a search, /// otherwise recompute from the beginning of the haystack. @@ -2168,12 +2172,13 @@ impl Cache { self.keep_lookaround_state = false; } - /// Set this cache to keep the state of look-behind assertions upon a - /// match being found. + /// Set this cache to store a copy of the active threads belonging + /// to look-behind assertions upon a match being found. /// - /// This must only be called with a value of `true` when a new search is - /// started at the end of a previously found match, otherwise the result - /// of any search after this call will most likely be wrong. + /// This is a performance optimization and must only be called with a + /// value of `true` when intending to start a new search at the end of + /// a previously found match. Otherwise, the result of look-behind + /// sub-expressions will be out of sync with the main regex. /// /// Calling this function with a value of `false` will clear any previously /// stored look-behind state. 
From aea8fa35cada32a5e195434f28b165c9420e5998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 14:50:16 +0200 Subject: [PATCH 60/68] Disable lookaround scanning when none present --- regex-automata/src/nfa/thompson/pikevm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index df358a924..48d6c3e24 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1273,7 +1273,7 @@ impl PikeVM { if let Some(active) = match_lookaround { *curr_lookaround = active.clone(); - } else { + } else if self.lookaround_count() > 0 { // This initializes the look-behind threads from the start of the input // Note: since capture groups are not allowed inside look-behinds, // there won't be any Capture epsilon transitions and hence it is ok to From e35c8d9e8f7b762cf38c406f3cf5445c8fc412d0 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 7 May 2025 21:40:07 +0200 Subject: [PATCH 61/68] Fast forward look-around threads upon prefiltering --- regex-automata/src/nfa/thompson/pikevm.rs | 95 +++++++++++++++-------- 1 file changed, 63 insertions(+), 32 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 48d6c3e24..7879d5791 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1293,24 +1293,16 @@ impl PikeVM { *look_behind_start, ); } - - // This brings the look-behind threads into the state they must be for - // starting at input.start() instead of the beginning. This is - // necessary for lookbehinds to be able to match outside of the input - // span. 
- for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); - } + // This is necessary for look-behinds to be able to match outside of the + // input span. + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); } let mut hm = None; @@ -1352,7 +1344,21 @@ impl PikeVM { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => break, - Some(ref span) => at = span.start, + Some(ref span) => { + if self.lookaround_count() > 0 { + // We are jumping ahead due to the pre-filter, thus we must bring + // the look-behind threads to the new position. + self.fast_forward_lookbehinds( + Span { start: at, end: span.start }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); + } + at = span.start + } } } } @@ -1459,6 +1465,36 @@ impl PikeVM { hm } + /// This brings the look-behind threads into the state they must be for + /// starting at [input.end]. The assumption is that they are currently + /// at [input.start]. + fn fast_forward_lookbehinds( + &self, + forward_span: Span, + input: &Input<'_>, + stack: &mut Vec, + curr_lookaround: &mut ActiveStates, + next_lookaround: &mut ActiveStates, + lookaround: &mut Vec>, + ) { + for lb_at in forward_span.start..forward_span.end { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + // Since capture groups are not allowed inside look-arounds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } + /// The implementation for the 'which_overlapping_matches' API. 
Basically, /// we do a single scan through the entire haystack (unless our regex /// or search is anchored) and record every pattern that matched. In @@ -1527,19 +1563,14 @@ impl PikeVM { *look_behind_start, ); } - for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); - } + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { From c43efb08134eb48eb3d17d422a60c1feb0f1a75c Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 7 May 2025 21:45:58 +0200 Subject: [PATCH 62/68] Add small test for prefiltered regex with lookbehind --- testdata/lookaround.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 14a303d7c..91fab56a0 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -84,3 +84,9 @@ matches = [ [[1, 3], [1, 2], [2, 3]], [[5, 7], [5, 6], [6, 7]], ] + +[[test]] +name = "lookbehind matching before the prefiltered start position" +regex = "b(?<=ab)" +haystack = "ab" +matches = [[1, 2]] From 3abfcfd5b3220b0bbb72f295c9dbba796105e021 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 08:03:40 +0200 Subject: [PATCH 63/68] Change literal extraction for look-arounds --- regex-automata/src/meta/strategy.rs | 5 +++++ regex-syntax/src/hir/literal.rs | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 0ac830b9d..19823b555 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -258,6 +258,11 @@ impl Pre<()> { if !info.props()[0].look_set().is_empty() { return None; } + // For a similar 
reason, we require that it has zero look-around + // expressions. + if info.props()[0].contains_lookaround_expr() { + return None; + } // Finally, currently, our prefilters are all oriented around // leftmost-first match semantics, so don't try to use them if the // caller asked for anything else. diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 7859327df..efa5e0ee6 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,8 +172,9 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), - LookAround(_) => Seq::infinite(), + Empty | Look(_) | LookAround(_) => { + Seq::singleton(self::Literal::exact(vec![])) + } Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); From 1f5c5c1e25609908f18875b450da5084509f3199 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 10:30:10 +0200 Subject: [PATCH 64/68] Update wrong doc --- regex-automata/src/nfa/thompson/pikevm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 7879d5791..f04b8201f 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1466,8 +1466,8 @@ impl PikeVM { } /// This brings the look-behind threads into the state they must be for - /// starting at [input.end]. The assumption is that they are currently - /// at [input.start]. + /// starting at [forward_span.end]. The assumption is that they are currently + /// at [forward_span.start]. 
fn fast_forward_lookbehinds( &self, forward_span: Span, From cbc452e229b6cb98b1796b53397032a1427bb7a2 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 10:30:22 +0200 Subject: [PATCH 65/68] Fix literal extraction tests --- regex-syntax/src/hir/literal.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index efa5e0ee6..e42ad8c2c 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2457,16 +2457,16 @@ mod tests { #[test] fn lookaround() { - assert_eq!(inexact([I("a")], [I("b")]), e(r"a(?<=qwa)b")); - assert_eq!(inexact([I("a")], [I("b")]), e(r"a(? Date: Thu, 8 May 2025 10:31:20 +0200 Subject: [PATCH 66/68] Reverse look_behind_starts --- regex-automata/src/nfa/thompson/pikevm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index f04b8201f..f980d3f69 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1552,7 +1552,7 @@ impl PikeVM { match_lookaround: _, } = cache; - for look_behind_start in self.nfa.look_behind_starts() { + for look_behind_start in self.nfa.look_behind_starts().iter().rev() { self.epsilon_closure( stack, &mut [], From d5e7dc331987c3d3ccf0fd1d36cf4e1e8c99c834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Tue, 20 May 2025 16:39:11 +0200 Subject: [PATCH 67/68] Fix NFA memory usage and typo --- regex-automata/src/nfa/thompson/nfa.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 8a73fa5df..ff5e0576f 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1185,6 +1185,7 @@ impl NFA { + self.0.states.len() * size_of::() + self.0.start_pattern.len() * size_of::() + 
self.0.group_info.memory_usage() + + self.0.start_look_behind.len() * size_of::() + self.0.memory_extra } } @@ -1276,7 +1277,7 @@ pub(super) struct Inner { /// This is needed to initialize the table for storing the result of /// look-around evaluation. lookaround_count: usize, - /// Contains the start states for each of the look-behind subexpressions. + /// Contains the start state for each of the look-behind subexpressions. start_look_behind: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state From 3d13971b44c08ad4dafd653a9adfaaddc24f3921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 12 Jun 2025 17:51:58 +0200 Subject: [PATCH 68/68] Fix lookaround index initialization --- regex-automata/src/nfa/thompson/compiler.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 9eeb60865..1bfd0a836 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -965,6 +965,7 @@ impl Compiler { self.builder .borrow_mut() .set_size_limit(self.config.get_nfa_size_limit())?; + *self.lookaround_index.borrow_mut() = SmallIndex::ZERO; // We always add an unanchored prefix unless we were specifically told // not to (for tests only), or if we know that the regex is anchored