From 363d4af810f5213b6d1a73d466eea76e75398fc9 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Mon, 18 Jul 2022 10:54:01 +0200 Subject: [PATCH 1/3] Fix #16: export `decode{PatternSet|CharacterClass}` from `Pattern` --- lib/Text/Regex/TDFA/Pattern.hs | 35 ++++++++++++++++++++++-- lib/Text/Regex/TDFA/TNFA.hs | 50 ++++------------------------------ 2 files changed, 39 insertions(+), 46 deletions(-) diff --git a/lib/Text/Regex/TDFA/Pattern.hs b/lib/Text/Regex/TDFA/Pattern.hs index b6fedf2..53e527f 100644 --- a/lib/Text/Regex/TDFA/Pattern.hs +++ b/lib/Text/Regex/TDFA/Pattern.hs @@ -12,6 +12,7 @@ module Text.Regex.TDFA.Pattern ,PatternSetEquivalenceClass(..) ,GroupIndex ,DoPa(..) + ,decodeCharacterClass, decodePatternSet ,showPattern -- ** Internal use ,starTrans @@ -22,8 +23,8 @@ module Text.Regex.TDFA.Pattern {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} import Data.List(intersperse,partition) -import qualified Data.Set as Set(toAscList,toList) -import Data.Set(Set) -- XXX EnumSet +import qualified Data.Set as Set +import Data.Set (Set) import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error) err :: String -> a @@ -131,6 +132,36 @@ instance Show PatternSetCollatingElement where instance Show PatternSetEquivalenceClass where showsPrec _ p = showChar '[' . showChar '=' . shows (unSEC p) . showChar '=' . showChar ']' +-- | @decodePatternSet@ cannot handle collating element and treats +-- equivalence classes as just their definition and nothing more. +decodePatternSet :: PatternSet -> Set Char +decodePatternSet (PatternSet msc mscc _ msec) = + let baseMSC = maybe Set.empty id msc + withMSCC = foldl (flip Set.insert) baseMSC (maybe [] (concatMap decodeCharacterClass . Set.toAscList) mscc) + withMSEC = foldl (flip Set.insert) withMSCC (maybe [] (concatMap unSEC . Set.toAscList) msec) + in withMSEC + +-- | This returns the strictly ascending list of characters +-- represented by @[: :]@ POSIX character classes. +-- Unrecognized class names return an empty string. +decodeCharacterClass :: PatternSetCharacterClass -> String +decodeCharacterClass (PatternSetCharacterClass s) = + case s of + "alnum" -> ['0'..'9']++['A'..'Z']++['a'..'z'] + "digit" -> ['0'..'9'] + "punct" -> ['\33'..'\47']++['\58'..'\64']++['\91'..'\96']++['\123'..'\126'] + "alpha" -> ['A'..'Z']++['a'..'z'] + "graph" -> ['\41'..'\126'] + "space" -> "\t\n\v\f\r " + "blank" -> "\t " + "lower" -> ['a'..'z'] + "upper" -> ['A'..'Z'] + "cntrl" -> ['\0'..'\31']++"\127" -- with NUL + "print" -> ['\32'..'\126'] + "xdigit" -> ['0'..'9']++['A'..'F']++['a'..'f'] + "word" -> ['0'..'9']++['A'..'Z']++"_"++['a'..'z'] + _ -> [] + -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- | Do the transformation and simplification in a single traversal. diff --git a/lib/Text/Regex/TDFA/TNFA.hs b/lib/Text/Regex/TDFA/TNFA.hs index 9fa6437..a03a0dc 100644 --- a/lib/Text/Regex/TDFA/TNFA.hs +++ b/lib/Text/Regex/TDFA/TNFA.hs @@ -29,8 +29,10 @@ -- -- Uses recursive do notation. -module Text.Regex.TDFA.TNFA(patternToNFA - ,QNFA(..),QT(..),QTrans,TagUpdate(..)) where +module Text.Regex.TDFA.TNFA + ( patternToNFA + , QNFA(..), QT(..), QTrans, TagUpdate(..) + ) where {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} @@ -48,7 +50,7 @@ import Data.IntSet.EnumSet2(EnumSet) import qualified Data.IntSet.EnumSet2 as Set(singleton,toList,insert) import Data.Maybe(catMaybes,isNothing) import Data.Monoid as Mon(Monoid(..)) -import qualified Data.Set as S(Set,insert,toAscList,empty) +import qualified Data.Set as S (insert, toAscList) import Text.Regex.TDFA.Common(QT(..),QNFA(..),QTrans,TagTask(..),TagUpdate(..),DoPa(..) ,CompOption(..) @@ -57,8 +59,7 @@ import Text.Regex.TDFA.Common(QT(..),QNFA(..),QTrans,TagTask(..),TagUpdate(..),D import Text.Regex.TDFA.CorePattern(Q(..),P(..),OP(..),WhichTest,cleanNullView,NullView ,SetTestInfo(..),Wanted(..),TestInfo ,mustAccept,cannotAccept,patternToQ) -import Text.Regex.TDFA.Pattern(Pattern(..),PatternSet(..),unSEC,PatternSetCharacterClass(..)) ---import Debug.Trace +import Text.Regex.TDFA.Pattern (Pattern(..), decodePatternSet) ecart :: String -> a -> a ecart _ = id @@ -785,42 +786,3 @@ qt_win seems to only allow PreUpdate so why keep the same type? ADD ORPHAN ID check and make this a fatal error while testing -} - --- | decodePatternSet cannot handle collating element and treats --- equivalence classes as just their definition and nothing more. -decodePatternSet :: PatternSet -> S.Set Char -decodePatternSet (PatternSet msc mscc _ msec) = - let baseMSC = maybe S.empty id msc - withMSCC = foldl (flip S.insert) baseMSC (maybe [] (concatMap decodeCharacterClass . S.toAscList) mscc) - withMSEC = foldl (flip S.insert) withMSCC (maybe [] (concatMap unSEC . S.toAscList) msec) - in withMSEC - --- | This returns the distinct ascending list of characters --- represented by [: :] values in legalCharacterClasses; unrecognized --- class names return an empty string -decodeCharacterClass :: PatternSetCharacterClass -> String -decodeCharacterClass (PatternSetCharacterClass s) = - case s of - "alnum" -> ['0'..'9']++['a'..'z']++['A'..'Z'] - "digit" -> ['0'..'9'] - "punct" -> ['\33'..'\47']++['\58'..'\64']++['\91'..'\95']++"\96"++['\123'..'\126'] - "alpha" -> ['a'..'z']++['A'..'Z'] - "graph" -> ['\41'..'\126'] - "space" -> "\t\n\v\f\r " - "blank" -> "\t " - "lower" -> ['a'..'z'] - "upper" -> ['A'..'Z'] - "cntrl" -> ['\0'..'\31']++"\127" -- with NUL - "print" -> ['\32'..'\126'] - "xdigit" -> ['0'..'9']++['a'..'f']++['A'..'F'] - "word" -> ['0'..'9']++['a'..'z']++['A'..'Z']++"_" - _ -> [] - -{- --- | This is the list of recognized [: :] character classes, others --- are decoded as empty. -legalCharacterClasses :: [String] -legalCharacterClasses = ["alnum","digit","punct","alpha","graph" - ,"space","blank","lower","upper","cntrl","print","xdigit","word"] - --} From 26477989983e948b5c78f8e340b1c6021770077d Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Mon, 18 Jul 2022 20:00:57 +0200 Subject: [PATCH 2/3] Extend and polish haddock for `Pattern` module --- lib/Text/Regex/TDFA/Common.hs | 12 +-- lib/Text/Regex/TDFA/Pattern.hs | 134 ++++++++++++++++++++++----------- 2 files changed, 96 insertions(+), 50 deletions(-) diff --git a/lib/Text/Regex/TDFA/Common.hs b/lib/Text/Regex/TDFA/Common.hs index a9884d4..049d0aa 100644 --- a/lib/Text/Regex/TDFA/Common.hs +++ b/lib/Text/Regex/TDFA/Common.hs @@ -109,26 +109,26 @@ data ExecOption = ExecOption { captureGroups :: Bool -- ^ True by default. Set to False to improve speed (and space). } deriving (Read,Show) --- | Used by implementation to name certain Postions during --- matching. Identity of Position tag to set during a transition. +-- | Used by implementation to name certain 'Postion's during +-- matching. Identity of 'Position' tag to set during a transition. type Tag = Int --- | Internal use to indicate type of tag and preference for larger or smaller Positions. +-- | Internal use to indicate type of tag and preference for larger or smaller 'Position's. data OP = Maximize | Minimize | Orbit | Ignore deriving (Eq,Show) -- | Internal NFA node identity number. type Index = Int --- | Internal DFA identity is this Set of NFA Index. +-- | Internal DFA identity is this 'Set' of NFA 'Index'. type SetIndex = IntSet {- Index -} -- | Index into the text being searched. type Position = Int --- | GroupIndex is for indexing submatches from capturing parenthesized groups ('PGroup' or 'Group'). +-- | 'GroupIndex' is for indexing submatches from capturing parenthesized groups ('PGroup' or 'Group'). type GroupIndex = Int --- | GroupInfo collects the parent and tag information for an instance of a group. +-- | 'GroupInfo' collects the parent and tag information for an instance of a group. data GroupInfo = GroupInfo { thisIndex, parentIndex :: GroupIndex , startTag, stopTag, flagTag :: Tag diff --git a/lib/Text/Regex/TDFA/Pattern.hs b/lib/Text/Regex/TDFA/Pattern.hs index 53e527f..e75b2b7 100644 --- a/lib/Text/Regex/TDFA/Pattern.hs +++ b/lib/Text/Regex/TDFA/Pattern.hs @@ -2,7 +2,7 @@ -- | This "Text.Regex.TDFA.Pattern" module provides the 'Pattern' data -- type and its subtypes. This 'Pattern' type is used to represent --- the parsed form of a Regular Expression. +-- the parsed form of a regular expression. module Text.Regex.TDFA.Pattern (Pattern(..) @@ -16,7 +16,7 @@ module Text.Regex.TDFA.Pattern ,showPattern -- ** Internal use ,starTrans --- ** Internal use, Operations to support debugging under ghci +-- ** Internal use, operations to support debugging under @ghci@ ,starTrans',simplify',dfsPattern ) where @@ -30,35 +30,64 @@ import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error) err :: String -> a err = common_error "Text.Regex.TDFA.Pattern" --- | Pattern is the type returned by the regular expression parser. --- This is consumed by the CorePattern module and the tender leaves --- are nibbled by the TNFA module. -data Pattern = PEmpty - | PGroup (Maybe GroupIndex) Pattern -- Nothing to indicate non-matching PGroup (Nothing never used!) - | POr [Pattern] -- flattened by starTrans - | PConcat [Pattern] -- flattened by starTrans - | PQuest Pattern -- eliminated by starTrans - | PPlus Pattern -- eliminated by starTrans - | PStar Bool Pattern -- True means mayFirstBeNull is True - | PBound Int (Maybe Int) Pattern -- eliminated by starTrans - -- The rest of these need an index of where in the regex string it is from - | PCarat {getDoPa::DoPa} - | PDollar {getDoPa::DoPa} - -- The following test and accept a single character - | PDot {getDoPa::DoPa} -- Any character (newline?) at all - | PAny {getDoPa::DoPa,getPatternSet::PatternSet} -- Square bracketed things - | PAnyNot {getDoPa::DoPa,getPatternSet::PatternSet} -- Inverted square bracketed things - | PEscape {getDoPa::DoPa,getPatternChar::Char} -- Backslashed Character - | PChar {getDoPa::DoPa,getPatternChar::Char} -- Specific Character - -- The following are semantic tags created in starTrans, not the parser - | PNonCapture Pattern -- introduced by starTrans - | PNonEmpty Pattern -- introduced by starTrans - deriving (Eq,Show) - --- | I have not been checking, but this should have the property that --- parsing the resulting string should result in an identical Pattern. --- This is not true if starTrans has created PNonCapture and PNonEmpty --- values or a (PStar False). The contents of a "[ ]" grouping are +-- | 'Pattern' is the type returned by the regular expression parser 'parseRegex'. +-- This is consumed by the "Text.Regex.TDFA.CorePattern" module and the tender leaves +-- are nibbled by the "Text.Regex.TDFA.TNFA" module. +-- +-- The 'DoPa' field is the index of the component in the regex string @r@. +data Pattern + = PEmpty + -- ^ @()@, matches the empty string. + | PGroup (Maybe GroupIndex) Pattern + -- ^ Group @(r)@. @Nothing@ indicates non-matching 'PGroup' + -- (never produced by parser 'parseRegex'). + | POr [Pattern] + -- ^ Alternative @r|s@ (flattened by 'starTrans'). + | PConcat [Pattern] + -- ^ Sequence @rs@ (flattened by 'starTrans'). + | PQuest Pattern + -- ^ Zero or one repetitions @r?@ (eliminated by 'starTrans'). + | PPlus Pattern + -- ^ One or more repetitions @r+@ (eliminated by 'starTrans'). + | PStar Bool Pattern + -- ^ Zero or more repetitions @r*@. + -- @True@ (default) means may accept the empty string on its first iteration. + | PBound Int (Maybe Int) Pattern + -- ^ Given number or repetitions @r{n}@ or @r{n,m}@ + -- (eliminated by 'starTrans'). + + -- The rest of these need an index of where in the regex string it is from + | PCarat { getDoPa :: DoPa } + -- ^ @^@ matches beginning of input. + | PDollar { getDoPa :: DoPa } + -- ^ @$@ matches end of input. + + -- The following test and accept a single character + | PDot { getDoPa :: DoPa } + -- ^ @.@ matches any character. + | PAny { getDoPa :: DoPa, getPatternSet :: PatternSet } + -- ^ Bracket expression @[...]@. + | PAnyNot { getDoPa :: DoPa, getPatternSet :: PatternSet } + -- ^ Inverted bracket expression @[^...]@. + | PEscape { getDoPa :: DoPa, getPatternChar :: Char } + -- ^ Backslashed character @\c@, may have special meaning. + | PChar { getDoPa :: DoPa, getPatternChar :: Char } + -- ^ Single character, matches given character. + + -- The following are semantic tags created in starTrans, not the parser + | PNonCapture Pattern + -- ^ Tag for internal use, introduced by 'starTrans'. + | PNonEmpty Pattern + -- ^ Tag for internal use, introduced by 'starTrans'. + deriving (Eq, Show) + +-- Andreas Abel, 2022-07-18, issue #47: +-- The following claim is FALSE: +-- +-- I have not been checking, but this should have the property that +-- parsing the resulting string should result in an identical 'Pattern'. +-- This is not true if 'starTrans' has created 'PNonCapture' and 'PNonEmpty' +-- values or a @'PStar' False@. The contents of a @[...]@ grouping are -- always shown in a sorted canonical order. showPattern :: Pattern -> String showPattern pIn = @@ -93,12 +122,19 @@ showPattern pIn = -} paren s = ('(':s)++")" +-- | Content of a bracket expression @[...]@ organized into +-- characters, +-- POSIX character classes (e.g. @[[:alnum:]]@), +-- collating elements (e.g. @[.ch.]@, unused), and +-- equivalence classes (e.g. @[=a=]@, treated as characters). +-- data PatternSet = PatternSet (Maybe (Set Char)) (Maybe (Set PatternSetCharacterClass)) (Maybe (Set PatternSetCollatingElement)) (Maybe (Set PatternSetEquivalenceClass)) deriving (Eq) +-- | Hand-rolled implementation, giving textual rather than Haskell representation. instance Show PatternSet where showsPrec i (PatternSet s scc sce sec) = let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s @@ -118,17 +154,27 @@ instance Show PatternSet where groupRange x n [] = if n <=3 then take n [x..] else x:'-':(toEnum (pred n+fromEnum x)):[] +-- | Content of @[: :]@, e.g. @"alnum"@ for @[:alnum:]@. newtype PatternSetCharacterClass = PatternSetCharacterClass {unSCC::String} deriving (Eq,Ord) + +-- | Content of @[. .]@, e.g. @"ch"@ for @[.ch.]@. newtype PatternSetCollatingElement = PatternSetCollatingElement {unSCE::String} deriving (Eq,Ord) + +-- | Content of @[= =]@, e.g. @"a"@ for @[=a=]@. newtype PatternSetEquivalenceClass = PatternSetEquivalenceClass {unSEC::String} deriving (Eq,Ord) +-- | Hand-rolled implementation, giving textual rather than Haskell representation. instance Show PatternSetCharacterClass where showsPrec _ p = showChar '[' . showChar ':' . shows (unSCC p) . showChar ':' . showChar ']' + +-- | Hand-rolled implementation, giving textual rather than Haskell representation. instance Show PatternSetCollatingElement where showsPrec _ p = showChar '[' . showChar '.' . shows (unSCE p) . showChar '.' . showChar ']' + +-- | Hand-rolled implementation, giving textual rather than Haskell representation. instance Show PatternSetEquivalenceClass where showsPrec _ p = showChar '[' . showChar '=' . shows (unSEC p) . showChar '=' . showChar ']' @@ -165,18 +211,18 @@ decodeCharacterClass (PatternSetCharacterClass s) = -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- | Do the transformation and simplification in a single traversal. --- This removes the PPlus, PQuest, and PBound values, changing to POr --- and PEmpty and PStar True\/False. For some PBound values it adds --- PNonEmpty and PNonCapture semantic marker. It also simplifies to --- flatten out nested POr and PConcat instances and eliminate some --- unneeded PEmpty values. +-- This removes the 'PPlus', 'PQuest', and 'PBound' values, changing to 'POr' +-- and 'PEmpty' and 'PStar'. For some 'PBound' values it adds +-- 'PNonEmpty' and 'PNonCapture' semantic marker. It also simplifies to +-- flatten out nested 'POr' and 'PConcat' instances and eliminate some +-- unneeded 'PEmpty' values. starTrans :: Pattern -> Pattern starTrans = dfsPattern (simplify' . starTrans') --- | Apply a Pattern transformation function depth first -dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function - -> Pattern -- ^ The Pattern to transform - -> Pattern -- ^ The transformed Pattern +-- | Apply a 'Pattern' transformation function depth first. +dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function. + -> Pattern -- ^ The 'Pattern' to transform. + -> Pattern -- ^ The transformed 'Pattern'. dfsPattern f = dfs where unary c = f . c . dfs dfs pattern = case pattern of @@ -354,7 +400,7 @@ starTrans' pIn = pass = pIn -- | Function to transform a pattern into an equivalent, but less --- redundant form. Nested 'POr' and 'PConcat' are flattened. PEmpty +-- redundant form. Nested 'POr' and 'PConcat' are flattened. 'PEmpty' -- is propagated. simplify' :: Pattern -> Pattern simplify' x@(POr _) = @@ -376,7 +422,7 @@ simplify' (PNonCapture PEmpty) = PEmpty -- 2009, perhaps useful --simplify' (PNonEmpty PEmpty) = err "simplify' (PNonEmpty PEmpty) = should be Impossible!" -- 2009 simplify' other = other --- | Function to flatten nested POr or nested PConcat applicataions. +-- | Function to flatten nested 'POr' or nested 'PConcat' applicataions. flatten :: Pattern -> [Pattern] flatten (POr ps) = (concatMap (\x -> case x of POr ps' -> ps' @@ -390,8 +436,8 @@ notPEmpty :: Pattern -> Bool notPEmpty PEmpty = False notPEmpty _ = True --- | Determines if pIn will fail or accept [] and never accept any --- characters. Treat PCarat and PDollar as True. +-- | Determines if 'Pattern' will fail or accept @[]@ and never accept any +-- characters. Treat 'PCarat' and 'PDollar' as @True@. canOnlyMatchNull :: Pattern -> Bool canOnlyMatchNull pIn = case pIn of From 0dc80f58e3046e29e51ec33133b03447ac4f6800 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Mon, 18 Jul 2022 17:52:24 +0200 Subject: [PATCH 3/3] Bump to 1.3.2 and CHANGELOG --- CHANGELOG.md | 9 +++++++++ README.md | 2 +- lib/Text/Regex/TDFA.hs | 2 +- lib/Text/Regex/TDFA/Pattern.hs | 4 ++++ regex-tdfa.cabal | 4 ++-- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca3818d..20a0eab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ For the package version policy (PVP), see http://pvp.haskell.org/faq . +### 1.3.2 + +_2022-07-18, Andreas Abel_ + +- Export `decodePatternSet` and `decodeCharacterClass` from `Text.Regex.TDFA.Pattern` + ([#16](https://github.com/haskell-hvr/regex-tdfa/issues/16)) +- Extend and correct docs for `Pattern` module +- Tested with GHC 7.4 - 9.4 + ### 1.3.1.5 _2022-07-18, Andreas Abel_ diff --git a/README.md b/README.md index 5533e0f..7375909 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The name "tdfa" stands for Tagged-DFA. [Declare a dependency](https://www.haskell.org/cabal/users-guide/developing-packages.html#pkg-field-build-depends) on the `regex-tdfa` library in your `.cabal` file: ``` -build-depends: regex-tdfa ^>= 1.3.1 +build-depends: regex-tdfa ^>= 1.3.2 ``` In Haskell modules where you need to use regexes `import` the respective `regex-tdfa` module: diff --git a/lib/Text/Regex/TDFA.hs b/lib/Text/Regex/TDFA.hs index 9541b2b..ab89302 100644 --- a/lib/Text/Regex/TDFA.hs +++ b/lib/Text/Regex/TDFA.hs @@ -21,7 +21,7 @@ OS's bugs. Declare a dependency on the @regex-tdfa@ library in your @.cabal@ file: -> build-depends: regex-tdfa ^>= 1.3.1.1 +> build-depends: regex-tdfa ^>= 1.3.2 In Haskell modules where you want to use regexes simply @import@ /this/ module: diff --git a/lib/Text/Regex/TDFA/Pattern.hs b/lib/Text/Regex/TDFA/Pattern.hs index e75b2b7..066f1c7 100644 --- a/lib/Text/Regex/TDFA/Pattern.hs +++ b/lib/Text/Regex/TDFA/Pattern.hs @@ -180,6 +180,8 @@ instance Show PatternSetEquivalenceClass where -- | @decodePatternSet@ cannot handle collating element and treats -- equivalence classes as just their definition and nothing more. +-- +-- @since 1.3.2 decodePatternSet :: PatternSet -> Set Char decodePatternSet (PatternSet msc mscc _ msec) = let baseMSC = maybe Set.empty id msc @@ -190,6 +192,8 @@ decodePatternSet (PatternSet msc mscc _ msec) = -- | This returns the strictly ascending list of characters -- represented by @[: :]@ POSIX character classes. -- Unrecognized class names return an empty string. +-- +-- @since 1.3.2 decodeCharacterClass :: PatternSetCharacterClass -> String decodeCharacterClass (PatternSetCharacterClass s) = case s of diff --git a/regex-tdfa.cabal b/regex-tdfa.cabal index b3b8089..1ee967d 100644 --- a/regex-tdfa.cabal +++ b/regex-tdfa.cabal @@ -1,6 +1,6 @@ cabal-version: 1.12 name: regex-tdfa -version: 1.3.1.5 +version: 1.3.2 build-Type: Simple license: BSD3 @@ -46,7 +46,7 @@ source-repository head source-repository this type: git location: https://github.com/haskell-hvr/regex-tdfa.git - tag: v1.3.1.5 + tag: v1.3.2 flag force-O2 default: False