haskell-hvr · andreasabel · Jul 18, 2022 · Jul 18, 2022 · Jul 18, 2022 · Jul 18, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,14 @@
 For the package version policy (PVP), see  http://pvp.haskell.org/faq .
 
+### 1.3.2
+
+_2022-07-18, Andreas Abel_
+
+- Export `decodePatternSet` and `decodeCharacterClass` from `Text.Regex.TDFA.Pattern`
+  ([#16](https://github.com/haskell-hvr/regex-tdfa/issues/16))
+- Extend and correct docs for `Pattern` module
+- Tested with GHC 7.4 - 9.4
+
 ### 1.3.1.5
 
 _2022-07-18, Andreas Abel_

diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ The name "tdfa" stands for Tagged-DFA.
 [Declare a dependency](https://www.haskell.org/cabal/users-guide/developing-packages.html#pkg-field-build-depends) on the `regex-tdfa` library in your `.cabal` file:
 
 ```
-build-depends: regex-tdfa ^>= 1.3.1
+build-depends: regex-tdfa ^>= 1.3.2
 ```
 
 In Haskell modules where you need to use regexes `import` the respective `regex-tdfa` module:

diff --git a/lib/Text/Regex/TDFA.hs b/lib/Text/Regex/TDFA.hs
@@ -21,7 +21,7 @@ OS's bugs.
 
 Declare a dependency on the @regex-tdfa@ library in your @.cabal@ file:
 
-> build-depends: regex-tdfa ^>= 1.3.1.1
+> build-depends: regex-tdfa ^>= 1.3.2
 
 In Haskell modules where you want to use regexes simply @import@ /this/ module:
 

diff --git a/lib/Text/Regex/TDFA/Common.hs b/lib/Text/Regex/TDFA/Common.hs
@@ -109,26 +109,26 @@ data ExecOption = ExecOption {
     captureGroups :: Bool    -- ^ True by default.  Set to False to improve speed (and space).
   } deriving (Read,Show)
 
--- | Used by implementation to name certain Postions during
--- matching. Identity of Position tag to set during a transition.
+-- | Used by implementation to name certain 'Postion's during
+-- matching. Identity of 'Position' tag to set during a transition.
 type Tag = Int
 
--- | Internal use to indicate type of tag and preference for larger or smaller Positions.
+-- | Internal use to indicate type of tag and preference for larger or smaller 'Position's.
 data OP = Maximize | Minimize | Orbit | Ignore deriving (Eq,Show)
 
 -- | Internal NFA node identity number.
 type Index = Int
 
--- | Internal DFA identity is this Set of NFA Index.
+-- | Internal DFA identity is this 'Set' of NFA 'Index'.
 type SetIndex = IntSet {- Index -}
 
 -- | Index into the text being searched.
 type Position = Int
 
--- | GroupIndex is for indexing submatches from capturing parenthesized groups ('PGroup' or 'Group').
+-- | 'GroupIndex' is for indexing submatches from capturing parenthesized groups ('PGroup' or 'Group').
 type GroupIndex = Int
 
--- | GroupInfo collects the parent and tag information for an instance of a group.
+-- | 'GroupInfo' collects the parent and tag information for an instance of a group.
 data GroupInfo = GroupInfo {
     thisIndex, parentIndex :: GroupIndex
   , startTag, stopTag, flagTag :: Tag

diff --git a/lib/Text/Regex/TDFA/Pattern.hs b/lib/Text/Regex/TDFA/Pattern.hs
@@ -2,7 +2,7 @@
 
 -- | This "Text.Regex.TDFA.Pattern" module provides the 'Pattern' data
 -- type and its subtypes.  This 'Pattern' type is used to represent
--- the parsed form of a Regular Expression.
+-- the parsed form of a regular expression.
 
 module Text.Regex.TDFA.Pattern
     (Pattern(..)
@@ -12,52 +12,82 @@ module Text.Regex.TDFA.Pattern
     ,PatternSetEquivalenceClass(..)
     ,GroupIndex
     ,DoPa(..)
+    ,decodeCharacterClass, decodePatternSet
     ,showPattern
 -- ** Internal use
     ,starTrans
--- ** Internal use, Operations to support debugging under ghci
+-- ** Internal use, operations to support debugging under @ghci@
     ,starTrans',simplify',dfsPattern
     ) where
 
 {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}
 
 import Data.List(intersperse,partition)
-import qualified Data.Set as Set(toAscList,toList)
-import Data.Set(Set) -- XXX EnumSet
+import qualified Data.Set as Set
+import Data.Set (Set)
 import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error)
 
 err :: String -> a
 err = common_error "Text.Regex.TDFA.Pattern"
 
--- | Pattern is the type returned by the regular expression parser.
--- This is consumed by the CorePattern module and the tender leaves
--- are nibbled by the TNFA module.
-data Pattern = PEmpty
-             | PGroup  (Maybe GroupIndex) Pattern -- Nothing to indicate non-matching PGroup (Nothing never used!)
-             | POr     [Pattern]                  -- flattened by starTrans
-             | PConcat [Pattern]                  -- flattened by starTrans
-             | PQuest  Pattern                    -- eliminated by starTrans
-             | PPlus   Pattern                    -- eliminated by starTrans
-             | PStar   Bool Pattern               -- True means mayFirstBeNull is True
-             | PBound  Int (Maybe Int) Pattern    -- eliminated by starTrans
-             -- The rest of these need an index of where in the regex string it is from
-             | PCarat  {getDoPa::DoPa}
-             | PDollar {getDoPa::DoPa}
-             -- The following test and accept a single character
-             | PDot    {getDoPa::DoPa}            -- Any character (newline?) at all
-             | PAny    {getDoPa::DoPa,getPatternSet::PatternSet} -- Square bracketed things
-             | PAnyNot {getDoPa::DoPa,getPatternSet::PatternSet} -- Inverted square bracketed things
-             | PEscape {getDoPa::DoPa,getPatternChar::Char}      -- Backslashed Character
-             | PChar   {getDoPa::DoPa,getPatternChar::Char}      -- Specific Character
-             -- The following are semantic tags created in starTrans, not the parser
-             | PNonCapture Pattern               -- introduced by starTrans
-             | PNonEmpty Pattern                 -- introduced by starTrans
-               deriving (Eq,Show)
-
--- | I have not been checking, but this should have the property that
--- parsing the resulting string should result in an identical Pattern.
--- This is not true if starTrans has created PNonCapture and PNonEmpty
--- values or a (PStar False).  The contents of a "[ ]" grouping are
+-- | 'Pattern' is the type returned by the regular expression parser 'parseRegex'.
+-- This is consumed by the "Text.Regex.TDFA.CorePattern" module and the tender leaves
+-- are nibbled by the "Text.Regex.TDFA.TNFA" module.
+--
+-- The 'DoPa' field is the index of the component in the regex string @r@.
+data Pattern
+  = PEmpty
+      -- ^ @()@, matches the empty string.
+  | PGroup  (Maybe GroupIndex) Pattern
+      -- ^ Group @(r)@.  @Nothing@ indicates non-matching 'PGroup'
+      -- (never produced by parser 'parseRegex').
+  | POr     [Pattern]
+      -- ^ Alternative @r|s@ (flattened by 'starTrans').
+  | PConcat [Pattern]
+      -- ^ Sequence @rs@ (flattened by 'starTrans').
+  | PQuest  Pattern
+      -- ^ Zero or one repetitions @r?@ (eliminated by 'starTrans').
+  | PPlus   Pattern
+      -- ^ One or more repetitions @r+@ (eliminated by 'starTrans').
+  | PStar   Bool Pattern
+      -- ^ Zero or more repetitions @r*@.
+      -- @True@ (default) means may accept the empty string on its first iteration.
+  | PBound  Int (Maybe Int) Pattern
+      -- ^ Given number or repetitions @r{n}@ or @r{n,m}@
+      -- (eliminated by 'starTrans').
+
+  -- The rest of these need an index of where in the regex string it is from
+  | PCarat  { getDoPa :: DoPa }
+      -- ^ @^@ matches beginning of input.
+  | PDollar { getDoPa :: DoPa }
+      -- ^ @$@ matches end of input.
+
+  -- The following test and accept a single character
+  | PDot    { getDoPa :: DoPa }
+      -- ^ @.@ matches any character.
+  | PAny    { getDoPa :: DoPa, getPatternSet :: PatternSet }
+      -- ^ Bracket expression @[...]@.
+  | PAnyNot { getDoPa :: DoPa, getPatternSet :: PatternSet }
+      -- ^ Inverted bracket expression @[^...]@.
+  | PEscape { getDoPa :: DoPa, getPatternChar :: Char }
+      -- ^ Backslashed character @\c@, may have special meaning.
+  | PChar   { getDoPa :: DoPa, getPatternChar :: Char }
+      -- ^ Single character, matches given character.
+
+  -- The following are semantic tags created in starTrans, not the parser
+  | PNonCapture Pattern
+     -- ^ Tag for internal use, introduced by 'starTrans'.
+  | PNonEmpty Pattern
+     -- ^ Tag for internal use, introduced by 'starTrans'.
+  deriving (Eq, Show)
+
+-- Andreas Abel, 2022-07-18, issue #47:
+-- The following claim is FALSE:
+--
+-- I have not been checking, but this should have the property that
+-- parsing the resulting string should result in an identical 'Pattern'.
+-- This is not true if 'starTrans' has created 'PNonCapture' and 'PNonEmpty'
+-- values or a @'PStar' False@.  The contents of a @[...]@ grouping are
 -- always shown in a sorted canonical order.
 showPattern :: Pattern -> String
 showPattern pIn =
@@ -92,12 +122,19 @@ showPattern pIn =
 -}
         paren s = ('(':s)++")"
 
+-- | Content of a bracket expression @[...]@ organized into
+-- characters,
+-- POSIX character classes (e.g. @[[:alnum:]]@),
+-- collating elements (e.g. @[.ch.]@, unused), and
+-- equivalence classes (e.g. @[=a=]@, treated as characters).
+--
 data PatternSet = PatternSet (Maybe (Set Char))
                              (Maybe (Set PatternSetCharacterClass))
                              (Maybe (Set PatternSetCollatingElement))
                              (Maybe (Set PatternSetEquivalenceClass))
                              deriving (Eq)
 
+-- | Hand-rolled implementation, giving textual rather than Haskell representation.
 instance Show PatternSet where
   showsPrec i (PatternSet s scc sce sec) =
     let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s
@@ -117,35 +154,79 @@ instance Show PatternSet where
           groupRange x n [] = if n <=3 then take n [x..]
                               else x:'-':(toEnum (pred n+fromEnum x)):[]
 
+-- | Content of @[: :]@, e.g. @"alnum"@ for @[:alnum:]@.
 newtype PatternSetCharacterClass   = PatternSetCharacterClass   {unSCC::String}
   deriving (Eq,Ord)
+
+-- | Content of @[. .]@, e.g. @"ch"@ for @[.ch.]@.
 newtype PatternSetCollatingElement = PatternSetCollatingElement {unSCE::String}
   deriving (Eq,Ord)
+
+-- | Content of @[= =]@, e.g. @"a"@ for @[=a=]@.
 newtype PatternSetEquivalenceClass = PatternSetEquivalenceClass {unSEC::String}
   deriving (Eq,Ord)
 
+-- | Hand-rolled implementation, giving textual rather than Haskell representation.
 instance Show PatternSetCharacterClass where
   showsPrec _ p = showChar '[' . showChar ':' . shows (unSCC p) . showChar ':' . showChar ']'
+
+-- | Hand-rolled implementation, giving textual rather than Haskell representation.
 instance Show PatternSetCollatingElement where
   showsPrec _ p = showChar '[' . showChar '.' . shows (unSCE p) . showChar '.' . showChar ']'
+
+-- | Hand-rolled implementation, giving textual rather than Haskell representation.
 instance Show PatternSetEquivalenceClass where
   showsPrec _ p = showChar '[' . showChar '=' . shows (unSEC p) . showChar '=' . showChar ']'
 
+-- | @decodePatternSet@ cannot handle collating element and treats
+-- equivalence classes as just their definition and nothing more.
+--
+-- @since 1.3.2
+decodePatternSet :: PatternSet -> Set Char
+decodePatternSet (PatternSet msc mscc _ msec) =
+  let baseMSC = maybe Set.empty id msc
+      withMSCC = foldl (flip Set.insert) baseMSC  (maybe [] (concatMap decodeCharacterClass . Set.toAscList) mscc)
+      withMSEC = foldl (flip Set.insert) withMSCC (maybe [] (concatMap unSEC . Set.toAscList) msec)
+  in withMSEC
+
+-- | This returns the strictly ascending list of characters
+-- represented by @[: :]@ POSIX character classes.
+-- Unrecognized class names return an empty string.
+--
+-- @since 1.3.2
+decodeCharacterClass :: PatternSetCharacterClass -> String
+decodeCharacterClass (PatternSetCharacterClass s) =
+  case s of
+    "alnum"  -> ['0'..'9']++['A'..'Z']++['a'..'z']
+    "digit"  -> ['0'..'9']
+    "punct"  -> ['\33'..'\47']++['\58'..'\64']++['\91'..'\96']++['\123'..'\126']
+    "alpha"  -> ['A'..'Z']++['a'..'z']
+    "graph"  -> ['\41'..'\126']
+    "space"  -> "\t\n\v\f\r "
+    "blank"  -> "\t "
+    "lower"  -> ['a'..'z']
+    "upper"  -> ['A'..'Z']
+    "cntrl"  -> ['\0'..'\31']++"\127" -- with NUL
+    "print"  -> ['\32'..'\126']
+    "xdigit" -> ['0'..'9']++['A'..'F']++['a'..'f']
+    "word"   -> ['0'..'9']++['A'..'Z']++"_"++['a'..'z']
+    _ -> []
+
 -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
 
 -- | Do the transformation and simplification in a single traversal.
--- This removes the PPlus, PQuest, and PBound values, changing to POr
--- and PEmpty and PStar True\/False.  For some PBound values it adds
--- PNonEmpty and PNonCapture semantic marker.  It also simplifies to
--- flatten out nested POr and PConcat instances and eliminate some
--- unneeded PEmpty values.
+-- This removes the 'PPlus', 'PQuest', and 'PBound' values, changing to 'POr'
+-- and 'PEmpty' and 'PStar'.  For some 'PBound' values it adds
+-- 'PNonEmpty' and 'PNonCapture' semantic marker.  It also simplifies to
+-- flatten out nested 'POr' and 'PConcat' instances and eliminate some
+-- unneeded 'PEmpty' values.
 starTrans :: Pattern -> Pattern
 starTrans = dfsPattern (simplify' . starTrans')
 
--- | Apply a Pattern transformation function depth first
-dfsPattern :: (Pattern -> Pattern)  -- ^ The transformation function
-           -> Pattern               -- ^ The Pattern to transform
-           -> Pattern               -- ^ The transformed Pattern
+-- | Apply a 'Pattern' transformation function depth first.
+dfsPattern :: (Pattern -> Pattern)  -- ^ The transformation function.
+           -> Pattern               -- ^ The 'Pattern' to transform.
+           -> Pattern               -- ^ The transformed 'Pattern'.
 dfsPattern f = dfs
  where unary c = f . c . dfs
        dfs pattern = case pattern of
@@ -323,7 +404,7 @@ starTrans' pIn =
     pass = pIn
 
 -- | Function to transform a pattern into an equivalent, but less
--- redundant form.  Nested 'POr' and 'PConcat' are flattened. PEmpty
+-- redundant form.  Nested 'POr' and 'PConcat' are flattened. 'PEmpty'
 -- is propagated.
 simplify' :: Pattern -> Pattern
 simplify' x@(POr _) =
@@ -345,7 +426,7 @@ simplify' (PNonCapture PEmpty) = PEmpty -- 2009, perhaps useful
 --simplify' (PNonEmpty PEmpty) = err "simplify' (PNonEmpty PEmpty) = should be Impossible!" -- 2009
 simplify' other = other
 
--- | Function to flatten nested POr or nested PConcat applicataions.
+-- | Function to flatten nested 'POr' or nested 'PConcat' applicataions.
 flatten :: Pattern -> [Pattern]
 flatten (POr ps) = (concatMap (\x -> case x of
                                        POr ps' -> ps'
@@ -359,8 +440,8 @@ notPEmpty :: Pattern -> Bool
 notPEmpty PEmpty = False
 notPEmpty _      = True
 
--- | Determines if pIn will fail or accept [] and never accept any
--- characters. Treat PCarat and PDollar as True.
+-- | Determines if 'Pattern' will fail or accept @[]@ and never accept any
+-- characters. Treat 'PCarat' and 'PDollar' as @True@.
 canOnlyMatchNull :: Pattern -> Bool
 canOnlyMatchNull pIn =
   case pIn of