Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
For the package version policy (PVP), see http://pvp.haskell.org/faq .

### 1.3.2

_2022-07-18, Andreas Abel_

- Export `decodePatternSet` and `decodeCharacterClass` from `Text.Regex.TDFA.Pattern`
([#16](https://github.com/haskell-hvr/regex-tdfa/issues/16))
- Extend and correct docs for `Pattern` module
- Tested with GHC 7.4 - 9.4

### 1.3.1.5

_2022-07-18, Andreas Abel_
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ The name "tdfa" stands for Tagged-DFA.
[Declare a dependency](https://www.haskell.org/cabal/users-guide/developing-packages.html#pkg-field-build-depends) on the `regex-tdfa` library in your `.cabal` file:

```
build-depends: regex-tdfa ^>= 1.3.1
build-depends: regex-tdfa ^>= 1.3.2
```

In Haskell modules where you need to use regexes `import` the respective `regex-tdfa` module:
Expand Down
2 changes: 1 addition & 1 deletion lib/Text/Regex/TDFA.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ OS's bugs.

Declare a dependency on the @regex-tdfa@ library in your @.cabal@ file:

> build-depends: regex-tdfa ^>= 1.3.1.1
> build-depends: regex-tdfa ^>= 1.3.2

In Haskell modules where you want to use regexes simply @import@ /this/ module:

Expand Down
12 changes: 6 additions & 6 deletions lib/Text/Regex/TDFA/Common.hs
Original file line number Diff line number Diff line change
Expand Up @@ -109,26 +109,26 @@ data ExecOption = ExecOption {
captureGroups :: Bool -- ^ True by default. Set to False to improve speed (and space).
} deriving (Read,Show)

-- | Used by implementation to name certain Postions during
-- matching. Identity of Position tag to set during a transition.
-- | Used by implementation to name certain 'Postion's during
-- matching. Identity of 'Position' tag to set during a transition.
type Tag = Int

-- | Internal use to indicate type of tag and preference for larger or smaller Positions.
-- | Internal use to indicate type of tag and preference for larger or smaller 'Position's.
data OP = Maximize | Minimize | Orbit | Ignore deriving (Eq,Show)

-- | Internal NFA node identity number.
type Index = Int

-- | Internal DFA identity is this Set of NFA Index.
-- | Internal DFA identity is this 'Set' of NFA 'Index'.
type SetIndex = IntSet {- Index -}

-- | Index into the text being searched.
type Position = Int

-- | GroupIndex is for indexing submatches from capturing parenthesized groups ('PGroup' or 'Group').
-- | 'GroupIndex' is for indexing submatches from capturing parenthesized groups ('PGroup' or 'Group').
type GroupIndex = Int

-- | GroupInfo collects the parent and tag information for an instance of a group.
-- | 'GroupInfo' collects the parent and tag information for an instance of a group.
data GroupInfo = GroupInfo {
thisIndex, parentIndex :: GroupIndex
, startTag, stopTag, flagTag :: Tag
Expand Down
173 changes: 127 additions & 46 deletions lib/Text/Regex/TDFA/Pattern.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

-- | This "Text.Regex.TDFA.Pattern" module provides the 'Pattern' data
-- type and its subtypes. This 'Pattern' type is used to represent
-- the parsed form of a Regular Expression.
-- the parsed form of a regular expression.

module Text.Regex.TDFA.Pattern
(Pattern(..)
Expand All @@ -12,52 +12,82 @@ module Text.Regex.TDFA.Pattern
,PatternSetEquivalenceClass(..)
,GroupIndex
,DoPa(..)
,decodeCharacterClass, decodePatternSet
,showPattern
-- ** Internal use
,starTrans
-- ** Internal use, Operations to support debugging under ghci
-- ** Internal use, operations to support debugging under @ghci@
,starTrans',simplify',dfsPattern
) where

{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}

import Data.List(intersperse,partition)
import qualified Data.Set as Set(toAscList,toList)
import Data.Set(Set) -- XXX EnumSet
import qualified Data.Set as Set
import Data.Set (Set)
import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error)

err :: String -> a
err = common_error "Text.Regex.TDFA.Pattern"

-- | Pattern is the type returned by the regular expression parser.
-- This is consumed by the CorePattern module and the tender leaves
-- are nibbled by the TNFA module.
data Pattern = PEmpty
| PGroup (Maybe GroupIndex) Pattern -- Nothing to indicate non-matching PGroup (Nothing never used!)
| POr [Pattern] -- flattened by starTrans
| PConcat [Pattern] -- flattened by starTrans
| PQuest Pattern -- eliminated by starTrans
| PPlus Pattern -- eliminated by starTrans
| PStar Bool Pattern -- True means mayFirstBeNull is True
| PBound Int (Maybe Int) Pattern -- eliminated by starTrans
-- The rest of these need an index of where in the regex string it is from
| PCarat {getDoPa::DoPa}
| PDollar {getDoPa::DoPa}
-- The following test and accept a single character
| PDot {getDoPa::DoPa} -- Any character (newline?) at all
| PAny {getDoPa::DoPa,getPatternSet::PatternSet} -- Square bracketed things
| PAnyNot {getDoPa::DoPa,getPatternSet::PatternSet} -- Inverted square bracketed things
| PEscape {getDoPa::DoPa,getPatternChar::Char} -- Backslashed Character
| PChar {getDoPa::DoPa,getPatternChar::Char} -- Specific Character
-- The following are semantic tags created in starTrans, not the parser
| PNonCapture Pattern -- introduced by starTrans
| PNonEmpty Pattern -- introduced by starTrans
deriving (Eq,Show)

-- | I have not been checking, but this should have the property that
-- parsing the resulting string should result in an identical Pattern.
-- This is not true if starTrans has created PNonCapture and PNonEmpty
-- values or a (PStar False). The contents of a "[ ]" grouping are
-- | 'Pattern' is the type returned by the regular expression parser 'parseRegex'.
-- This is consumed by the "Text.Regex.TDFA.CorePattern" module and the tender leaves
-- are nibbled by the "Text.Regex.TDFA.TNFA" module.
--
-- The 'DoPa' field is the index of the component in the regex string @r@.
data Pattern
= PEmpty
-- ^ @()@, matches the empty string.
| PGroup (Maybe GroupIndex) Pattern
-- ^ Group @(r)@. @Nothing@ indicates non-matching 'PGroup'
-- (never produced by parser 'parseRegex').
| POr [Pattern]
-- ^ Alternative @r|s@ (flattened by 'starTrans').
| PConcat [Pattern]
-- ^ Sequence @rs@ (flattened by 'starTrans').
| PQuest Pattern
-- ^ Zero or one repetitions @r?@ (eliminated by 'starTrans').
| PPlus Pattern
-- ^ One or more repetitions @r+@ (eliminated by 'starTrans').
| PStar Bool Pattern
-- ^ Zero or more repetitions @r*@.
-- @True@ (default) means may accept the empty string on its first iteration.
| PBound Int (Maybe Int) Pattern
-- ^ Given number or repetitions @r{n}@ or @r{n,m}@
-- (eliminated by 'starTrans').

-- The rest of these need an index of where in the regex string it is from
| PCarat { getDoPa :: DoPa }
-- ^ @^@ matches beginning of input.
| PDollar { getDoPa :: DoPa }
-- ^ @$@ matches end of input.

-- The following test and accept a single character
| PDot { getDoPa :: DoPa }
-- ^ @.@ matches any character.
| PAny { getDoPa :: DoPa, getPatternSet :: PatternSet }
-- ^ Bracket expression @[...]@.
| PAnyNot { getDoPa :: DoPa, getPatternSet :: PatternSet }
-- ^ Inverted bracket expression @[^...]@.
| PEscape { getDoPa :: DoPa, getPatternChar :: Char }
-- ^ Backslashed character @\c@, may have special meaning.
| PChar { getDoPa :: DoPa, getPatternChar :: Char }
-- ^ Single character, matches given character.

-- The following are semantic tags created in starTrans, not the parser
| PNonCapture Pattern
-- ^ Tag for internal use, introduced by 'starTrans'.
| PNonEmpty Pattern
-- ^ Tag for internal use, introduced by 'starTrans'.
deriving (Eq, Show)

-- Andreas Abel, 2022-07-18, issue #47:
-- The following claim is FALSE:
--
-- I have not been checking, but this should have the property that
-- parsing the resulting string should result in an identical 'Pattern'.
-- This is not true if 'starTrans' has created 'PNonCapture' and 'PNonEmpty'
-- values or a @'PStar' False@. The contents of a @[...]@ grouping are
-- always shown in a sorted canonical order.
showPattern :: Pattern -> String
showPattern pIn =
Expand Down Expand Up @@ -92,12 +122,19 @@ showPattern pIn =
-}
paren s = ('(':s)++")"

-- | Content of a bracket expression @[...]@ organized into
-- characters,
-- POSIX character classes (e.g. @[[:alnum:]]@),
-- collating elements (e.g. @[.ch.]@, unused), and
-- equivalence classes (e.g. @[=a=]@, treated as characters).
--
data PatternSet = PatternSet (Maybe (Set Char))
(Maybe (Set PatternSetCharacterClass))
(Maybe (Set PatternSetCollatingElement))
(Maybe (Set PatternSetEquivalenceClass))
deriving (Eq)

-- | Hand-rolled implementation, giving textual rather than Haskell representation.
instance Show PatternSet where
showsPrec i (PatternSet s scc sce sec) =
let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s
Expand All @@ -117,35 +154,79 @@ instance Show PatternSet where
groupRange x n [] = if n <=3 then take n [x..]
else x:'-':(toEnum (pred n+fromEnum x)):[]

-- | Content of @[: :]@, e.g. @"alnum"@ for @[:alnum:]@.
newtype PatternSetCharacterClass = PatternSetCharacterClass {unSCC::String}
deriving (Eq,Ord)

-- | Content of @[. .]@, e.g. @"ch"@ for @[.ch.]@.
newtype PatternSetCollatingElement = PatternSetCollatingElement {unSCE::String}
deriving (Eq,Ord)

-- | Content of @[= =]@, e.g. @"a"@ for @[=a=]@.
newtype PatternSetEquivalenceClass = PatternSetEquivalenceClass {unSEC::String}
deriving (Eq,Ord)

-- | Hand-rolled implementation, giving textual rather than Haskell representation.
instance Show PatternSetCharacterClass where
showsPrec _ p = showChar '[' . showChar ':' . shows (unSCC p) . showChar ':' . showChar ']'

-- | Hand-rolled implementation, giving textual rather than Haskell representation.
instance Show PatternSetCollatingElement where
showsPrec _ p = showChar '[' . showChar '.' . shows (unSCE p) . showChar '.' . showChar ']'

-- | Hand-rolled implementation, giving textual rather than Haskell representation.
instance Show PatternSetEquivalenceClass where
showsPrec _ p = showChar '[' . showChar '=' . shows (unSEC p) . showChar '=' . showChar ']'

-- | @decodePatternSet@ cannot handle collating element and treats
-- equivalence classes as just their definition and nothing more.
--
-- @since 1.3.2
decodePatternSet :: PatternSet -> Set Char
decodePatternSet (PatternSet msc mscc _ msec) =
let baseMSC = maybe Set.empty id msc
withMSCC = foldl (flip Set.insert) baseMSC (maybe [] (concatMap decodeCharacterClass . Set.toAscList) mscc)
withMSEC = foldl (flip Set.insert) withMSCC (maybe [] (concatMap unSEC . Set.toAscList) msec)
in withMSEC

-- | This returns the strictly ascending list of characters
-- represented by @[: :]@ POSIX character classes.
-- Unrecognized class names return an empty string.
--
-- @since 1.3.2
decodeCharacterClass :: PatternSetCharacterClass -> String
decodeCharacterClass (PatternSetCharacterClass s) =
case s of
"alnum" -> ['0'..'9']++['A'..'Z']++['a'..'z']
"digit" -> ['0'..'9']
"punct" -> ['\33'..'\47']++['\58'..'\64']++['\91'..'\96']++['\123'..'\126']
"alpha" -> ['A'..'Z']++['a'..'z']
"graph" -> ['\41'..'\126']
"space" -> "\t\n\v\f\r "
"blank" -> "\t "
"lower" -> ['a'..'z']
"upper" -> ['A'..'Z']
"cntrl" -> ['\0'..'\31']++"\127" -- with NUL
"print" -> ['\32'..'\126']
"xdigit" -> ['0'..'9']++['A'..'F']++['a'..'f']
"word" -> ['0'..'9']++['A'..'Z']++"_"++['a'..'z']
_ -> []

-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==

-- | Do the transformation and simplification in a single traversal.
-- This removes the PPlus, PQuest, and PBound values, changing to POr
-- and PEmpty and PStar True\/False. For some PBound values it adds
-- PNonEmpty and PNonCapture semantic marker. It also simplifies to
-- flatten out nested POr and PConcat instances and eliminate some
-- unneeded PEmpty values.
-- This removes the 'PPlus', 'PQuest', and 'PBound' values, changing to 'POr'
-- and 'PEmpty' and 'PStar'. For some 'PBound' values it adds
-- 'PNonEmpty' and 'PNonCapture' semantic marker. It also simplifies to
-- flatten out nested 'POr' and 'PConcat' instances and eliminate some
-- unneeded 'PEmpty' values.
starTrans :: Pattern -> Pattern
starTrans = dfsPattern (simplify' . starTrans')

-- | Apply a Pattern transformation function depth first
dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function
-> Pattern -- ^ The Pattern to transform
-> Pattern -- ^ The transformed Pattern
-- | Apply a 'Pattern' transformation function depth first.
dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function.
-> Pattern -- ^ The 'Pattern' to transform.
-> Pattern -- ^ The transformed 'Pattern'.
dfsPattern f = dfs
where unary c = f . c . dfs
dfs pattern = case pattern of
Expand Down Expand Up @@ -323,7 +404,7 @@ starTrans' pIn =
pass = pIn

-- | Function to transform a pattern into an equivalent, but less
-- redundant form. Nested 'POr' and 'PConcat' are flattened. PEmpty
-- redundant form. Nested 'POr' and 'PConcat' are flattened. 'PEmpty'
-- is propagated.
simplify' :: Pattern -> Pattern
simplify' x@(POr _) =
Expand All @@ -345,7 +426,7 @@ simplify' (PNonCapture PEmpty) = PEmpty -- 2009, perhaps useful
--simplify' (PNonEmpty PEmpty) = err "simplify' (PNonEmpty PEmpty) = should be Impossible!" -- 2009
simplify' other = other

-- | Function to flatten nested POr or nested PConcat applicataions.
-- | Function to flatten nested 'POr' or nested 'PConcat' applicataions.
flatten :: Pattern -> [Pattern]
flatten (POr ps) = (concatMap (\x -> case x of
POr ps' -> ps'
Expand All @@ -359,8 +440,8 @@ notPEmpty :: Pattern -> Bool
notPEmpty PEmpty = False
notPEmpty _ = True

-- | Determines if pIn will fail or accept [] and never accept any
-- characters. Treat PCarat and PDollar as True.
-- | Determines if 'Pattern' will fail or accept @[]@ and never accept any
-- characters. Treat 'PCarat' and 'PDollar' as @True@.
canOnlyMatchNull :: Pattern -> Bool
canOnlyMatchNull pIn =
case pIn of
Expand Down
Loading