CurryInfo: regexp-4.1.0 / RegExpEff

classes:

              
documentation:
------------------------------------------------------------------------------
--- An efficient library that defines a structure of regular expressions and
--- some operations for regular expressions.
--- This library is used to translate integrated code in the form
--- of POSIX extended regular expressions into efficient Curry programs.
--- The implementation is based on the paper "A Play on Regular Expressions -
--- Functional Pearl" by Fischer, Huch and Wilke (September 27–29, 2010,
--- Baltimore, Maryland, USA).
---
--- @author Corinna Wambsganz
--- @version October 2022
------------------------------------------------------------------------------
name:
RegExpEff
operations:
alt anyL bracket capture captureG conc end eps grep grepPos grepShow grepShowUnique literal match negBracket pl rep start times
sourcecode:
module RegExpEff
  ( Reg, eps, literal, alt, conc, rep, pl, anyL, bracket, negBracket
  , start, end, times, captureG
  , match, grep, grepPos, grepShow, grepShowUnique, capture
  ) 
 where

import Data.List

-- Datatype for regular expressions. The regular expression (reg) and two flags:
-- empty: true if the regular expression accepts the empty word and
-- final: true if the remaining regular expression accepts the empty word.
data Reg c s = Reg {emptyRe :: s, final :: s, cg :: [Int], reg :: Re c s}
data Re c s = Eps                       -- ε
            | Sym ((c, [Int]) -> s)     -- a
            | Alt (Reg c s) (Reg c s)   -- α|β
            | Seq (Reg c s) (Reg c s)   -- αβ
            | Rep (Reg c s)             -- α*
            | Start (Reg c s)           -- ^α
            | End (Reg c s)             -- α$
            | Times (Int,Int) (Reg c s) -- α{n,m}

-- Datatype to compute the leftmost position of a substring that
-- matches a given regular expression.
data Leftmost = NoLeft | Leftmost Startl
data Startl = NoStartl | Startl Int

-- Datatype to compute the range of a substring that matches a given regular
-- expression.
data LeftLong = NoLeftLong | LeftLong Range
data Range = NoRange | Range Int Int

-- Datatype to compute all startingpositions of substrings that match a given
-- regular expression.
data All = NoAll | All List
data List = NoList | List [Int]

-- Datatype to compute the range of all substrings that match a given regular
-- expression.
data AllRange = NoAllRange | AllRange Ranges
data Ranges = NoRanges | Ranges [(Int, Int)]

-- Datatype to compute capture groups.
data CaptureGroups = NoCaptureGroups | CaptureGroups Groups
data Groups = NoGroups | Groups ([(Int, [(Int, Int)])], [Int])

-- Eq-Instance for leftmost values.
instance Eq Leftmost where
  NoLeft     == NoLeft     = True
  Leftmost _ == Leftmost _ = True
  NoLeft     == Leftmost _ = False
  Leftmost _ == NoLeft     = False

-- Eq-Instance for leftlong values.
instance Eq LeftLong where
  NoLeftLong == NoLeftLong = True
  LeftLong _ == LeftLong _ = True
  NoLeftLong == LeftLong _ = False
  LeftLong _ == NoLeftLong = False

-- Eq-Instance for all values.
instance Eq All where
  NoAll == NoAll = True
  All _ == All _ = True
  NoAll == All _ = False
  All _ == NoAll = False

-- Eq-Instance for allRange values.
instance Eq AllRange where
  NoAllRange == NoAllRange = True
  AllRange _ == AllRange _ = True
  NoAllRange == AllRange _ = False
  AllRange _ == NoAllRange = False

-- Eq-Instance for captureGroup values.
instance Eq CaptureGroups where
  NoCaptureGroups == NoCaptureGroups = True
  CaptureGroups _ == CaptureGroups _ = True
  NoCaptureGroups == CaptureGroups _ = False
  CaptureGroups _ == NoCaptureGroups = False

-- A semiring is an algebraic structure with zero, one, addition, and
-- multiplication that satisfies certain laws.
class Semiring s where
 zero, one :: s
 (+) :: s -> s -> s
 (*) :: s -> s -> s

-- Semiring-Instance for boolean values.
instance Semiring Bool where
 zero = False
 one  = True
 (+)  = (||)
 (*)  = (&&)

-- Semiring-Instance for integer values.
instance Semiring Int where
 zero   = 0
 one    = 1
 (+) = (Prelude.+)
 (*) = (Prelude.*)

-- Semiring-Instance for leftmost values.
instance Semiring Leftmost where
 zero = NoLeft
 one = Leftmost NoStartl
 NoLeft     + NoLeft     = NoLeft
 Leftmost x + NoLeft     = Leftmost x
 NoLeft     + Leftmost x = Leftmost x
 Leftmost x + Leftmost y = Leftmost (leftmost x y)
  where leftmost NoStartl   NoStartl   = NoStartl
        leftmost NoStartl   (Startl i) = Startl i
        leftmost (Startl i) NoStartl   = Startl i
        leftmost (Startl i) (Startl j) = Startl (min i j)
 NoLeft     * NoLeft     = NoLeft
 NoLeft     * Leftmost _ = NoLeft
 Leftmost _ * NoLeft     = NoLeft
 Leftmost x * Leftmost y = Leftmost (startl x y)
  where startl NoStartl    s = s
        startl (Startl i)  _ = Startl i

-- Semiring-Instance for leftlong values.
instance Semiring LeftLong where
 zero = NoLeftLong
 one  = LeftLong NoRange
 NoLeftLong + NoLeftLong = NoLeftLong
 NoLeftLong + LeftLong x = LeftLong x
 LeftLong x + NoLeftLong = LeftLong x
 LeftLong x + LeftLong y = LeftLong (leftlong x y)
  where leftlong NoRange     NoRange     = NoRange
        leftlong NoRange     (Range i j) = Range i j
        leftlong (Range i j) NoRange     = Range i j
        leftlong (Range i j) (Range k l) | i < k || i == k && j >= l = Range i j
                                         | otherwise                 = Range k l
 NoLeftLong * NoLeftLong = NoLeftLong
 LeftLong _ * NoLeftLong = NoLeftLong
 NoLeftLong * LeftLong _ = NoLeftLong
 LeftLong x * LeftLong y = LeftLong (range x y)
  where range NoRange     NoRange     = NoRange
        range (Range i j) NoRange     = Range i j
        range NoRange     (Range i j) = Range i j
        range (Range i _) (Range _ j) = Range i j

-- Semiring-Instance for all values.
instance Semiring All where
 zero = NoAll
 one  = All NoList
 NoAll + NoAll = NoAll
 NoAll + All l = All l
 All l + NoAll = All l
 All x + All y = All (allList x y)
  where allList NoList   NoList   = NoList
        allList NoList   (List l) = List l
        allList (List l) NoList   = List l
        allList (List i) (List j) = List (j ++ i)
 NoAll * NoAll = NoAll
 NoAll * All _ = NoAll
 All _ * NoAll = NoAll
 All x * All y = All (list0 x y)
  where list0 NoList   NoList   = NoList
        list0 (List i) NoList   = List i
        list0 NoList   (List i) = List i
        list0 (List i) (List _) = List i

-- Semiring-Instance for allRange values.
instance Semiring AllRange where
 zero = NoAllRange
 one  = AllRange NoRanges
 NoAllRange + NoAllRange = NoAllRange
 NoAllRange + AllRange l = AllRange l
 AllRange l + NoAllRange = AllRange l
 AllRange x + AllRange y = AllRange (allrange x y)
  where allrange NoRanges   NoRanges   = NoRanges
        allrange NoRanges   (Ranges l) = Ranges l
        allrange (Ranges l) NoRanges   = Ranges l
        allrange (Ranges i) (Ranges j) = Ranges (j++i)
 NoAllRange * NoAllRange = NoAllRange
 NoAllRange * AllRange _ = NoAllRange
 AllRange _ * NoAllRange = NoAllRange
 AllRange x * AllRange y = AllRange (allr x y)
  where
    allr NoRanges   NoRanges = NoRanges
    allr (Ranges l) NoRanges   = Ranges l
    allr NoRanges   (Ranges l) = Ranges l
    allr (Ranges i) (Ranges j) = case i of
      []    -> Ranges j
      list1 -> case j of
        ((_,d) : js) -> Ranges (take ((length list1)-1) i ++
          [(fst (last list1),d)] ++ js)
        []           -> Ranges list1

-- Semiring-Instance for captureGroup values.
instance Semiring CaptureGroups where
   zero = NoCaptureGroups
   one  = CaptureGroups NoGroups
   NoCaptureGroups + NoCaptureGroups = NoCaptureGroups
   NoCaptureGroups + CaptureGroups l = CaptureGroups l
   CaptureGroups l + NoCaptureGroups = CaptureGroups l
   CaptureGroups x + CaptureGroups y = CaptureGroups (cgs x y)
    where
      cgs NoGroups   NoGroups   = NoGroups
      cgs (Groups l) NoGroups   = Groups l
      cgs NoGroups   (Groups l) = Groups l
      cgs (Groups (i, l1)) (Groups (j, l2)) = Groups ((conca i j),nub (l1++ l2))
      conca i j = case i of
        [] -> j
        ((n, s) : xs) -> case filter (\(nj, _) -> n == nj) j of
          [] -> (n, s) : conca xs j
          ((_, sj) : _) -> (n, s ++ sj) : conca xs
            (filter (\(nj, _) ->  not (n == nj)) j)
   NoCaptureGroups * NoCaptureGroups = NoCaptureGroups
   NoCaptureGroups * CaptureGroups _ = NoCaptureGroups
   CaptureGroups _ * NoCaptureGroups = NoCaptureGroups
   CaptureGroups x * CaptureGroups y = CaptureGroups (cgsm x y)
    where cgsm NoGroups   NoGroups   = NoGroups
          cgsm (Groups l) NoGroups   = Groups l
          cgsm NoGroups   (Groups l) = Groups l
          cgsm (Groups (i, l1)) (Groups (j, _)) = Groups ((comp i j l1), l1)
          comp i j l = case i of
            []           -> j
            ((n,s) : xs) -> case filter (\(nj,_) -> n == nj) j of
                []            -> (n,s) : comp xs j l
                ((_, sj) : _) -> (n, if elem n l then comb s sj else s ++ sj) :
                  comp xs (filter (\(nj, _) -> not (n == nj)) j) l
          comb _ []            = []
          comb s ((_, b) : xs) = map (\(a, _) -> (a, b)) s ++ comb s xs

-- Extends the semiring class with an index.
class Semiring s => Semiringi s where
 index :: Int -> s

-- Semiringi-Instance for leftmost values.
instance Semiringi Leftmost where
 index = Leftmost . Startl

-- Semiringi-Instance for leftlong values.
instance Semiringi LeftLong where
 index i = LeftLong (Range i i)

-- Semiringi-Instance for all values.
instance Semiringi All where
 index i = All (List [i])

--Semiringi-Instance for allRange values.
instance Semiringi AllRange where
 index i = AllRange (Ranges [(i,i)])

-- Extends the semiring class with a list.
class Semiring s => Semiringc s where
  list :: Int -> [Int] -> s

-- Semiringc-Instance for captureGroup values.
instance Semiringc CaptureGroups where
  list i l = CaptureGroups (Groups ((map (\n -> (n, [(i, i)])) l), l))

-- Semiringc-Instance for all values.
instance Semiringc All where
  list i _ = All (List [i])

-- Semiringc-Instance for allRange values.
instance Semiringc AllRange where
  list i _ = AllRange (Ranges [(i,i)])

-- Class with functions needed for the implementation of capture groups.
class CGFunction s where
  newcg :: s -> [Int] -> s
  justr :: s -> [Int] -> s

instance CGFunction Bool where
  newcg s _ = s
  justr = newcg

instance CGFunction Int where
  newcg s _ = s
  justr = newcg

instance CGFunction Leftmost where
  newcg s _ = s
  justr = newcg

instance CGFunction LeftLong where
  newcg s _ = s
  justr = newcg

instance CGFunction All where
  newcg s _ = s
  justr = newcg

instance CGFunction AllRange where
  newcg s _ = s
  justr = newcg

instance CGFunction CaptureGroups where
  newcg (NoCaptureGroups)               _ = NoCaptureGroups
  newcg (CaptureGroups (NoGroups))      _ = CaptureGroups (NoGroups)
  newcg (CaptureGroups (Groups (g, _))) l = CaptureGroups (Groups (g, l))
  justr (NoCaptureGroups)               _ = NoCaptureGroups
  justr (CaptureGroups (NoGroups))      _ = CaptureGroups (NoGroups)
  justr (CaptureGroups (Groups (g, _))) l = CaptureGroups (Groups
    (map (\(n, nl) -> if elem n l then (n, nl) else ((-abs(n)), nl)) g, l))

-- Functions for initializing a regex.
-- ε
eps :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s
eps l = Reg {emptyRe = one, final = zero, cg = l, reg = Eps}

--a
literal :: (Semiringc s, CGFunction s, Eq s) => [Int] -> Char
  -> Reg (Int, Char) s
literal l a = symc l (==a)

-- a
sym :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> ((c, [Int]) -> s)
  -> Reg c s
sym l f = Reg {emptyRe = zero, final = zero, cg = l, reg = Sym f}

-- a with index
symi :: (Semiringi s, CGFunction s, Eq s)=> [Int] -> Char -> Reg (Int, Char) s
symi l c = sym l weight
 where weight ((pos,x), _) | x == c    = index pos
                           | otherwise = zero

-- a with list
symc :: (Semiringc s, CGFunction s, Eq s)=> [Int] -> (Char -> Bool)
  -> Reg (Int, Char) s
symc l f = sym l captureList
 where captureList ((pos, x), li) | f x       = list pos li
                                  | otherwise = zero

-- α|β
alt :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s -> Reg c s
  -> Reg c s
alt l p q = Reg {emptyRe = newcg (emptyRe p + emptyRe q) l,
  final = newcg (final p + final q) l, cg = l, reg = Alt p q}

-- αβ
conc :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s -> Reg c s
  -> Reg c s
conc l p q = Reg {emptyRe = emptyRe p * emptyRe q,
  final = final p * emptyRe q + final q, cg = l, reg = Seq p q}

-- α*
rep :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s -> Reg c s
rep l r = Reg {emptyRe = one, final = newcg (final r) l, cg = l,
  reg = Rep r}

-- α⁺
pl :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s -> Reg c s
pl l r = conc l r (rep l r)

-- _
anyL :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s
anyL l = sym l (\_ -> one)

-- [as]
bracket :: (Semiringc s, CGFunction s, Eq s) => [Int] ->
  [Either Char (Char, Char)] -> Reg (Int, Char) s
bracket li l = let l' = map lift l
                   f  = foldl (\g h -> (\c -> (g c) || (h c))) (head l')
                     (tail l')
               in symc li (\c -> f c)

-- [^as]
negBracket :: (Semiringc s, CGFunction s, Eq s) => [Int] ->
  [Either Char (Char, Char)] -> Reg (Int, Char) s
negBracket li l = let l' = map negLift l
                      f  = foldl (\g h -> (\c -> (g c) && (h c)))(head l')
                        (tail l')
                  in symc li (\c -> f c)

-- ^α
start :: (Semiringc s, CGFunction s, Eq s) => [Int] -> Reg (Int, Char) s -> Bool
  -> Reg (Int, Char) s
start l r True  = Reg {emptyRe = emptyRe r, final = newcg (final r) l, cg = l,
   reg = Start r}
start l r False = conc l (Reg {emptyRe = emptyRe r, final = newcg (final r) l,
  cg = l, reg = Start r}) (rep l (symc l (\_ -> True)))

-- α$
end :: (Semiringc s, CGFunction s, Eq s) => [Int] -> Reg (Int, Char) s -> Bool
  -> Reg (Int, Char) s
end l r True  = Reg {emptyRe = emptyRe r, final = newcg (final r) l, cg = l,
  reg = End r}
end l r False = conc l (rep l (symc l (\_ -> True))) (Reg {emptyRe = emptyRe r,
  final = newcg (final r) l, cg = l, reg = End r})

-- α{n,m}
times :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Int -> Int ->
  Reg c s -> Reg c s
times l n m r =
  if m < n || n < 0
    then Reg {emptyRe = zero, final = zero, cg = l, reg = Times (n,m) r}
    else if n == 0
           then Reg {emptyRe = one, final = final r, cg = l, reg =
             Times (n,m) r}
           else Reg {emptyRe = emptyRe r, final = final r * time n, cg = l,
             reg = Times (n,m) r}
             where time k = if k == 0 then one else zero

-- /(α)/
captureG :: (Semiring s, CGFunction s, Ord c, Eq s) => Int -> Reg c s -> Reg c s
captureG _ r = r


-- For Bracket.
lift :: Either Char (Char,Char) -> Char -> Bool
lift (Left a)      = \c -> c == a
lift (Right (a,b)) = \c -> (c >= a && c <= b)

-- For NegBracket.
negLift :: Either Char (Char,Char) -> Char -> Bool
negLift a c = not (lift a c)

-- The match function is used to match lists with regular expressions
-- efficient.
--- @param r - The regular expression
--- @param l - The input list
--- @result Semiring.one if matched else Semiring.zero
matchEff :: (Semiring s, CGFunction s, Ord c, Eq s) => Reg c s -> [c] -> s
matchEff r []     = emptyRe r
matchEff r (c:cs) = final (foldl (shift zero) (shift one r c) cs)

--- The shift function is used within the match function.
--- It marks (sets final) the right positions in the regular expression.
--- @param m - The mark
--- @param r - The regular expression
--- @param c - The next literal (of the input list)
--- @result reg - The marked regular expression
shift :: (Semiring s, CGFunction s, Ord c, Eq s) => s -> Reg c s -> c -> Reg c s
shift m r c = case reg r of
  Eps              -> eps (cg r)
  (Sym f)          -> (sym (cg r) f){final = (newcg (m * (f (c, cg r))) (cg r))}
  (Alt p q)        -> alt (cg r) (shift m p c) (shift m q c)
  (Seq p q)        -> conc (cg r) (shift m p c) (shift (m * emptyRe p + final p)
    q c)
  (Rep re)         -> rep (cg r) (shift (newcg (m + justr (final re) (cg r))
    (cg r)) re c)
  (Start re)       -> shift m re c
  (End re)         -> shift m re c
  (Times (i,j) re) -> let nre = (shift (newcg (m + justr (final re) (cg r))
                                 (cg r)) re c)
                      in times (cg r) (max (if (final nre == one) then i-1
                        else i) 0) (if (final nre == one) then j-1 else j) nre

--- The submatch function is used to check if a substring matches the regular
--- expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result Semiring.one with indices if matched else Semiring.zero
submatch :: (Semiring s, CGFunction s, Ord c, Eq s) => Reg (Int, c) s -> [c]
  -> s
submatch r s = matchEff (conc [] arb (conc [] r arb)) (zip [0..] s)
 where arb = rep [] (sym [] (\_ -> one))

--- The grep function returns efficient a list with startingpositions of
--- substrings that match the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result l - The list of startingpositions of matching substrings
grep :: Ord a => Reg (Int, a) All -> [a] -> [Int]
grep re s = case submatch re s of
                 All (List l) -> l
                 _            -> []

--- The grepPos function returns efficient the first startingposition of a
--- substring which matches the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result n - The startindex of the first substring that matches,
--- -1 if no substring matches
grepPos :: Ord a => Reg (Int, a) All -> [a] -> Int
grepPos re s = case grep re s of
                    []  -> -1
                    pos -> head pos

--- The grepShow function returns efficient a list of substrings that match
--- the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @return l - The list of substrings from s that match r
grepShow :: Ord a => Reg (Int, a) AllRange -> [a] -> [[a]]
grepShow re s = case submatch re s of
  AllRange (Ranges list0) -> map (\(i,j) -> drop i (take (j Prelude.+1)s)) list0
  _                       -> []

-- As grepShow but without doubles.
grepShowUnique :: Ord a => Reg (Int, a) AllRange -> [a] -> [[a]]
grepShowUnique re s = nub (grepShow re s)

--- The capture function is used to return capture groups efficient.
--- @param r - The regular expression
--- @param s - The input list
--- @result l - The list with the capture groups and
--- the respectively matching substrings
capture :: Ord c => Reg (Int, c) CaptureGroups -> [c] -> [(Int, [[c]])]
capture r s = case (matchEff r (zip [0.. ] s)) of
  CaptureGroups (Groups (l, _)) -> combine $ map (\(n, li) -> (n, (map (\(i, j)
    -> drop i (take (j Prelude.+ 1) s)) li))) l
  _                             -> []

-- Combines the strings of numbers with the same absolute value.
combine :: Ord c => [(Int, [[c]])] -> [(Int, [[c]])]
combine []              = []
combine ((n1, s1) : ns) = case filter (\(i, _) -> abs(i) == abs(n1)) ns of
    [] -> (abs(n1), s1) : combine ns
    li -> (abs(n1), foldr (++) [] $ map (\(_, s2) -> s2) ((n1, s1) : li)):
      combine (filter (\(i, _) -> not $ abs(i) == abs(n1)) ns)

-- Match with capture groups.
match :: Ord c => Reg (Int, c) CaptureGroups -> [c] -> Bool
match r s = let l = capture r s
            in if l == []
                 then False
                 else if (snd $ head l) == []
                        then False
                        else (head $ snd $ head l) == s
types:
Reg
unsafe:
safe