sourcecode:
|
module RegExpEff
( Reg, eps, literal, alt, conc, rep, pl, anyL, bracket, negBracket
, start, end, times, captureG
, match, grep, grepPos, grepShow, grepShowUnique, capture
)
where
import Data.List
-- Datatype for regular expressions. The regular expression (reg) and two flags:
-- empty: true if the regular expression accepts the empty word and
-- final: true if the remaining regular expression accepts the empty word.
data Reg c s = Reg {emptyRe :: s, final :: s, cg :: [Int], reg :: Re c s}
data Re c s = Eps -- ε
| Sym ((c, [Int]) -> s) -- a
| Alt (Reg c s) (Reg c s) -- α|β
| Seq (Reg c s) (Reg c s) -- αβ
| Rep (Reg c s) -- α*
| Start (Reg c s) -- ^α
| End (Reg c s) -- α$
| Times (Int,Int) (Reg c s) -- α{n,m}
-- Datatype to compute the leftmost position of a substring that
-- matches a given regular expression.
data Leftmost = NoLeft | Leftmost Startl
data Startl = NoStartl | Startl Int
-- Datatype to compute the range of a substring that matches a given regular
-- expression.
data LeftLong = NoLeftLong | LeftLong Range
data Range = NoRange | Range Int Int
-- Datatype to compute all startingpositions of substrings that match a given
-- regular expression.
data All = NoAll | All List
data List = NoList | List [Int]
-- Datatype to compute the range of all substrings that match a given regular
-- expression.
data AllRange = NoAllRange | AllRange Ranges
data Ranges = NoRanges | Ranges [(Int, Int)]
-- Datatype to compute capture groups.
data CaptureGroups = NoCaptureGroups | CaptureGroups Groups
data Groups = NoGroups | Groups ([(Int, [(Int, Int)])], [Int])
-- Eq-Instance for leftmost values.
instance Eq Leftmost where
NoLeft == NoLeft = True
Leftmost _ == Leftmost _ = True
NoLeft == Leftmost _ = False
Leftmost _ == NoLeft = False
-- Eq-Instance for leftlong values.
instance Eq LeftLong where
NoLeftLong == NoLeftLong = True
LeftLong _ == LeftLong _ = True
NoLeftLong == LeftLong _ = False
LeftLong _ == NoLeftLong = False
-- Eq-Instance for all values.
instance Eq All where
NoAll == NoAll = True
All _ == All _ = True
NoAll == All _ = False
All _ == NoAll = False
-- Eq-Instance for allRange values.
instance Eq AllRange where
NoAllRange == NoAllRange = True
AllRange _ == AllRange _ = True
NoAllRange == AllRange _ = False
AllRange _ == NoAllRange = False
-- Eq-Instance for captureGroup values.
instance Eq CaptureGroups where
NoCaptureGroups == NoCaptureGroups = True
CaptureGroups _ == CaptureGroups _ = True
NoCaptureGroups == CaptureGroups _ = False
CaptureGroups _ == NoCaptureGroups = False
-- A semiring is an algebraic structure with zero, one, addition, and
-- multiplication that satisfies certain laws.
class Semiring s where
zero, one :: s
(+) :: s -> s -> s
(*) :: s -> s -> s
-- Semiring-Instance for boolean values.
instance Semiring Bool where
zero = False
one = True
(+) = (||)
(*) = (&&)
-- Semiring-Instance for integer values.
instance Semiring Int where
zero = 0
one = 1
(+) = (Prelude.+)
(*) = (Prelude.*)
-- Semiring-Instance for leftmost values.
instance Semiring Leftmost where
zero = NoLeft
one = Leftmost NoStartl
NoLeft + NoLeft = NoLeft
Leftmost x + NoLeft = Leftmost x
NoLeft + Leftmost x = Leftmost x
Leftmost x + Leftmost y = Leftmost (leftmost x y)
where leftmost NoStartl NoStartl = NoStartl
leftmost NoStartl (Startl i) = Startl i
leftmost (Startl i) NoStartl = Startl i
leftmost (Startl i) (Startl j) = Startl (min i j)
NoLeft * NoLeft = NoLeft
NoLeft * Leftmost _ = NoLeft
Leftmost _ * NoLeft = NoLeft
Leftmost x * Leftmost y = Leftmost (startl x y)
where startl NoStartl s = s
startl (Startl i) _ = Startl i
-- Semiring-Instance for leftlong values.
instance Semiring LeftLong where
zero = NoLeftLong
one = LeftLong NoRange
NoLeftLong + NoLeftLong = NoLeftLong
NoLeftLong + LeftLong x = LeftLong x
LeftLong x + NoLeftLong = LeftLong x
LeftLong x + LeftLong y = LeftLong (leftlong x y)
where leftlong NoRange NoRange = NoRange
leftlong NoRange (Range i j) = Range i j
leftlong (Range i j) NoRange = Range i j
leftlong (Range i j) (Range k l) | i < k || i == k && j >= l = Range i j
| otherwise = Range k l
NoLeftLong * NoLeftLong = NoLeftLong
LeftLong _ * NoLeftLong = NoLeftLong
NoLeftLong * LeftLong _ = NoLeftLong
LeftLong x * LeftLong y = LeftLong (range x y)
where range NoRange NoRange = NoRange
range (Range i j) NoRange = Range i j
range NoRange (Range i j) = Range i j
range (Range i _) (Range _ j) = Range i j
-- Semiring-Instance for all values.
instance Semiring All where
zero = NoAll
one = All NoList
NoAll + NoAll = NoAll
NoAll + All l = All l
All l + NoAll = All l
All x + All y = All (allList x y)
where allList NoList NoList = NoList
allList NoList (List l) = List l
allList (List l) NoList = List l
allList (List i) (List j) = List (j ++ i)
NoAll * NoAll = NoAll
NoAll * All _ = NoAll
All _ * NoAll = NoAll
All x * All y = All (list0 x y)
where list0 NoList NoList = NoList
list0 (List i) NoList = List i
list0 NoList (List i) = List i
list0 (List i) (List _) = List i
-- Semiring-Instance for allRange values.
instance Semiring AllRange where
zero = NoAllRange
one = AllRange NoRanges
NoAllRange + NoAllRange = NoAllRange
NoAllRange + AllRange l = AllRange l
AllRange l + NoAllRange = AllRange l
AllRange x + AllRange y = AllRange (allrange x y)
where allrange NoRanges NoRanges = NoRanges
allrange NoRanges (Ranges l) = Ranges l
allrange (Ranges l) NoRanges = Ranges l
allrange (Ranges i) (Ranges j) = Ranges (j++i)
NoAllRange * NoAllRange = NoAllRange
NoAllRange * AllRange _ = NoAllRange
AllRange _ * NoAllRange = NoAllRange
AllRange x * AllRange y = AllRange (allr x y)
where
allr NoRanges NoRanges = NoRanges
allr (Ranges l) NoRanges = Ranges l
allr NoRanges (Ranges l) = Ranges l
allr (Ranges i) (Ranges j) = case i of
[] -> Ranges j
list1 -> case j of
((_,d) : js) -> Ranges (take ((length list1)-1) i ++
[(fst (last list1),d)] ++ js)
[] -> Ranges list1
-- Semiring-Instance for captureGroup values.
instance Semiring CaptureGroups where
zero = NoCaptureGroups
one = CaptureGroups NoGroups
NoCaptureGroups + NoCaptureGroups = NoCaptureGroups
NoCaptureGroups + CaptureGroups l = CaptureGroups l
CaptureGroups l + NoCaptureGroups = CaptureGroups l
CaptureGroups x + CaptureGroups y = CaptureGroups (cgs x y)
where
cgs NoGroups NoGroups = NoGroups
cgs (Groups l) NoGroups = Groups l
cgs NoGroups (Groups l) = Groups l
cgs (Groups (i, l1)) (Groups (j, l2)) = Groups ((conca i j),nub (l1++ l2))
conca i j = case i of
[] -> j
((n, s) : xs) -> case filter (\(nj, _) -> n == nj) j of
[] -> (n, s) : conca xs j
((_, sj) : _) -> (n, s ++ sj) : conca xs
(filter (\(nj, _) -> not (n == nj)) j)
NoCaptureGroups * NoCaptureGroups = NoCaptureGroups
NoCaptureGroups * CaptureGroups _ = NoCaptureGroups
CaptureGroups _ * NoCaptureGroups = NoCaptureGroups
CaptureGroups x * CaptureGroups y = CaptureGroups (cgsm x y)
where cgsm NoGroups NoGroups = NoGroups
cgsm (Groups l) NoGroups = Groups l
cgsm NoGroups (Groups l) = Groups l
cgsm (Groups (i, l1)) (Groups (j, _)) = Groups ((comp i j l1), l1)
comp i j l = case i of
[] -> j
((n,s) : xs) -> case filter (\(nj,_) -> n == nj) j of
[] -> (n,s) : comp xs j l
((_, sj) : _) -> (n, if elem n l then comb s sj else s ++ sj) :
comp xs (filter (\(nj, _) -> not (n == nj)) j) l
comb _ [] = []
comb s ((_, b) : xs) = map (\(a, _) -> (a, b)) s ++ comb s xs
-- Extends the semiring class with an index.
class Semiring s => Semiringi s where
index :: Int -> s
-- Semiringi-Instance for leftmost values.
instance Semiringi Leftmost where
index = Leftmost . Startl
-- Semiringi-Instance for leftlong values.
instance Semiringi LeftLong where
index i = LeftLong (Range i i)
-- Semiringi-Instance for all values.
instance Semiringi All where
index i = All (List [i])
--Semiringi-Instance for allRange values.
instance Semiringi AllRange where
index i = AllRange (Ranges [(i,i)])
-- Extends the semiring class with a list.
class Semiring s => Semiringc s where
list :: Int -> [Int] -> s
-- Semiringc-Instance for captureGroup values.
instance Semiringc CaptureGroups where
list i l = CaptureGroups (Groups ((map (\n -> (n, [(i, i)])) l), l))
-- Semiringc-Instance for all values.
instance Semiringc All where
list i _ = All (List [i])
-- Semiringc-Instance for allRange values.
instance Semiringc AllRange where
list i _ = AllRange (Ranges [(i,i)])
-- Class with functions needed for the implementation of capture groups.
class CGFunction s where
newcg :: s -> [Int] -> s
justr :: s -> [Int] -> s
instance CGFunction Bool where
newcg s _ = s
justr = newcg
instance CGFunction Int where
newcg s _ = s
justr = newcg
instance CGFunction Leftmost where
newcg s _ = s
justr = newcg
instance CGFunction LeftLong where
newcg s _ = s
justr = newcg
instance CGFunction All where
newcg s _ = s
justr = newcg
instance CGFunction AllRange where
newcg s _ = s
justr = newcg
instance CGFunction CaptureGroups where
newcg (NoCaptureGroups) _ = NoCaptureGroups
newcg (CaptureGroups (NoGroups)) _ = CaptureGroups (NoGroups)
newcg (CaptureGroups (Groups (g, _))) l = CaptureGroups (Groups (g, l))
justr (NoCaptureGroups) _ = NoCaptureGroups
justr (CaptureGroups (NoGroups)) _ = CaptureGroups (NoGroups)
justr (CaptureGroups (Groups (g, _))) l = CaptureGroups (Groups
(map (\(n, nl) -> if elem n l then (n, nl) else ((-abs(n)), nl)) g, l))
-- Functions for initializing a regex.
-- ε
eps :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s
eps l = Reg {emptyRe = one, final = zero, cg = l, reg = Eps}
--a
literal :: (Semiringc s, CGFunction s, Eq s) => [Int] -> Char
-> Reg (Int, Char) s
literal l a = symc l (==a)
-- a
sym :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> ((c, [Int]) -> s)
-> Reg c s
sym l f = Reg {emptyRe = zero, final = zero, cg = l, reg = Sym f}
-- a with index
symi :: (Semiringi s, CGFunction s, Eq s)=> [Int] -> Char -> Reg (Int, Char) s
symi l c = sym l weight
where weight ((pos,x), _) | x == c = index pos
| otherwise = zero
-- a with list
symc :: (Semiringc s, CGFunction s, Eq s)=> [Int] -> (Char -> Bool)
-> Reg (Int, Char) s
symc l f = sym l captureList
where captureList ((pos, x), li) | f x = list pos li
| otherwise = zero
-- α|β
alt :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s -> Reg c s
-> Reg c s
alt l p q = Reg {emptyRe = newcg (emptyRe p + emptyRe q) l,
final = newcg (final p + final q) l, cg = l, reg = Alt p q}
-- αβ
conc :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s -> Reg c s
-> Reg c s
conc l p q = Reg {emptyRe = emptyRe p * emptyRe q,
final = final p * emptyRe q + final q, cg = l, reg = Seq p q}
-- α*
rep :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s -> Reg c s
rep l r = Reg {emptyRe = one, final = newcg (final r) l, cg = l,
reg = Rep r}
-- α⁺
pl :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s -> Reg c s
pl l r = conc l r (rep l r)
-- _
anyL :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Reg c s
anyL l = sym l (\_ -> one)
-- [as]
bracket :: (Semiringc s, CGFunction s, Eq s) => [Int] ->
[Either Char (Char, Char)] -> Reg (Int, Char) s
bracket li l = let l' = map lift l
f = foldl (\g h -> (\c -> (g c) || (h c))) (head l')
(tail l')
in symc li (\c -> f c)
-- [^as]
negBracket :: (Semiringc s, CGFunction s, Eq s) => [Int] ->
[Either Char (Char, Char)] -> Reg (Int, Char) s
negBracket li l = let l' = map negLift l
f = foldl (\g h -> (\c -> (g c) && (h c)))(head l')
(tail l')
in symc li (\c -> f c)
-- ^α
start :: (Semiringc s, CGFunction s, Eq s) => [Int] -> Reg (Int, Char) s -> Bool
-> Reg (Int, Char) s
start l r True = Reg {emptyRe = emptyRe r, final = newcg (final r) l, cg = l,
reg = Start r}
start l r False = conc l (Reg {emptyRe = emptyRe r, final = newcg (final r) l,
cg = l, reg = Start r}) (rep l (symc l (\_ -> True)))
-- α$
end :: (Semiringc s, CGFunction s, Eq s) => [Int] -> Reg (Int, Char) s -> Bool
-> Reg (Int, Char) s
end l r True = Reg {emptyRe = emptyRe r, final = newcg (final r) l, cg = l,
reg = End r}
end l r False = conc l (rep l (symc l (\_ -> True))) (Reg {emptyRe = emptyRe r,
final = newcg (final r) l, cg = l, reg = End r})
-- α{n,m}
times :: (Semiring s, CGFunction s, Ord c, Eq s) => [Int] -> Int -> Int ->
Reg c s -> Reg c s
times l n m r =
if m < n || n < 0
then Reg {emptyRe = zero, final = zero, cg = l, reg = Times (n,m) r}
else if n == 0
then Reg {emptyRe = one, final = final r, cg = l, reg =
Times (n,m) r}
else Reg {emptyRe = emptyRe r, final = final r * time n, cg = l,
reg = Times (n,m) r}
where time k = if k == 0 then one else zero
-- /(α)/
captureG :: (Semiring s, CGFunction s, Ord c, Eq s) => Int -> Reg c s -> Reg c s
captureG _ r = r
-- For Bracket.
lift :: Either Char (Char,Char) -> Char -> Bool
lift (Left a) = \c -> c == a
lift (Right (a,b)) = \c -> (c >= a && c <= b)
-- For NegBracket.
negLift :: Either Char (Char,Char) -> Char -> Bool
negLift a c = not (lift a c)
-- The match function is used to match lists with regular expressions
-- efficient.
--- @param r - The regular expression
--- @param l - The input list
--- @result Semiring.one if matched else Semiring.zero
matchEff :: (Semiring s, CGFunction s, Ord c, Eq s) => Reg c s -> [c] -> s
matchEff r [] = emptyRe r
matchEff r (c:cs) = final (foldl (shift zero) (shift one r c) cs)
--- The shift function is used within the match function.
--- It marks (sets final) the right positions in the regular expression.
--- @param m - The mark
--- @param r - The regular expression
--- @param c - The next literal (of the input list)
--- @result reg - The marked regular expression
shift :: (Semiring s, CGFunction s, Ord c, Eq s) => s -> Reg c s -> c -> Reg c s
shift m r c = case reg r of
Eps -> eps (cg r)
(Sym f) -> (sym (cg r) f){final = (newcg (m * (f (c, cg r))) (cg r))}
(Alt p q) -> alt (cg r) (shift m p c) (shift m q c)
(Seq p q) -> conc (cg r) (shift m p c) (shift (m * emptyRe p + final p)
q c)
(Rep re) -> rep (cg r) (shift (newcg (m + justr (final re) (cg r))
(cg r)) re c)
(Start re) -> shift m re c
(End re) -> shift m re c
(Times (i,j) re) -> let nre = (shift (newcg (m + justr (final re) (cg r))
(cg r)) re c)
in times (cg r) (max (if (final nre == one) then i-1
else i) 0) (if (final nre == one) then j-1 else j) nre
--- The submatch function is used to check if a substring matches the regular
--- expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result Semiring.one with indices if matched else Semiring.zero
submatch :: (Semiring s, CGFunction s, Ord c, Eq s) => Reg (Int, c) s -> [c]
-> s
submatch r s = matchEff (conc [] arb (conc [] r arb)) (zip [0..] s)
where arb = rep [] (sym [] (\_ -> one))
--- The grep function returns efficient a list with startingpositions of
--- substrings that match the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result l - The list of startingpositions of matching substrings
grep :: Ord a => Reg (Int, a) All -> [a] -> [Int]
grep re s = case submatch re s of
All (List l) -> l
_ -> []
--- The grepPos function returns efficient the first startingposition of a
--- substring which matches the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result n - The startindex of the first substring that matches,
--- -1 if no substring matches
grepPos :: Ord a => Reg (Int, a) All -> [a] -> Int
grepPos re s = case grep re s of
[] -> -1
pos -> head pos
--- The grepShow function returns efficient a list of substrings that match
--- the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @return l - The list of substrings from s that match r
grepShow :: Ord a => Reg (Int, a) AllRange -> [a] -> [[a]]
grepShow re s = case submatch re s of
AllRange (Ranges list0) -> map (\(i,j) -> drop i (take (j Prelude.+1)s)) list0
_ -> []
-- As grepShow but without doubles.
grepShowUnique :: Ord a => Reg (Int, a) AllRange -> [a] -> [[a]]
grepShowUnique re s = nub (grepShow re s)
--- The capture function is used to return capture groups efficient.
--- @param r - The regular expression
--- @param s - The input list
--- @result l - The list with the capture groups and
--- the respectively matching substrings
capture :: Ord c => Reg (Int, c) CaptureGroups -> [c] -> [(Int, [[c]])]
capture r s = case (matchEff r (zip [0.. ] s)) of
CaptureGroups (Groups (l, _)) -> combine $ map (\(n, li) -> (n, (map (\(i, j)
-> drop i (take (j Prelude.+ 1) s)) li))) l
_ -> []
-- Combines the strings of numbers with the same absolute value.
combine :: Ord c => [(Int, [[c]])] -> [(Int, [[c]])]
combine [] = []
combine ((n1, s1) : ns) = case filter (\(i, _) -> abs(i) == abs(n1)) ns of
[] -> (abs(n1), s1) : combine ns
li -> (abs(n1), foldr (++) [] $ map (\(_, s2) -> s2) ((n1, s1) : li)):
combine (filter (\(i, _) -> not $ abs(i) == abs(n1)) ns)
-- Match with capture groups.
match :: Ord c => Reg (Int, c) CaptureGroups -> [c] -> Bool
match r s = let l = capture r s
in if l == []
then False
else if (snd $ head l) == []
then False
else (head $ snd $ head l) == s
|