sourcecode:
|
{-# OPTIONS_FRONTEND -Wno-incomplete-patterns #-}
module RegExp
( RegExp, ORegExp(..), eps, literal, alt, conc, rep, pl, anyL, bracket, negBracket
, start, end, times, captureG
, match, grep, grepPos, grepShow, grepShowUnique, capture
)
where
import Data.List
import Control.Search.Unsafe ( allValues )
------------------------------------------------------------------------------
--- Data type for regex representation in Curry.
type RegExp a = [ORegExp a]
data ORegExp a = Nil
| Literal a
| Xor (RegExp a) (RegExp a)
| Star (RegExp a)
| Plus (RegExp a)
| AnyLiteral
| Bracket [Either a (a,a)]
| NegBracket [Either a (a,a)]
| Start (RegExp a)
| End (RegExp a)
| Times (Int,Int) (RegExp a)
| Capture Int (RegExp a)
------------------------------------------------------------------------------
-- Interface operations for regular expressions, used by the Curry preprocessor.
--- ε
eps :: [Int] -> RegExp a
eps _ = [Nil]
--- a
literal :: [Int] -> a -> RegExp a
literal _ a = [Literal a]
--- α|β
alt :: [Int] -> RegExp a -> RegExp a -> RegExp a
alt _ p q = [Xor p q]
--- αβ
conc :: [Int] -> RegExp a -> RegExp a -> RegExp a
conc _ p q = p ++ q
--- α*
rep :: [Int] -> RegExp a -> RegExp a
rep _ r = [Star r]
--- α⁺
pl :: [Int] -> RegExp a -> RegExp a
pl _ r = [Plus r]
--- _
anyL :: [Int] -> RegExp a
anyL _ = [AnyLiteral]
--- [as]
bracket :: [Int] -> [Either a (a,a)] -> RegExp a
bracket _ l = [Bracket l]
--- [^as]
negBracket :: [Int] -> [Either a (a,a)] -> RegExp a
negBracket _ l = [NegBracket l]
--- ^α
start :: [Int] -> RegExp a -> Bool -> RegExp a
start _ r _ = [Start r]
--- α$
end :: [Int] -> RegExp a -> Bool -> RegExp a
end _ r _ = [End r]
--- α{n,m}
times :: [Int] -> Int -> Int -> RegExp a -> RegExp a
times _ n m r = [Times (n,m) r]
--- /(α)/
captureG :: Int -> RegExp a -> RegExp a
captureG n r = [Capture n r]
------------------------------------------------------------------------------
-- Various operations on regular expressions.
--- The operation `grepPos` returns the first starting position of a substring
--- which matches the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result n - The startindex of the first substring that matches,
--- -1 if no substring matches
grepPos :: (Data a, Ord a) => RegExp a -> [a] -> Int
grepPos re s = grepPos' re s 0
where
-- grepPos' :: (Data a, Ord a) => RegExp a -> [a] -> Int -> Int
grepPos' _ [] _ = -1
grepPos' r (x : xs) n = if match (r ++ [Star ([AnyLiteral])]) (x : xs)
then n
else grepPos' r xs (n+1)
--- The operation `grep` returns a list with starting positions of substrings
--- that match the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result l - The list of startingpositions of matching substrings
grep :: (Data a, Ord a) => RegExp a -> [a] -> [Int]
grep re s = grep' re s 0
where
-- grep' :: (Data a, Ord a) => RegExp a -> [a] -> Int -> [Int]
grep' _ [] _ = []
grep' r (x : xs) n = if match (r ++ [Star ([AnyLiteral])]) (x : xs)
then (n : grep' r xs (n+1))
else grep' r xs (n+1)
--- The operation `grepShow` returns a list of substrings that match the regular
--- expression.
--- @param r - The regular expression
--- @param s - The input list
--- @return l - The list of substrings from s that match r
grepShow :: (Data a, Ord a) => RegExp a -> [a] -> [[a]]
grepShow re s = allValues (grepShow' re s)
where grepShow' re' s' | s' =:= _ ++ l ++ _ && match re' l = l
where l free
--- As `grepShow` but without duplicated elements.
grepShowUnique :: (Data a, Ord a) => RegExp a -> [a] -> [[a]]
grepShowUnique re s = nub (grepShow re s)
--- The match function is used to match lists with regular expressions
--- @param r - The regular expression
--- @param s - The input list
--- @result True if matched else False
match :: (Data a, Ord a) => RegExp a -> [a] -> Bool
match re s = case re of
[] -> s == []
(Nil:ors) -> match ors s
(Xor or1 or2:ors) -> match (or1 ++ ors) s || match (or2 ++ ors) s
(Literal c:ors) -> case s of
[] -> False
(d:ds) -> if (d == c) then match ors ds else False
(Star r:ors) -> matchstar r ors s
(Plus r:ors) -> matchplus r ors s
(AnyLiteral:ors) -> case s of
[] -> False
(_:ds) -> match ors ds
(Bracket b:ors) -> case s of
[] -> False
(d:ds) -> (matchbracket b d) && match ors ds
(NegBracket b:ors) -> case s of
[] -> False
(d:ds) -> not (matchbracket b d) && match ors ds
(Start r:ors) ->
not . null . filter id . allValues $ matchstart (Start r:ors) s
[End r] ->
not . null . filter id . allValues $ matchend (End r) s
[End r, Nil] ->
not . null . filter id . allValues $ matchend (End r) s
(End _ : _) -> False
(Times (n,m) r:ors) -> matchtimes s n m r ors
(Capture _ r:ors) -> match (r ++ ors) s
-- Matching start.
matchstart :: (Data a, Ord a) => RegExp a -> [a] -> Bool
matchstart (Start r : []) s = a ++ _ =:= s && match r a
where a free
matchstart (Start r : ors@(_:_)) s = a ++ b =:= s && match r a && match ors b
where a,b free
-- Matching end.
matchend :: (Data a, Ord a) => ORegExp a -> [a] -> Bool
matchend (End r) s = _ ++ b =:= s && match r b
where b free
-- Matching with a star
matchstar :: (Data a, Ord a) => RegExp a -> RegExp a -> [a] -> Bool
matchstar r rgx st = (||)
(match rgx st)
(tryeach (map (\x -> match r x) (inits st)) (tails st) r rgx)
tryeach :: (Data a, Ord a) => [Bool] -> [[a]] -> RegExp a -> RegExp a -> Bool
tryeach [] [] _ _ = False
tryeach (b:bs) (t:ts) r rgx =
(||)
(if b
then
(match rgx t || matchstar r rgx t)
else False)
(tryeach bs ts r rgx)
-- Matching with a plus
matchplus :: (Data a, Ord a) => RegExp a -> RegExp a -> [a] -> Bool
matchplus r rgx st = tryeach (map (\x -> match r x) ini) tls r rgx
where
ini = tail (inits st)
tls = tail (tails st)
-- Matching with a bracket
matchbracket :: (Data a, Ord a) => [Either a (a,a)] -> a -> Bool
matchbracket [] _ = False
matchbracket (Left c:es) d | c == d = True
| c /= d = matchbracket es d
matchbracket (Right (c1,c2):es) d =
(||)
(d >= c1 && d <= c2)
(matchbracket es d)
-- Matching an amount of times between a range
matchtimes :: (Data a, Ord a) => [a] -> Int -> Int -> RegExp a -> RegExp a
-> Bool
matchtimes s n m r rgx | m == 0 = match rgx s
| m > 0 =
tryeachRestricted (m-n) (map (\x -> match mr x) (inits s)) (tails s) r rgx
where
mr = concat (replicate n r)
tryeachRestricted :: (Data a, Ord a) => Int -> [Bool] -> [[a]] -> RegExp a
-> RegExp a -> Bool
tryeachRestricted _ [] [] _ _ = False
tryeachRestricted m (b:bs) (t:ts) r rgx =
(||)
(if b
then
(match rgx t || matchtimes t 1 m r rgx)
else False)
(tryeachRestricted m bs ts r rgx)
--- The operation `capture` is used to return capture groups.
--- The capture group with number 0 is always the complete string.
--- @param r - The regular expression
--- @param s - The input list
--- @result l - The list with the capture groups and
--- the respectively matching substrings
capture :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
capture re s = case re of
[] -> []
(Nil:ors) -> capture ors s
(Xor or1 or2:ors) -> if match (or1 ++ ors) s
then capture (or1 ++ ors) s
else capture (or2 ++ ors) s
(Literal c:ors) -> case s of
[] -> []
(d:ds) -> if (d == c) then capture ors ds else []
(Star r:ors) -> con $ allValues $ captureStar (Star r : ors) s
(Plus r:ors) -> capture (r ++ [Star r] ++ ors) s
(AnyLiteral:ors) -> case s of
[] -> []
(_:ds) -> capture ors ds
(Bracket b:ors) -> case s of
[] -> []
(d:ds) -> if matchbracket b d
then capture ors ds
else []
(NegBracket b:ors) -> case s of
[] -> []
(d:ds) -> if not (matchbracket b d)
then capture ors ds
else []
(Start r:ors) -> con $ allValues $ captureStart (Start r : ors) s
[End r] -> con $ allValues $ captureEnd (End r) s
[End r, Nil] -> con $ allValues $ captureEnd (End r) s
(End _ : _) -> []
(Times (n,m) r:ors) -> con $ allValues $ captureTimes (Times (n,m) r : ors) s
(Capture n r:ors) -> con $ allValues $ captureCapture (Capture n r : ors) s
-- Finds the capture groups within a star.
captureStar :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
captureStar (Star r : ors) s =
if s =:= s1 ++ s2 && match ([Star r]) s1 && match ors s2
then concat (map (capture r) (grepShow r s1)) ++ capture ors s2
else []
where s1, s2 free
-- Finds the capture groups within a start.
captureStart :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
captureStart re s = case re of
(Start r : []) -> if match (Start r : []) s
then capture r (head (grepShow r s))
else []
(Start r : ors) ->
if s =:= s1 ++ s2 && match (Start r : ors) s && match ors s2
then capture r s1 ++ capture ors s2
else []
where s1, s2 free
-- Finds the capture groups within an end.
captureEnd :: (Data a, Ord a) => ORegExp a -> [a] -> [(Int, [[a]])]
captureEnd (End r) s =
if s =:= _ ++ s2 && match r s2
then capture r s2
else []
where s2 free
-- Finds the capture groups within a times.
captureTimes :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
captureTimes (Times (n,m) r : ors) s =
if s =:= s1 ++ s2 && match ([Times (n,m) r]) s1 && match ors s2
then concat (map (capture r) (grepShow r s1)) ++ capture ors s2
else []
where s1, s2 free
-- Finds capture groups.
captureCapture :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
captureCapture (Capture n r : ors) s =
if s =:= s1 ++ s2 && match r s1 && match ors s2
then [(n, [s1])] ++ capture r s1 ++ capture ors s2
else []
where s1, s2 free
-- Auxiliary operation to unpack result lists and clean.
con :: (Data a, Ord a) => [[(Int, [[a]])]] -> [(Int, [[a]])]
con li = comprime (concat li)
-- Gets a list with pairs of int and some lists and returns a list
-- where the pairs with the same int-number are combined.
comprime :: (Data a, Ord a) => [(Int, [[a]])] -> [(Int, [[a]])]
comprime [] = []
comprime ((n, as) : rl) = case filter (\(m, _) -> n == m) rl of
[] -> (n, as) : comprime rl
l -> (n, as ++ concatMap snd l):comprime (filter (\(m, _) -> not(n == m)) rl)
------------------------------------------------------------------------------
|