CurryInfo: regexp-4.1.0 / RegExp

classes:

              
documentation:
------------------------------------------------------------------------------
--- A library to defines a structure of regular expressions and
--- a simple match operation for regular expressions.
--- This library is used to translate integrated code in the form
--- of POSIX extended regular expressions into Curry programs.
--- It is based on the bachelor thesis "Foreign Code Integration in Curry" of
--- Jasper Sikorra (March 2014).
---
--- @author Corinna Wambsganz
--- @version October 2022
------------------------------------------------------------------------------
name:
RegExp
operations:
alt anyL bracket capture captureG conc end eps grep grepPos grepShow grepShowUnique literal match negBracket pl rep start times
sourcecode:
{-# OPTIONS_FRONTEND -Wno-incomplete-patterns #-}


module RegExp
  ( RegExp, ORegExp(..), eps, literal, alt, conc, rep, pl, anyL, bracket, negBracket
  , start, end, times, captureG
  , match, grep, grepPos, grepShow, grepShowUnique, capture
  ) 
 where

import Data.List
import Control.Search.Unsafe ( allValues )

------------------------------------------------------------------------------
--- Data type for regex representation in Curry.
type RegExp a = [ORegExp a]

data ORegExp a = Nil
               | Literal a
               | Xor (RegExp a) (RegExp a)
               | Star (RegExp a)
               | Plus (RegExp a)
               | AnyLiteral
               | Bracket [Either a (a,a)]
               | NegBracket [Either a (a,a)]
               | Start (RegExp a)
               | End (RegExp a)
               | Times (Int,Int) (RegExp a)
               | Capture Int (RegExp a)


------------------------------------------------------------------------------
-- Interface operations for regular expressions, used by the Curry preprocessor.

--- ε
eps :: [Int] -> RegExp a
eps _ = [Nil]

--- a
literal :: [Int] -> a -> RegExp a
literal _ a = [Literal a]

--- α|β
alt :: [Int] -> RegExp a -> RegExp a -> RegExp a
alt _ p q = [Xor p q]

--- αβ
conc :: [Int] -> RegExp a -> RegExp a -> RegExp a
conc _ p q = p ++ q

--- α*
rep :: [Int] -> RegExp a -> RegExp a
rep _ r = [Star r]

--- α⁺
pl :: [Int] -> RegExp a -> RegExp a
pl _ r = [Plus r]

--- _
anyL :: [Int] -> RegExp a
anyL _ = [AnyLiteral]

--- [as]
bracket :: [Int] -> [Either a (a,a)] -> RegExp a
bracket _ l = [Bracket l]

--- [^as]
negBracket :: [Int] -> [Either a (a,a)]  -> RegExp a
negBracket _ l = [NegBracket l]

--- ^α
start :: [Int] -> RegExp a -> Bool -> RegExp a
start _ r _ = [Start r]

--- α$
end :: [Int] -> RegExp a -> Bool -> RegExp a
end _ r _ = [End r]

--- α{n,m}
times :: [Int] -> Int -> Int -> RegExp a -> RegExp a
times _ n m r = [Times (n,m) r]

--- /(α)/
captureG :: Int -> RegExp a -> RegExp a
captureG n r = [Capture n r]

------------------------------------------------------------------------------
-- Various operations on regular expressions.

--- The operation `grepPos` returns the first starting position of a substring
--- which matches the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result n - The startindex of the first substring that matches,
--- -1 if no substring matches
grepPos ::  (Data a, Ord a) => RegExp a -> [a] -> Int
grepPos re s = grepPos' re s 0
 where
  -- grepPos' :: (Data a, Ord a) => RegExp a -> [a] -> Int -> Int
  grepPos' _  []       _ = -1
  grepPos' r (x : xs) n = if match (r ++ [Star ([AnyLiteral])]) (x : xs)
                            then n
                            else grepPos' r xs (n+1)

--- The operation `grep` returns a list with starting positions of substrings
--- that match the regular expression.
--- @param r - The regular expression
--- @param s - The input list
--- @result l - The list of startingpositions of matching substrings
grep ::  (Data a, Ord a) => RegExp a -> [a] -> [Int]
grep re s = grep' re s 0
 where
  -- grep' :: (Data a, Ord a) => RegExp a -> [a] -> Int -> [Int]
  grep' _  []       _ = []
  grep' r (x : xs) n = if match (r ++ [Star ([AnyLiteral])]) (x : xs)
                            then (n : grep' r xs (n+1))
                            else grep' r xs (n+1)

--- The operation `grepShow` returns a list of substrings that match the regular
--- expression.
--- @param r - The regular expression
--- @param s - The input list
--- @return l - The list of substrings from s that match r
grepShow :: (Data a, Ord a) => RegExp a -> [a] -> [[a]]
grepShow re s = allValues (grepShow' re s)
 where grepShow' re' s' | s' =:= _ ++ l ++ _ && match re' l = l
                          where l free

--- As `grepShow` but without duplicated elements.
grepShowUnique :: (Data a, Ord a) => RegExp a -> [a] -> [[a]]
grepShowUnique re s = nub  (grepShow re s)

--- The match function is used to match lists with regular expressions
--- @param r - The regular expression
--- @param s - The input list
--- @result True if matched else False
match :: (Data a, Ord a) => RegExp a -> [a] -> Bool
match re s = case re of
  []                  -> s == []
  (Nil:ors)           -> match ors s
  (Xor or1 or2:ors)   -> match (or1 ++ ors) s || match (or2 ++ ors) s
  (Literal c:ors)     -> case s of
    []      -> False
    (d:ds)  -> if (d == c) then match ors ds else False
  (Star r:ors)        -> matchstar r ors s
  (Plus r:ors)        -> matchplus r ors s
  (AnyLiteral:ors)    -> case s of
    []      -> False
    (_:ds)  -> match ors ds
  (Bracket b:ors)     -> case s of
    []      -> False
    (d:ds)  -> (matchbracket b d) && match ors ds
  (NegBracket b:ors)  -> case s of
    []      -> False
    (d:ds)  -> not (matchbracket b d) && match ors ds
  (Start r:ors)       ->
    not . null . filter id . allValues $ matchstart (Start r:ors) s
  [End r]             ->
    not . null . filter id . allValues $ matchend (End r) s
  [End r, Nil]        ->
    not . null . filter id . allValues $ matchend (End r) s
  (End _ : _)         -> False
  (Times (n,m) r:ors) -> matchtimes s n m r ors
  (Capture _ r:ors)   -> match (r ++ ors) s

-- Matching start.
matchstart :: (Data a, Ord a) => RegExp a -> [a] -> Bool
matchstart (Start r : []) s = a ++ _ =:= s && match r a
 where a free
matchstart (Start r : ors@(_:_)) s = a ++ b =:= s && match r a && match ors b
 where a,b free

-- Matching end.
matchend :: (Data a, Ord a) => ORegExp a -> [a] -> Bool
matchend (End r) s = _ ++ b =:= s && match r b
  where b free

-- Matching with a star
matchstar :: (Data a, Ord a) => RegExp a -> RegExp a -> [a] -> Bool
matchstar r rgx st = (||)
  (match rgx st)
  (tryeach (map (\x -> match r x) (inits st)) (tails st) r rgx)

tryeach :: (Data a, Ord a) => [Bool] -> [[a]] -> RegExp a -> RegExp a -> Bool
tryeach [] []         _  _   = False
tryeach (b:bs) (t:ts) r  rgx =
  (||)
    (if b
      then
        (match rgx t || matchstar r rgx t)
      else False)
    (tryeach bs ts r rgx)

-- Matching with a plus
matchplus :: (Data a, Ord a) => RegExp a -> RegExp a -> [a] -> Bool
matchplus r rgx st = tryeach (map (\x -> match r x) ini) tls r rgx
  where
    ini = tail (inits st)
    tls = tail (tails st)

-- Matching with a bracket
matchbracket :: (Data a, Ord a) => [Either a (a,a)] -> a -> Bool
matchbracket [] _                          = False
matchbracket (Left c:es)        d | c == d = True
                                  | c /= d = matchbracket es d
matchbracket (Right (c1,c2):es) d          =
  (||)
    (d >= c1 && d <= c2)
    (matchbracket es d)

-- Matching an amount of times between a range
matchtimes :: (Data a, Ord a) => [a] -> Int -> Int -> RegExp a -> RegExp a
           -> Bool
matchtimes s n m r rgx | m == 0 = match rgx s
                       | m >  0 =
  tryeachRestricted (m-n) (map (\x -> match mr x) (inits s)) (tails s) r rgx
  where
    mr = concat (replicate n r)

tryeachRestricted :: (Data a, Ord a) => Int -> [Bool] -> [[a]] -> RegExp a
                  -> RegExp a -> Bool
tryeachRestricted _      []     []     _   _   = False
tryeachRestricted m      (b:bs) (t:ts) r  rgx  =
  (||)
    (if b
      then
        (match rgx t || matchtimes t 1 m r rgx)
      else False)
    (tryeachRestricted m bs ts r rgx)

--- The operation `capture` is used to return capture groups.
--- The capture group with number 0 is always the complete string.
--- @param r - The regular expression
--- @param s - The input list
--- @result l - The list with the capture groups and
--- the respectively matching substrings
capture :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
capture re s = case re of
  []                  -> []
  (Nil:ors)           -> capture ors s
  (Xor or1 or2:ors)   -> if match (or1 ++ ors) s
                            then capture (or1 ++ ors) s
                            else capture (or2 ++ ors) s
  (Literal c:ors)     -> case s of
    []      -> []
    (d:ds)  -> if (d == c) then capture ors ds else []
  (Star r:ors)        -> con $ allValues $ captureStar (Star r : ors) s
  (Plus r:ors)        -> capture (r ++ [Star r] ++ ors) s
  (AnyLiteral:ors)    -> case s of
    []      -> []
    (_:ds)  -> capture ors ds
  (Bracket b:ors)     -> case s of
    []      -> []
    (d:ds)  -> if matchbracket b d
                 then capture ors ds
                 else []
  (NegBracket b:ors)  -> case s of
    []      -> []
    (d:ds)  -> if not (matchbracket b d)
                then capture ors ds
                else []
  (Start r:ors)       -> con $ allValues $ captureStart (Start r : ors) s
  [End r]             -> con $ allValues $ captureEnd (End r) s
  [End r, Nil]        -> con $ allValues $ captureEnd (End r) s
  (End _ : _)         -> []
  (Times (n,m) r:ors) -> con $ allValues $ captureTimes (Times (n,m) r : ors) s
  (Capture n r:ors)   -> con $ allValues $ captureCapture (Capture n r : ors) s

-- Finds the capture groups within a star.
captureStar :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
captureStar (Star r : ors) s =
  if s =:= s1 ++ s2 && match ([Star r]) s1 && match ors s2
    then concat (map (capture r) (grepShow r s1)) ++ capture ors s2
    else []
      where s1, s2 free

-- Finds the capture groups within a start.
captureStart :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
captureStart re s = case re of
  (Start r : [])  -> if match (Start r : []) s
                       then capture r (head (grepShow r s))
                       else []
  (Start r : ors) ->
    if s =:= s1 ++ s2 && match (Start r : ors) s && match ors s2
      then capture r s1 ++ capture ors s2
      else []
        where s1, s2 free

-- Finds the capture groups within an end.
captureEnd :: (Data a, Ord a) => ORegExp a -> [a] -> [(Int, [[a]])]
captureEnd (End r) s =
  if s =:= _ ++ s2 && match r s2
    then capture r s2
    else []
 where s2 free

-- Finds the capture groups within a times.
captureTimes :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
captureTimes (Times (n,m) r : ors) s =
  if s =:= s1 ++ s2 && match ([Times (n,m) r]) s1 && match ors s2
    then concat (map (capture r) (grepShow r s1)) ++ capture ors s2
    else []
 where s1, s2 free

-- Finds capture groups.
captureCapture :: (Data a, Ord a) => RegExp a -> [a] -> [(Int, [[a]])]
captureCapture (Capture n r : ors) s =
  if s =:= s1 ++ s2 && match r s1 && match ors s2
    then [(n, [s1])] ++ capture r s1 ++ capture ors s2
    else []
 where s1, s2 free


-- Auxiliary operation to unpack result lists and clean.
con :: (Data a, Ord a) => [[(Int, [[a]])]] -> [(Int, [[a]])]
con li = comprime (concat li)

-- Gets a list with pairs of int and some lists and returns a list
-- where the pairs with the same int-number are combined.
comprime :: (Data a, Ord a) => [(Int, [[a]])] -> [(Int, [[a]])]
comprime []             = []
comprime ((n, as) : rl) = case filter (\(m, _) -> n == m) rl of
  [] -> (n, as) : comprime rl
  l  -> (n, as ++ concatMap snd l):comprime (filter (\(m, _) -> not(n == m)) rl)

------------------------------------------------------------------------------
types:
ORegExp RegExp
unsafe:
unsafe due to modules RegExp Control.Search.Unsafe