Module HTML.Parser

------------------------------------------------------------------------------
--- This module contains a very simple parser for HTML documents.
---
--- @author Michael Hanus
--- @version October 2020
------------------------------------------------------------------------------
{-# OPTIONS_CYMAKE -Wno-incomplete-patterns #-}

module HTML.Parser ( readHtmlFile, parseHtmlString )
 where

import Char
import HTML.Base

--- Reads a file with HTML text and returns the corresponding HTML expressions.
--- @param file - the name of a file containing HTML text
--- @return a list of HTML expressions (if the file contains exactly one
---         HTML document, this list should contain exactly one element)
readHtmlFile :: String -> IO [BaseHtml]
readHtmlFile file = readFile file >>= return . parseHtmlString

------------------------------------------------------------------------------
--- Transforms an HTML string into a list of HTML expressions.
--- If the HTML string is a well structured document, the list
--- of HTML expressions should contain exactly one element.
parseHtmlString :: String -> [BaseHtml]
parseHtmlString s = reverse (parseHtmlTokens [] (scanHtmlString s))

--- The data type for representing HTML tokens.
data HtmlToken = HText String | HElem String [(String,String)]

-- parse a list of HTML tokens into list of HTML expressions:
-- (first argument "helems" is a stack of already read tokens)
parseHtmlTokens :: [BaseHtml] -> [HtmlToken] -> [BaseHtml]
parseHtmlTokens helems []             = helems
parseHtmlTokens helems (HText s : hs) =
 parseHtmlTokens (BaseText s : helems) hs
parseHtmlTokens helems (HElem (t:ts) args : hs) =
 if t == '/'
   then let (structargs,elems,rest) = splitHtmlElems ts helems
        in parseHtmlTokens ([BaseStruct ts structargs elems] ++ rest) hs
   else parseHtmlTokens (BaseStruct (t:ts) args [] : helems) hs


-- split the HTML token stack up to a particular token:
splitHtmlElems :: String -> [BaseHtml]
               -> ([(String,String)],[BaseHtml],[BaseHtml])
splitHtmlElems _ [] = ([],[],[])
splitHtmlElems tag (BaseText s : hs) =
 let (largs,elems,rest) = splitHtmlElems tag hs
 in (largs, elems ++ [BaseText s], rest)
splitHtmlElems tag (BaseStruct s args cont@(_:_) : hs) =
 let (largs,elems,rest) = splitHtmlElems tag hs
 in (largs, elems ++ [BaseStruct s args cont], rest)
splitHtmlElems tag (BaseStruct s args []: hs) =
 if tag==s
   then (args,[],hs)
   else let (largs,elems,rest) = splitHtmlElems tag hs
        in  (largs, elems ++ [BaseStruct s args []], rest)


-- scan an HTML string into list of HTML tokens:
scanHtmlString :: String -> [HtmlToken]
scanHtmlString s = scanHtml s
 where
  scanHtml []     = []
  scanHtml (c:cs) =
    if c=='<'
    then if take 3 cs == "!--"
         then scanHtmlComment cs
         else if take 4 (map toLower cs) == "pre>"
              then scanHtmlPre "" (skipFirstNewLine (drop 4 cs))
              else scanHtmlElem [] cs
    else let (initxt,remtag) = break (=='<') (c:cs)
          in HText initxt : scanHtml remtag

-- scan an HTML element
scanHtmlElem :: String -> String -> [HtmlToken]
scanHtmlElem ct [] = [HText ("&lt;"++ct)] -- incomplete element
scanHtmlElem ct (c:cs)
  | c=='>'    = (if null ct
                 then HText "&lt;&gt;" -- invalid HTML, but we accept it...
                 else HElem ct [])  : scanHtmlString cs
  | isSpace c =
     if null ct
     then HText "&lt; " : scanHtmlString cs -- invalid HTML, but we accept it...
     else let (args,rest) = splitAtElement (=='>') (dropWhile isSpace cs)
              revargs = reverse args
           in if null args || head revargs /= '/'
              then HElem ct (string2args args) : scanHtmlString rest
              else HElem ct (string2args (reverse (tail revargs)))
                    : HElem ('/':ct) [] : scanHtmlString rest
  | c=='/' && head cs == '>' = HElem ct [] : HElem ('/':ct) []
                                           : scanHtmlString (tail cs)
  | otherwise = scanHtmlElem (ct++[toLower c]) cs

-- scan an HTML comment
scanHtmlComment :: String -> [HtmlToken]
scanHtmlComment [] = []
scanHtmlComment (c:cs) =
  if c=='-' && take 2 cs == "->"
  then scanHtmlString (drop 2 cs)
  else scanHtmlComment cs

-- scan an HTML preformatted element
scanHtmlPre :: String -> String -> [HtmlToken]
scanHtmlPre _ [] = []  -- errorneous incomplete element
scanHtmlPre pre (c:cs) =
  if c=='<' && take 5 (map toLower cs) == "/pre>"
  then HElem "pre" [] : HText (reverse pre) : HElem "/pre" []
       : scanHtmlString (drop 5 cs)
  else scanHtmlPre (c:pre) cs

-- split a string into blank separated list of strings:
string2args :: String -> [(String,String)]
string2args [] = []
string2args (c:cs) =
   let (arg1,rest) = splitAtElement isSpace (c:cs)
   in  deleteApo (splitAtElement (=='=') arg1)
        : string2args (dropWhile isSpace rest)

deleteApo :: (String,String) -> (String,String)
deleteApo (tag,[]) = (map toLower tag,[])
deleteApo (tag,c:cs) | c=='"'    = (map toLower tag, deleteLastApo cs)
                     | c=='\''   = (map toLower tag, deleteLastApo cs)
                     | otherwise = (map toLower tag, c:cs)

deleteLastApo :: String -> String
deleteLastApo [] = []
deleteLastApo [c] = if c=='"' || c=='\'' then [] else [c]
deleteLastApo (c1:c2:cs) = c1 : deleteLastApo (c2:cs)


-- split a list at the first element satisfying a predicate:
splitAtElement :: (a -> Bool) -> [a] -> ([a],[a])
splitAtElement _ [] = ([],[])
splitAtElement p (c:cs) =
  if p c then ([],cs)
         else let (first,rest) = splitAtElement p cs in (c:first,rest)

skipFirstNewLine :: String -> String
skipFirstNewLine [] = []
skipFirstNewLine (c:cs) =
  if c=='\n' then cs
             else if isSpace c then skipFirstNewLine cs else c:cs

-- end of HTML parser