sourcecode:
|
module HTML.Parser ( readHtmlFile, parseHtmlString )
where
import Data.Char
import HTML.Base
------------------------------------------------------------------------------
--- Reads a file with HTML text and returns the corresponding HTML expressions.
--- @param file - the name of a file containing HTML text
--- @return a list of HTML expressions (if the file contains exactly one
--- HTML document, this list should contain exactly one element)
readHtmlFile :: HTML h => String -> IO [h]
readHtmlFile file = readFile file >>= return . parseHtmlString
--- Transforms an HTML string into a list of `BaseHTML` expressions.
--- If the HTML string is a well structured document, the list
--- of HTML expressions should contain exactly one element.
parseHtmlString :: HTML h => String -> [h]
parseHtmlString = map fromStaticHtml . parseHtml
--- Transforms an HTML string into a list of `StaticHtml` expressions.
--- If the HTML string is a well structured document, the list
--- of HTML expressions should contain exactly one element.
parseHtml :: String -> [StaticHtml]
parseHtml s = reverse (parseHtmlTokens [] (scanHtmlString s))
------------------------------------------------------------------------------
--- The data type for representing HTML tokens.
data HtmlToken = HTText String | HTElem String [(String,String)]
-- parse a list of HTML tokens into list of HTML expressions:
-- (first argument "helems" is a stack of already read tokens)
parseHtmlTokens :: [StaticHtml] -> [HtmlToken] -> [StaticHtml]
parseHtmlTokens helems [] = helems
parseHtmlTokens helems (HTText s : hs) =
parseHtmlTokens (HText s : helems) hs
parseHtmlTokens helems (HTElem (t:ts) args : hs) =
if t == '/'
then let (structargs,elems,rest) = splitHtmlElems ts helems
in parseHtmlTokens ([HStruct ts structargs elems] ++ rest) hs
else parseHtmlTokens (HStruct (t:ts) args [] : helems) hs
parseHtmlTokens _ (HTElem [] _ : _) =
error "Internal error in HTML.Parser.parseHtmlTokens: empty list in HTElem"
-- split the HTML token stack up to a particular token:
splitHtmlElems :: String -> [StaticHtml]
-> ([(String,String)],[StaticHtml],[StaticHtml])
splitHtmlElems _ [] = ([],[],[])
splitHtmlElems tag (HText s : hs) =
let (largs,elems,rest) = splitHtmlElems tag hs
in (largs, elems ++ [HText s], rest)
splitHtmlElems tag (HStruct s args cont@(_:_) : hs) =
let (largs,elems,rest) = splitHtmlElems tag hs
in (largs, elems ++ [HStruct s args cont], rest)
splitHtmlElems tag (HStruct s args []: hs) =
if tag==s
then (args,[],hs)
else let (largs,elems,rest) = splitHtmlElems tag hs
in (largs, elems ++ [HStruct s args []], rest)
-- scan an HTML string into list of HTML tokens:
scanHtmlString :: String -> [HtmlToken]
scanHtmlString s = scanHtml s
where
scanHtml [] = []
scanHtml (c:cs) =
if c=='<'
then if take 3 cs == "!--"
then scanHtmlComment cs
else if take 4 (map toLower cs) == "pre>"
then scanHtmlPre "" (skipFirstNewLine (drop 4 cs))
else scanHtmlElem [] cs
else let (initxt,remtag) = break (=='<') (c:cs)
in HTText initxt : scanHtml remtag
-- scan an HTML element
scanHtmlElem :: String -> String -> [HtmlToken]
scanHtmlElem ct [] = [HTText ("<"++ct)] -- incomplete element
scanHtmlElem ct (c:cs)
| c=='>' = (if null ct
then HTText "<>" -- invalid HTML, but we accept it...
else HTElem ct []) : scanHtmlString cs
| isSpace c =
if null ct
then HTText "< " : scanHtmlString cs -- invalid HTML, but we accept it...
else let (args,rest) = splitAtElement (=='>') (dropWhile isSpace cs)
revargs = reverse args
in if null args || head revargs /= '/'
then HTElem ct (string2args args) : scanHtmlString rest
else HTElem ct (string2args (reverse (tail revargs)))
: HTElem ('/':ct) [] : scanHtmlString rest
| c=='/' && head cs == '>' = HTElem ct [] : HTElem ('/':ct) []
: scanHtmlString (tail cs)
| otherwise = scanHtmlElem (ct++[toLower c]) cs
-- scan an HTML comment
scanHtmlComment :: String -> [HtmlToken]
scanHtmlComment [] = []
scanHtmlComment (c:cs) =
if c=='-' && take 2 cs == "->"
then scanHtmlString (drop 2 cs)
else scanHtmlComment cs
-- scan an HTML preformatted element
scanHtmlPre :: String -> String -> [HtmlToken]
scanHtmlPre _ [] = [] -- errorneous incomplete element
scanHtmlPre pre (c:cs) =
if c=='<' && take 5 (map toLower cs) == "/pre>"
then HTElem "pre" [] : HTText (reverse pre) : HTElem "/pre" []
: scanHtmlString (drop 5 cs)
else scanHtmlPre (c:pre) cs
-- split a string into blank separated list of strings:
string2args :: String -> [(String,String)]
string2args [] = []
string2args (c:cs) =
let (arg1,rest) = splitAtElement isSpace (c:cs)
in deleteApo (splitAtElement (=='=') arg1)
: string2args (dropWhile isSpace rest)
deleteApo :: (String,String) -> (String,String)
deleteApo (tag,[]) = (map toLower tag,[])
deleteApo (tag,c:cs) | c=='"' = (map toLower tag, deleteLastApo cs)
| c=='\'' = (map toLower tag, deleteLastApo cs)
| otherwise = (map toLower tag, c:cs)
deleteLastApo :: String -> String
deleteLastApo [] = []
deleteLastApo [c] = if c=='"' || c=='\'' then [] else [c]
deleteLastApo (c1:c2:cs) = c1 : deleteLastApo (c2:cs)
-- split a list at the first element satisfying a predicate:
splitAtElement :: (a -> Bool) -> [a] -> ([a],[a])
splitAtElement _ [] = ([],[])
splitAtElement p (c:cs) =
if p c then ([],cs)
else let (first,rest) = splitAtElement p cs in (c:first,rest)
skipFirstNewLine :: String -> String
skipFirstNewLine [] = []
skipFirstNewLine (c:cs) =
if c=='\n' then cs
else if isSpace c then skipFirstNewLine cs else c:cs
-- end of HTML parser
|