CurryInfo: markdown-3.3.0 / Text.Markdown

classes:

              
documentation:
------------------------------------------------------------------------------
--- Library to translate
--- [markdown documents](http://en.wikipedia.org/wiki/Markdown)
--- into HTML or LaTeX.
--- The slightly restricted subset of the markdown syntax recognized by
--- this implementation is
--- [documented in this page](https://github.com/curry-packages/markdown/blob/main/docs/markdown-syntax.md).
---
--- @author Michael Hanus
--- @version November 2023
------------------------------------------------------------------------------
name:
Text.Markdown
operations:
formatMarkdownFileAsPDF formatMarkdownInputAsPDF fromMarkdownText markdownEscapeChars markdownText2CompleteHTML markdownText2CompleteLaTeX markdownText2HTML markdownText2LaTeX markdownText2LaTeXWithFormat removeEscapes
sourcecode:
module Text.Markdown
  ( MarkdownDoc, MarkdownElem(..), fromMarkdownText
  , removeEscapes, markdownEscapeChars
  , markdownText2HTML, markdownText2CompleteHTML
  , markdownText2LaTeX, markdownText2LaTeXWithFormat
  , markdownText2CompleteLaTeX
  , formatMarkdownFileAsPDF, formatMarkdownInputAsPDF
  )
 where

import Data.Char
import System.IO   ( getContents )
import Data.List
import System.Process

import HTML.Base
import HTML.LaTeX

-----------------------------------------------------------------------
--- A markdown document is a list of markdown elements.
type MarkdownDoc = [MarkdownElem]

--- The data type for representing the different elements
--- occurring in a markdown document.
--- @cons Text s - a simple text in a markdown document
--- @cons Emph s - an emphasized text in a markdown document
--- @cons Strong s - a strongly emphaszed text in a markdown document
--- @cons Strong s - a code string in a markdown document
--- @cons Code s - a code string in a markdown document
--- @cons HRef s u - a reference to URL `u` with text `s` in a markdown document
--- @cons Par md - a paragraph in a markdown document
--- @cons CodeBlock s - a code block in a markdown document
--- @cons UList mds - an unordered list in a markdown document
--- @cons OList mds - an ordered list in a markdown document
--- @cons Quote md - a quoted paragraph in a markdown document
--- @cons HRule - a hoirzontal rule in a markdown document
--- @cons Header l s - a level `l` header with title `s`
---                    in a markdown document
data MarkdownElem = Text String
                  | Emph String
                  | Strong String
                  | Code String
                  | HRef String String
                  | Par MarkdownDoc
                  | CodeBlock String
                  | UList [MarkdownDoc]
                  | OList [MarkdownDoc]
                  | Quote MarkdownDoc
                  | HRule
                  | Header Int String

-----------------------------------------------------------------------
--- The data type for representing the different elements
--- of a source markdown document. Basically, it is the same
--- as the final markdown document execept for single list items
--- that occur in the source but are combined into a list in
--- the final document.
data SourceMDElem = SMDText String
                  | SMDEmph String
                  | SMDStrong String
                  | SMDCode String
                  | SMDHRef String String
                  | SMDPar MarkdownDoc
                  | SMDCodeBlock String
                  | SMDUItem String
                  | SMDOItem String
                  | SMDQuote MarkdownDoc
                  | SMDHRule
                  | SMDHeader Int String

isSMDUItem :: SourceMDElem -> Bool
isSMDUItem md = case md of SMDUItem _ -> True
                           _          -> False

isSMDOItem :: SourceMDElem -> Bool
isSMDOItem md = case md of SMDOItem _ -> True
                           _          -> False

textOfItem :: SourceMDElem -> String
textOfItem md = case md of SMDUItem txt -> txt
                           SMDOItem txt -> txt
                           _            -> ""

-----------------------------------------------------------------------
--- Parse markdown document from its textual representation.
fromMarkdownText :: String -> MarkdownDoc
fromMarkdownText = groupMarkDownElems . markdownText

-- Group adjacent item elements together in a markdown list.
groupMarkDownElems :: [SourceMDElem] -> MarkdownDoc
groupMarkDownElems [] = []
groupMarkDownElems (SMDUItem itxt :mds) = joinItems UList isSMDUItem [itxt] mds
groupMarkDownElems (SMDOItem itxt :mds) = joinItems OList isSMDOItem [itxt] mds
groupMarkDownElems (SMDText s : mds) = Text s : groupMarkDownElems mds
groupMarkDownElems (SMDEmph s : mds) = Emph s : groupMarkDownElems mds
groupMarkDownElems (SMDStrong s : mds) = Strong s : groupMarkDownElems mds
groupMarkDownElems (SMDCode s : mds) = Code s : groupMarkDownElems mds
groupMarkDownElems (SMDHRef s u : mds) = HRef s u : groupMarkDownElems mds
groupMarkDownElems (SMDPar md : mds) = Par md : groupMarkDownElems mds
groupMarkDownElems (SMDCodeBlock s : mds) = CodeBlock s : groupMarkDownElems mds
groupMarkDownElems (SMDQuote md : mds) = Quote md : groupMarkDownElems mds
groupMarkDownElems (SMDHRule : mds) = HRule : groupMarkDownElems mds
groupMarkDownElems (SMDHeader l s : mds) = Header l s : groupMarkDownElems mds

joinItems :: ([[MarkdownElem]] -> MarkdownElem) -> (SourceMDElem -> Bool)
          -> [String] -> [SourceMDElem] -> [MarkdownElem]
joinItems mdlcons _ items [] = [mdlcons (reverse (map fromMarkdownText items))]
joinItems mdlcons isitem items (md:mds) =
  if isitem md
    then joinItems mdlcons isitem (textOfItem md : items) mds
    else mdlcons (reverse (map fromMarkdownText items))
          : groupMarkDownElems (md:mds)

-- Basic reader for a markdown text.
markdownText :: String -> [SourceMDElem]
markdownText [] = []
markdownText txt@(_:_) = markdownLine fstline (dropFirst remtxt)
 where (fstline,remtxt) = break (=='\n') txt

-- Analyze the first line of a markdown text:
markdownLine :: String -> String -> [SourceMDElem]
markdownLine fstline remtxt
  | all isSpace fstline
  = markdownText remtxt
  | blanklen > 0 -- four space indent for code
  = markdownCodeBlock blanklen (drop blanklen fstline) remtxt
  | isLevel1Line
  = SMDHeader 1 fstline : markdownText (dropFirst furtherlines)
  | isLevel2Line
  = SMDHeader 2 fstline : markdownText (dropFirst furtherlines)
  | take 1 fstline == "#"
  = tryMDHeader fstline remtxt
  | isHRule fstline
  = SMDHRule : markdownText remtxt
  | take 2 fstline == "> " -- start of a quoted text
  = markdownQuote (drop 2 fstline) remtxt
  | uitemlen > 0 -- start of an unordered item
  = markdownItem SMDUItem uitemlen (drop uitemlen fstline) remtxt
  | nitemlen > 0 -- start of a numbered item
  = markdownItem SMDOItem nitemlen (drop nitemlen fstline) remtxt
  | otherwise = markdownPar fstline remtxt
 where
  blanklen = isCodeLine fstline
  (sndline,furtherlines) = break (=='\n') remtxt
  isLevel1Line = not (null sndline) && all (=='=') sndline
  isLevel2Line = not (null sndline) && all (=='-') sndline
  nitemlen = isNumberedItemLine fstline
  uitemlen = isUnorderedItemLine fstline

dropFirst :: [a] -> [a]
dropFirst s = if null s then [] else tail s

-- translate a header line
tryMDHeader :: String -> String -> [SourceMDElem]
tryMDHeader s rtxt =
  let (sharps,htxt) = break (==' ') s
      level = length sharps
   in if null htxt || level>6
        then markdownPar s rtxt
        else SMDHeader level (dropFirst htxt) : markdownText rtxt

-- is a line a horizontal rule:
isHRule :: String -> Bool
isHRule l =
  (all (\c -> isSpace c || c=='-') l && length (filter (=='-') l) > 3) ||
  (all (\c -> isSpace c || c=='*') l && length (filter (=='*') l) > 3)

-- check whether a line starts with an unordered item indicator ("* ")
-- and return indent:
isUnorderedItemLine :: String -> Int
isUnorderedItemLine s =
  let (blanks,nonblanks) = span (==' ') s
  in if take 2 nonblanks `elem` ["* ","- ","+ "] then length blanks+2 else 0

-- check whether a line starts with an indented number and return indent value:
isNumberedItemLine :: String -> Int
isNumberedItemLine s =
  let (blanks,nonblanks) = span (==' ') s
      numblanks = length blanks
  in checkNumber numblanks nonblanks
 where
  checkNumber indt numtxt =
    let (ns,brt) = break (==' ') numtxt
        (blanks,rtxt) = break (/=' ') brt
        nsl = length ns
    in if nsl>0 && all isDigit (take (nsl-1) ns) && ns!!(nsl-1)=='.' &&
          not (null blanks) && not (null rtxt)
         then indt+nsl+length blanks
         else 0

-- Check whether a line starts with at least four blanks and
-- return indent value:
isCodeLine :: String -> Int
isCodeLine s =
  let (blanks,nonblanks) = span (==' ') s
      numblanks = length blanks
  in if not (null nonblanks) && numblanks >= 4 then numblanks else 0

-- parse a paragraph (where the initial part of the paragraph is given
-- as the first argument):
markdownPar :: String -> String -> [SourceMDElem]
markdownPar ptxt txt
 | null txt || head txt `elem` ['\n'] ||
   uitemlen>0 || nitemlen>0
  = SMDPar (groupMarkDownElems (outsideMarkdownElem "" ptxt)) : markdownText txt
 | null remtxt
  = [SMDPar (groupMarkDownElems (outsideMarkdownElem "" (ptxt++'\n':fstline)))]
 | otherwise = markdownPar (ptxt++'\n':fstline) (tail remtxt)
 where
  (fstline,remtxt) = break (=='\n') txt
  nitemlen = isNumberedItemLine fstline
  uitemlen = isUnorderedItemLine fstline

-- parse a quoted section:
markdownQuote :: String -> String -> [SourceMDElem]
markdownQuote qtxt alltxt =
  let txt = if take 2 alltxt == ">\n" -- allow empty quote lines
            then "> " ++ drop 1 alltxt
            else alltxt
  in if take 2 txt == "> "
       then let (fstline,remtxt) = break (=='\n') (drop 2 txt)
            in if null remtxt
                 then [SMDQuote (fromMarkdownText (qtxt++'\n':fstline))]
                 else markdownQuote (qtxt++'\n':fstline) (tail remtxt)
       else SMDQuote (fromMarkdownText qtxt) : markdownText txt

-- parse a program block (where the indent and the initial code block is given):
markdownCodeBlock :: Int -> String -> String -> [SourceMDElem]
markdownCodeBlock n ctxt txt =
  if all (==' ') (take n txt)
    then let (fstline,remtxt) = break (=='\n') (drop n txt)
         in if null remtxt
              then [SMDCodeBlock (ctxt ++ '\n' : fstline)]
              else markdownCodeBlock n (ctxt ++ '\n' : fstline)
                                     (tail remtxt)
    else SMDCodeBlock ctxt : markdownText txt

-- parse a markdown list item:
markdownItem :: (String -> SourceMDElem) -> Int -> String -> String
             -> [SourceMDElem]
markdownItem icons n itxt txt =
  if take n txt == take n (repeat ' ')
  then let (fstline,remtxt) = break (=='\n') (drop n txt)
       in if null remtxt
            then [icons (itxt++'\n':fstline)]
            else markdownItem icons n (itxt++'\n':fstline) (tail remtxt)
  else let (fstline,remtxt) = break (=='\n') txt
       in if all isSpace fstline
            then if null remtxt
                   then [icons itxt]
                   else markdownItem icons n (itxt++"\n") (tail remtxt)
            else icons itxt : markdownText txt

--- Remove the backlash of escaped markdown characters in a string.
removeEscapes :: String -> String
removeEscapes s = case s of
  []          -> []
  ('\\':c:cs) -> if c `elem` markdownEscapeChars
                   then c : removeEscapes cs
                   else '\\' : removeEscapes (c:cs)
  (c:cs)      -> c : removeEscapes cs

--- Escape characters supported by markdown.
markdownEscapeChars :: [Char]
markdownEscapeChars =
  ['\\','`','*','_','{','}','[',']','(',')','<','>','#','+','-','.',' ','!']

-- Analyze markdown text outside an element like emphasis, code, strong:
outsideMarkdownElem :: String -> String -> [SourceMDElem]
outsideMarkdownElem txt s = case s of
  [] -> addPrevious txt []
  ('\\':c:cs)  -> if c `elem` markdownEscapeChars
                    then outsideMarkdownElem (c:'\\':txt) cs
                    else outsideMarkdownElem ('\\':txt) (c:cs)
  ('*':'*':cs) -> addPrevious txt $ insideMarkdownElem "**" [] cs
  ('_':'_':cs) -> addPrevious txt $ insideMarkdownElem "__" [] cs
  ('*':cs)     -> addPrevious txt $ insideMarkdownElem "*" [] cs
  ('_':cs)     -> addPrevious txt $ insideMarkdownElem "_" [] cs
  ('`':cs) -> let (ticks, cs') = span (=='`') cs in
              addPrevious txt $ insideMarkdownElem
                                  (replicate (length ticks + 1) '`') [] cs'
  ('[':cs) -> addPrevious txt $ tryParseLink cs
  ('<':cs) -> addPrevious txt $ markdownHRef cs
  (c:cs)   -> outsideMarkdownElem (c:txt) cs

addPrevious :: String -> [SourceMDElem] -> [SourceMDElem]
addPrevious ptxt xs = if null ptxt then xs else SMDText (reverse ptxt) : xs

-- Try to parse a link of the form [link test](url)
tryParseLink :: String -> [SourceMDElem]
tryParseLink txt = let (linktxt,rtxt) = break (==']') txt in
  if null rtxt || null (tail rtxt) || (rtxt!!1 /= '(')
    then outsideMarkdownElem "[" txt
    else let (url,mtxt) = break (==')') (drop 2 rtxt)
         in if null mtxt
              then outsideMarkdownElem "[" txt
              else SMDHRef linktxt url : outsideMarkdownElem "" (tail mtxt)

markdownHRef :: String -> [SourceMDElem]
markdownHRef txt = let (url,rtxt) = break (=='>') txt in
  if null rtxt
    then outsideMarkdownElem "<" txt
    else SMDHRef url url : outsideMarkdownElem "" (dropFirst rtxt)

insideMarkdownElem :: String -> String -> String -> [SourceMDElem]
insideMarkdownElem marker etext s =
  if marker `isPrefixOf` s
    then text2MDElem marker (reverse etext)
          : outsideMarkdownElem "" (drop (length marker) s)
    else case s of
           []     -> [SMDText (marker ++ reverse etext)] -- end marker missing
           ('\\':c:cs) -> if c `elem` markdownEscapeChars
                            then insideMarkdownElem marker (c:'\\':etext) cs
                            else insideMarkdownElem marker ('\\':etext) (c:cs)
           (c:cs)      -> insideMarkdownElem marker (c:etext) cs

text2MDElem :: String -> String -> SourceMDElem
text2MDElem marker txt = case marker of
  "**"                   -> SMDStrong txt
  "__"                   -> SMDStrong txt
  "*"                    -> SMDEmph txt
  "_"                    -> SMDEmph txt
  _ | all (=='`') marker -> SMDCode txt
    | otherwise          -> error $ "Markdown.text2MDElem: unknown marker \"" ++
                                     marker ++ "\""


-----------------------------------------------------------------------
-- Translate markdown document to HTML.

mdDoc2html :: HTML h => MarkdownDoc -> [h]
mdDoc2html = map mdElem2html

-- translate markdown special characters in text to HTML
mdtxt2html :: HTML h => String -> h
mdtxt2html = htmlText . htmlQuote . removeEscapes

mdElem2html :: HTML h => MarkdownElem -> h
mdElem2html (Text s)      = mdtxt2html s
mdElem2html (Emph s)      = emphasize [mdtxt2html s]
mdElem2html (Strong s)    = htmlStruct "strong" [] [mdtxt2html s]
mdElem2html (HRef s url)  = if s==url
                              then href url [code [mdtxt2html s]]
                              else href url [mdtxt2html s]
mdElem2html (Code s)      = code [htmlText (htmlQuote s)]
mdElem2html (CodeBlock s) = verbatim s
mdElem2html (Quote md)    = htmlStruct "blockquote" [] (mdDoc2html md)
mdElem2html (Par md)      = par (mdDoc2html md)
mdElem2html (UList mds)   = ulist (map mdDoc2htmlWithoutPar mds)
mdElem2html (OList mds)   = olist (map mdDoc2htmlWithoutPar mds)
mdElem2html HRule         = hrule
mdElem2html (Header l s)  = htmlStruct ('h':show l) [] [mdtxt2html s]

mdDoc2htmlWithoutPar :: HTML h => MarkdownDoc -> [h]
mdDoc2htmlWithoutPar mdoc = case mdoc of
  []                    -> []
  [Par md]              -> mdDoc2html md
  [md]                  -> [mdElem2html md]
  (Par md1 : md2 : mds) -> mdDoc2html md1 ++ breakline :
                           mdDoc2htmlWithoutPar (md2:mds)
  (md1 : md2 : mds)     -> mdElem2html md1 : mdDoc2htmlWithoutPar (md2:mds)

--- Translate a markdown text into a (partial) HTML document.
markdownText2HTML :: HTML h => String -> [h]
markdownText2HTML = mdDoc2html . fromMarkdownText

--- Translate a markdown text into a complete HTML text
--- that can be viewed as a standalone document by a browser.
--- The first argument is the title of the document.
markdownText2CompleteHTML :: String -> String -> String
markdownText2CompleteHTML title mdtxt =
   showHtmlPage (page title (markdownText2HTML mdtxt))

-----------------------------------------------------------------------
--- Translate markdown document to a LaTeX string where the first
--- argument is a function to translate the basic text occurring
--- in markdown elements to a LaTeX string.
--- Note that the basic text (execept for code blocks)
--- contains escaped markdown characters
--- that needs also to be removed by the translation function.
mdDoc2latex :: (String->String) -> MarkdownDoc -> String
mdDoc2latex txt2latex = concatMap (mdElem2latex txt2latex)

mdElem2latex :: (String->String) -> MarkdownElem -> String
mdElem2latex txt2latex (Text s) = txt2latex s
mdElem2latex txt2latex (Emph s) = "\\emph{"++txt2latex s++"}"
mdElem2latex txt2latex (Strong s) = "\\textbf{"++txt2latex s++"}"
mdElem2latex txt2latex (HRef s url) =
  if s==url then "\\url{"++url++"}"
            else "\\href{"++url++"}{"++txt2latex s++"}"
mdElem2latex txt2latex (Code s) = "\\texttt{"++txt2latex (htmlQuote s)++"}"
mdElem2latex _ (CodeBlock s) =
  "\\begin{verbatim}\n"++s++"\n\\end{verbatim}\n"
mdElem2latex txt2latex (Quote md) =
  "\\begin{quote}\n"++mdDoc2latex txt2latex md++"\\end{quote}\n"
mdElem2latex txt2latex (Par md) = mdDoc2latex txt2latex md++"\n\n"
mdElem2latex txt2latex (UList s) =
  "\\begin{itemize}"++
    concatMap (\i -> "\n\\item\n"++mdDoc2latex txt2latex i) s ++
  "\\end{itemize}\n"
mdElem2latex txt2latex (OList s) =
  "\\begin{enumerate}"++
    concatMap (\i -> "\n\\item\n"++mdDoc2latex txt2latex i) s ++
  "\\end{enumerate}\n"
mdElem2latex _ HRule = "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n"
mdElem2latex txt2latex (Header l s) = case l of
  1 -> "\\section{"++txt2latex s++"}\n\n"
  2 -> "\\subsection{"++txt2latex s++"}\n\n"
  3 -> "\\subsubsection{"++txt2latex s++"}\n\n"
  4 -> "\\paragraph{"++txt2latex s++"}\n\n"
  5 -> "\\textbf{"++txt2latex s++"}\n\n"
  _ -> "\\emph{"++txt2latex s++"}\n\n"


--- Translator for basic text to LaTeX.
--- Markdown escapes are removed and translated to LaTeX.
text2latex :: String -> String
text2latex = showLatexExps . (\s -> [htxt s]) . removeEscapes

--- Translate a markdown text into a (partial) LaTeX document.
--- All characters with a special meaning in LaTeX, like dollar
--- or ampersand signs, are quoted.
markdownText2LaTeX :: String -> String
markdownText2LaTeX = mdDoc2latex text2latex . fromMarkdownText

--- Translate a markdown text into a (partial) LaTeX document
--- where the first argument is a function to translate the basic text
--- occurring in markdown elements to a LaTeX string.
--- For instance, one can use a translation operation
--- that supports passing mathematical formulas in LaTeX style
--- instead of quoting all special characters.
markdownText2LaTeXWithFormat :: (String->String) -> String -> String
markdownText2LaTeXWithFormat txt2latex = mdDoc2latex txt2latex . fromMarkdownText

--- Translate a markdown text into a complete LaTeX document
--- that can be formatted as a standalone document.
markdownText2CompleteLaTeX :: String -> String
markdownText2CompleteLaTeX mds =
  latexHeader ++ mdDoc2latex text2latex (fromMarkdownText mds) ++
  "\\end{document}\n"

latexHeader :: String
latexHeader = unwords
  [ "\\documentclass{article}"
  , "\\usepackage[utf8x]{inputenc}"
  , "\\usepackage{url}"
  , "\\usepackage[breaklinks=true,unicode=true]{hyperref}"
  , "\\setlength{\\parindent}{0pt}"
  , "\\setlength{\\parskip}{6pt plus 2pt minus 1pt}"
  , "\\setcounter{secnumdepth}{0}"
  , "\\begin{document}"
  ]

--- Format the standard input (containing markdown text) as PDF.
formatMarkdownInputAsPDF :: String -> IO ()
formatMarkdownInputAsPDF outfile = getContents >>= formatMarkdownAsPDF outfile

--- Format a file containing markdown text as PDF.
formatMarkdownFileAsPDF :: String -> String -> IO ()
formatMarkdownFileAsPDF infile outfile =
  readFile infile >>= formatMarkdownAsPDF outfile

--- Format a file containing markdown text as PDF.
formatMarkdownAsPDF :: String -> String -> IO ()
formatMarkdownAsPDF outfile mdstr = do
  pid <- getPID
  let tmp = "tmp_" ++ show pid
  writeFile (tmp ++ ".tex") (markdownText2CompleteLaTeX mdstr)
  pdflatexFile tmp outfile

-- Format a file tmp.tex with pdflatex and show the result
pdflatexFile :: String -> String -> IO ()
pdflatexFile tmp outfile = do
  system $ "pdflatex \'\\nonstopmode\\input{" ++ tmp ++ ".tex}\'"
  system $ unwords ["/bin/rm -f", tmp ++ ".tex", tmp ++ ".aux", tmp ++ ".log",
                    tmp ++ ".out"]
  if null outfile
    then do system $ "evince " ++ tmppdf
            system $ "/bin/rm -f " ++ tmppdf
    else system $ "/bin/mv -f " ++ tmppdf ++ " " ++ outfile
  return ()
 where
  tmppdf = tmp ++ ".pdf"

-----------------------------------------------------------------------
types:
MarkdownDoc MarkdownElem
unsafe:
safe