{-
html2rst.hs - converts html to reStructuredText 
John MacFarlane
20 Jan 2006

compile with GHC using 
Haskell XML Toolbox http://www.fh-wedel.de/~si/HXmlToolbox/

usage:
html2rst <URL>
-}

module Main
where
 
import Text.XML.HXT.Parser
import Text.XML.HXT.DOM.Unicode
import System
import Data.Char

main :: IO ()
main
   = do
     argv <- getArgs
     (input, readErrs, rc) <- getXmlDocument [(a_parse_html, "1"), (a_remove_whitespace, "1")] (inputFile argv)
     if rc >= c_err
       then issueErrors readErrs
       else if (outputFile argv) == "-" 
            then putStr (unicodeToUtf8 (processTree input))
            else writeFile (outputFile argv) (unicodeToUtf8 (processTree input))

inputFile :: [String] -> String
inputFile argv = if (length argv) >= 1 
    then argv !! 0
    else "-"

outputFile :: [String] -> String
outputFile argv = if (length argv) >= 2 
    then argv !! 1
    else "-"

issueErrors :: XmlTrees -> IO ()
issueErrors [NTree (XError _ x) _] = print x 
issueErrors x = print x

indent :: Int -> String -> String
indent _ "" = ""
indent spaces (c:cs) = if c == '\n' then "\n" ++ (replicate (spaces - 1) ' ') ++ (indent (spaces) cs) else c:(indent spaces cs)

wrap :: Int -> Int -> Int -> [String] -> String
wrap _ _ _ [] = ""
wrap leftm rightm starting (w:ws) = if ((starting + length w) < rightm)
     then w ++ " " ++ (wrap leftm rightm (starting + length w) ws)
     else "\n" ++ (replicate (leftm - 1) ' ') ++ w ++ " " ++ (wrap leftm rightm (leftm + length w) ws)                      

underline :: String -> Char -> String
underline str ch = str ++ "\n" ++ (replicate (length str) ch ++ "\n\n")

replaceAtBeginLine :: Char -> String -> String -> String
replaceAtBeginLine old new str = unlines (map (replaceAt1 old new) (lines str))

replaceAt1 :: Char -> String -> String -> String
replaceAt1 old new []   = []
replaceAt1 old new (x:xs) = if (x == old) 
                           then (new ++ xs)
                           else (x:xs)

processImage :: [XmlTree] -> String 
processImage [] = ""
processImage [NTree (XAttr (QN {namePrefix = _, localPart = "src", namespaceUri = _})) [NTree (XText imagesrc) []]] =  "\\ |" ++ imagesrc ++ "|\\ "
processImage (x:xs) = processImage xs  

barredText :: String -> [String]
-- returns list of bits of text between vertical bars |text|
barredText [] = []
barredText xs = barred:(barredText rest)
                where (barred, rest) = nextBarredBit xs

nextBarredBit :: String -> (String, String)
-- returns next bit of text between vertical bars and rest of string
nextBarredBit [] = ([],[])
nextBarredBit (x:xs) = if (x == '|') then (barred, tail remainder)
                                     else nextBarredBit xs
                                     where (barred, remainder) = span (/= '|') xs


makeImageLink :: String -> String
makeImageLink "" = ""
makeImageLink str = ".. |" ++ str ++ "| image:: " ++ str

imageLinks :: String -> String
imageLinks str = unlines (map makeImageLink (barredText str))

format :: String -> String -> String
format tagtype str = 
     case tagtype of 
        "p" -> (wrap 1 60 1 (words str)) ++ "\n\n"
        "blockquote" -> (indent 4 ("\n" ++ wrap 1 60 1 (words str))) ++ "\n\n"
        "br" -> "\n"
        "div" -> str ++ "\n\n"
        "table" -> "\n" ++ str ++ "\n\n"
        "tr" -> str
        "th" -> str ++ "\n"
        "td" -> str ++ "\n"
        "em" -> "*" ++ str ++ "*"
        "i"  -> "*" ++ str ++ "*"
        "b"  -> "**" ++ str ++ "**"
        "strong" -> "**" ++ str ++ "**"
        "tt" -> "``" ++ str ++ "``"
        "title" -> (underline str '=')
        "h1" -> (underline str '-') 
        "h2" -> (underline str '~') 
        "h3" -> (underline str '+') 
        "h4" -> (underline str '^')
        "h5" -> (underline str '#')
        "li" -> ", " ++ (indent 4 (dropWhile isSpace str)) ++ "\n" 
        "ul" -> (indent 4 ("\n" ++ replaceAtBeginLine ',' "- " str)) ++ "\n"
        "ol" -> (indent 4 ("\n" ++ replaceAtBeginLine ',' "#." str)) ++ "\n"
        "dl" -> "\n" ++ str ++ "\n"
        "dt" -> str ++ "\n"
        "dd" -> wrap 4 60 1 (words str) ++ "\n"
        "pre" -> "::\n\n" ++ (indent 3 str) ++ "\n"
        "body" -> str ++ "\n\n" ++ imageLinks str 
        _ -> str

processTree :: XmlTree -> String

processTree (NTree (XTag (QN {namePrefix = _, localPart = "a", namespaceUri = _}) reflst) textlst) = 
    if concat (map processTree reflst) == "" 
        then concat (map processTree textlst)
        else "`" ++ concat (map processTree textlst) ++ " <" ++ concat (map processTree reflst) ++ ">`_ "

processTree (NTree (XAttr (QN {namePrefix = _, localPart = "href", namespaceUri = _})) treelst) = concat (map processTree treelst)

processTree (NTree (XText string) trees) = string 

processTree (NTree (XTag (QN {namePrefix = _, localPart = "img", namespaceUri = _}) subtree) trees) = processImage subtree

processTree (NTree (XTag (QN {namePrefix = _, localPart = tagtype, namespaceUri = _}) subtree) trees) = format tagtype (concat (map processTree trees))

processTree _ = ""


