{-# LANGUAGE OverloadedStrings #-}
module Text.XmlHtml.XML.Parse where
import Control.Applicative
import Control.Monad
import Data.Char
import Data.List
import Data.Maybe
import Text.XmlHtml.Common
import Text.XmlHtml.TextParser
import qualified Text.Parsec as P
import Data.Map (Map)
import qualified Data.Map as M
import Data.Text (Text)
import qualified Data.Text as T
docFragment :: Encoding -> Parser Document
docFragment e = do
(dt, nodes1) <- prolog
nodes2 <- content
return $ XmlDocument e dt (nodes1 ++ nodes2)
whiteSpace :: Parser ()
whiteSpace = some (P.satisfy (`elem` [' ','\t','\r','\n'])) *> return ()
isNameStartChar :: Char -> Bool
isNameStartChar c | c == ':' = True
| c == '_' = True
| c >= 'a' && c <= 'z' = True
| c >= 'A' && c <= 'Z' = True
| c >= '\xc0' && c <= '\xd6' = True
| c >= '\xd8' && c <= '\xf6' = True
| c >= '\xf8' && c <= '\x2ff' = True
| c >= '\x370' && c <= '\x37d' = True
| c >= '\x37f' && c <= '\x1fff' = True
| c >= '\x200c' && c <= '\x200d' = True
| c >= '\x2070' && c <= '\x218f' = True
| c >= '\x2c00' && c <= '\x2fef' = True
| c >= '\x3001' && c <= '\xd7ff' = True
| c >= '\xf900' && c <= '\xfdcf' = True
| c >= '\xfdf0' && c <= '\xfffd' = True
| c >= '\x10000' && c <= '\xeffff' = True
| otherwise = False
isNameChar :: Char -> Bool
isNameChar c | isNameStartChar c = True
| c == '-' = True
| c == '.' = True
| c == '\xb7' = True
| c >= '0' && c <= '9' = True
| c >= '\x300' && c <= '\x36f' = True
| c >= '\x203f' && c <= '\x2040' = True
| otherwise = False
name :: Parser Text
name = do
c <- P.satisfy isNameStartChar
r <- takeWhile0 isNameChar
return $ T.cons c r
attrValue :: Parser Text
attrValue = fmap T.concat (singleQuoted <|> doubleQuoted)
where
singleQuoted = P.char '\'' *> refTill ['<','&','\''] <* P.char '\''
doubleQuoted = P.char '"' *> refTill ['<','&','"'] <* P.char '"'
refTill end = many (takeWhile1 (not . (`elem` end)) <|> reference)
systemLiteral :: Parser Text
systemLiteral = singleQuoted <|> doubleQuoted
where
singleQuoted = do
_ <- P.char '\''
x <- takeWhile0 (not . (== '\''))
_ <- P.char '\''
return x
doubleQuoted = do
_ <- P.char '\"'
x <- takeWhile0 (not . (== '\"'))
_ <- P.char '\"'
return x
pubIdLiteral :: Parser Text
pubIdLiteral = singleQuoted <|> doubleQuoted
where
singleQuoted = do
_ <- P.char '\''
x <- takeWhile0 (\c -> isPubIdChar c && c /= '\'')
_ <- P.char '\''
return x
doubleQuoted = do
_ <- P.char '\"'
x <- takeWhile0 isPubIdChar
_ <- P.char '\"'
return x
isPubIdChar :: Char -> Bool
isPubIdChar c | c >= 'a' && c <= 'z' = True
| c >= 'A' && c <= 'Z' = True
| c >= '0' && c <= '9' = True
| c `elem` otherChars = True
| otherwise = False
where
otherChars = " \r\n-\'()+,./:=?;!*#@$_%" :: [Char]
charData :: Parser Node
charData = TextNode <$> takeWhile1 (not . (`elem` ['<','&']))
comment :: Parser (Maybe Node)
comment = text "<!--" *> (Just <$> Comment <$> commentText) <* text "-->"
where
commentText = fmap T.concat $ many $
nonDash <|> P.try (T.cons <$> P.char '-' <*> nonDash)
nonDash = takeWhile1 (not . (== '-'))
processingInstruction :: Parser (Maybe Node)
processingInstruction = do
_ <- text "<?"
_ <- piTarget
_ <- emptyEnd <|> contentEnd
return Nothing
where
emptyEnd = P.try (P.string "?>")
contentEnd = P.try $ do
_ <- whiteSpace
P.manyTill P.anyChar (P.try $ text "?>")
piTarget :: Parser ()
piTarget = do
n <- name
when (T.map toLower n == "xml") $ fail "xml declaration can't occur here"
cdata :: [Char] -> Parser a -> Parser Node
cdata cs end = TextNode <$> T.concat <$> P.manyTill part end
where part = takeWhile1 (not . (`elem` cs))
<|> T.singleton <$> P.anyChar
cdSect :: Parser (Maybe Node)
cdSect = Just <$> do
_ <- text "<![CDATA["
cdata "]" (text "]]>")
prolog :: Parser (Maybe DocType, [Node])
prolog = do
_ <- optional xmlDecl
nodes1 <- many misc
rest <- optional $ do
dt <- docTypeDecl
nodes2 <- many misc
return (dt, nodes2)
case rest of
Nothing -> return (Nothing, catMaybes nodes1)
Just (dt, nodes2) -> return (Just dt, catMaybes (nodes1 ++ nodes2))
xmlDecl :: Parser (Maybe Text)
xmlDecl = do
_ <- text "<?xml"
_ <- versionInfo
e <- optional encodingDecl
_ <- optional sdDecl
_ <- optional whiteSpace
_ <- text "?>"
return e
versionInfo :: Parser ()
versionInfo = do
whiteSpace *> text "version" *> eq *> (singleQuoted <|> doubleQuoted)
where
singleQuoted = P.char '\'' *> versionNum <* P.char '\''
doubleQuoted = P.char '\"' *> versionNum <* P.char '\"'
versionNum = do
_ <- text "1."
_ <- some (P.satisfy (\c -> c >= '0' && c <= '9'))
return ()
eq :: Parser ()
eq = optional whiteSpace *> P.char '=' *> optional whiteSpace *> return ()
misc :: Parser (Maybe Node)
misc = comment <|> processingInstruction <|> (whiteSpace *> return Nothing)
docTypeDecl :: Parser DocType
docTypeDecl = do
_ <- text "<!DOCTYPE"
whiteSpace
tag <- name
_ <- optional whiteSpace
extid <- externalID
_ <- optional whiteSpace
intsub <- internalDoctype
_ <- P.char '>'
return (DocType tag extid intsub)
data InternalDoctypeState = IDSStart
| IDSScanning Int
| IDSInQuote Int Char
| IDSCommentS1 Int
| IDSCommentS2 Int
| IDSCommentS3 Int
| IDSComment Int
| IDSCommentD1 Int
| IDSCommentE1 Int
internalDoctype :: Parser InternalSubset
internalDoctype = InternalText <$> T.pack <$> scanText (dfa IDSStart)
<|> return NoInternalSubset
where dfa IDSStart '[' = ScanNext (dfa (IDSScanning 0))
dfa IDSStart _ = ScanFail "Not a DOCTYPE internal subset"
dfa (IDSInQuote n c) d
| c == d = ScanNext (dfa (IDSScanning n))
| otherwise = ScanNext (dfa (IDSInQuote n c))
dfa (IDSScanning n) '[' = ScanNext (dfa (IDSScanning (n+1)))
dfa (IDSScanning 0) ']' = ScanFinish
dfa (IDSScanning n) ']' = ScanNext (dfa (IDSScanning (n-1)))
dfa (IDSScanning n) '\'' = ScanNext (dfa (IDSInQuote n '\''))
dfa (IDSScanning n) '\"' = ScanNext (dfa (IDSInQuote n '\"'))
dfa (IDSScanning n) '<' = ScanNext (dfa (IDSCommentS1 n))
dfa (IDSScanning n) _ = ScanNext (dfa (IDSScanning n))
dfa (IDSCommentS1 n) '[' = ScanNext (dfa (IDSScanning (n+1)))
dfa (IDSCommentS1 0) ']' = ScanFinish
dfa (IDSCommentS1 n) ']' = ScanNext (dfa (IDSScanning (n-1)))
dfa (IDSCommentS1 n) '\'' = ScanNext (dfa (IDSInQuote n '\''))
dfa (IDSCommentS1 n) '\"' = ScanNext (dfa (IDSInQuote n '\"'))
dfa (IDSCommentS1 n) '!' = ScanNext (dfa (IDSCommentS2 n))
dfa (IDSCommentS1 n) _ = ScanNext (dfa (IDSScanning n))
dfa (IDSCommentS2 n) '[' = ScanNext (dfa (IDSScanning (n+1)))
dfa (IDSCommentS2 0) ']' = ScanFinish
dfa (IDSCommentS2 n) ']' = ScanNext (dfa (IDSScanning (n-1)))
dfa (IDSCommentS2 n) '\'' = ScanNext (dfa (IDSInQuote n '\''))
dfa (IDSCommentS2 n) '\"' = ScanNext (dfa (IDSInQuote n '\"'))
dfa (IDSCommentS2 n) '-' = ScanNext (dfa (IDSCommentS3 n))
dfa (IDSCommentS2 n) _ = ScanNext (dfa (IDSScanning n))
dfa (IDSCommentS3 n) '[' = ScanNext (dfa (IDSScanning (n+1)))
dfa (IDSCommentS3 0) ']' = ScanFinish
dfa (IDSCommentS3 n) ']' = ScanNext (dfa (IDSScanning (n-1)))
dfa (IDSCommentS3 n) '\'' = ScanNext (dfa (IDSInQuote n '\''))
dfa (IDSCommentS3 n) '\"' = ScanNext (dfa (IDSInQuote n '\"'))
dfa (IDSCommentS3 n) '-' = ScanNext (dfa (IDSComment n))
dfa (IDSCommentS3 n) _ = ScanNext (dfa (IDSScanning n))
dfa (IDSComment n) '-' = ScanNext (dfa (IDSCommentD1 n))
dfa (IDSComment n) _ = ScanNext (dfa (IDSComment n))
dfa (IDSCommentD1 n) '-' = ScanNext (dfa (IDSCommentE1 n))
dfa (IDSCommentD1 n) _ = ScanNext (dfa (IDSComment n))
dfa (IDSCommentE1 n) '>' = ScanNext (dfa (IDSScanning n))
dfa (IDSCommentE1 _) _ = ScanFail "Poorly formatted comment"
sdDecl :: Parser ()
sdDecl = do
_ <- P.try $ whiteSpace *> text "standalone"
eq
_ <- single <|> double
return ()
where
single = P.char '\'' *> yesno <* P.char '\''
double = P.char '\"' *> yesno <* P.char '\"'
yesno = text "yes" <|> text "no"
element :: Parser Node
element = do
(t,a,b) <- emptyOrStartTag
if b then return (Element t a [])
else nonEmptyElem t a
where
nonEmptyElem t a = do
c <- content
endTag t
return (Element t a c)
emptyOrStartTag :: Parser (Text, [(Text, Text)], Bool)
emptyOrStartTag = do
t <- P.try $ P.char '<' *> name
a <- many $ P.try $ do
whiteSpace
attribute
when (hasDups a) $ fail "Duplicate attribute names in element"
_ <- optional whiteSpace
e <- optional (P.char '/')
_ <- P.char '>'
return (t, a, isJust e)
where
hasDups a = length (nub (map fst a)) < length a
attribute :: Parser (Text, Text)
attribute = do
n <- name
eq
v <- attrValue
return (n,v)
endTag :: Text -> Parser ()
endTag s = do
_ <- text "</"
t <- name
when (s /= t) $ fail $ "mismatched tags: </" ++ T.unpack t ++
"> found inside <" ++ T.unpack s ++ "> tag"
_ <- optional whiteSpace
_ <- text ">"
return ()
content :: Parser [Node]
content = do
n <- optional charData
ns <- fmap concat $ many $ do
s <- ((Just <$> TextNode <$> reference)
<|> cdSect
<|> processingInstruction
<|> comment
<|> fmap Just element)
t <- optional charData
return [s,t]
return $ coalesceText $ catMaybes (n:ns)
where
coalesceText (TextNode s : TextNode t : ns)
= coalesceText (TextNode (T.append s t) : ns)
coalesceText (n:ns)
= n : coalesceText ns
coalesceText []
= []
charRef :: Parser Text
charRef = hexCharRef <|> decCharRef
where
decCharRef = do
_ <- text "&#"
ds <- some digit
_ <- P.char ';'
let c = chr $ foldl' (\a b -> 10 * a + b) 0 ds
when (not (isValidChar c)) $ fail $
"Reference is not a valid character"
return $ T.singleton c
where
digit = do
d <- P.satisfy (\c -> c >= '0' && c <= '9')
return (ord d - ord '0')
hexCharRef = do
_ <- text "&#x"
ds <- some digit
_ <- P.char ';'
let c = chr $ foldl' (\a b -> 16 * a + b) 0 ds
when (not (isValidChar c)) $ fail $
"Reference is not a valid character"
return $ T.singleton c
where
digit = num <|> upper <|> lower
num = do
d <- P.satisfy (\c -> c >= '0' && c <= '9')
return (ord d - ord '0')
upper = do
d <- P.satisfy (\c -> c >= 'A' && c <= 'F')
return (10 + ord d - ord 'A')
lower = do
d <- P.satisfy (\c -> c >= 'a' && c <= 'f')
return (10 + ord d - ord 'a')
reference :: Parser Text
reference = charRef <|> entityRef
entityRef :: Parser Text
entityRef = do
_ <- P.char '&'
n <- name
_ <- P.char ';'
case M.lookup n entityRefLookup of
Nothing -> fail $ "Unknown entity reference: " ++ T.unpack n
Just t -> return t
where
entityRefLookup :: Map Text Text
entityRefLookup = M.fromList [
("amp", "&"),
("lt", "<"),
("gt", ">"),
("apos", "\'"),
("quot", "\"")
]
externalID :: Parser ExternalID
externalID = systemID <|> publicID <|> return NoExternalID
where
systemID = do
_ <- text "SYSTEM"
whiteSpace
fmap System systemLiteral
publicID = do
_ <- text "PUBLIC"
whiteSpace
pid <- pubIdLiteral
whiteSpace
sid <- systemLiteral
return (Public pid sid)
encodingDecl :: Parser Text
encodingDecl = do
_ <- P.try $ whiteSpace *> text "encoding"
_ <- eq
singleQuoted <|> doubleQuoted
where
singleQuoted = P.char '\'' *> encName <* P.char '\''
doubleQuoted = P.char '\"' *> encName <* P.char '\"'
encName = do
c <- P.satisfy isEncStart
cs <- takeWhile0 isEnc
return (T.cons c cs)
isEncStart c | c >= 'A' && c <= 'Z' = True
| c >= 'a' && c <= 'z' = True
| otherwise = False
isEnc c | c >= 'A' && c <= 'Z' = True
| c >= 'a' && c <= 'z' = True
| c >= '0' && c <= '9' = True
| c `elem` ['.','_','-'] = True
| otherwise = False