-- Illustrating the incremental XML processing that nevertheless
-- always detects ill-formed XML, and, optionally, invalid XML
-- An XML document is transformed to an XMLStream and then to
-- the stream of (Key,Value) pairs.
-- We look up a Value with a given key. Even though the final
-- iteratee finishes as soon as it locates the key, we (intentionally!)
-- fully parse the document nevertheless, to detect and report
-- ill-formedness errors.
-- The example demonstrates that it is possible to detect ill-formedness
-- errors and process the document without loading it all in memory
-- first (using as little memory as needed to hold the state of the
-- final processor -- just a single Value in our example).
--
-- This code also illustrates the library of parsing combinators, which
-- represent the element structure (`DTD').
module XMLookup where
import XMLIter
import IterateeM
import Prelude hiding (head)
import qualified Data.Map as M
import Control.Monad.Trans
import Control.Monad.Identity -- for tests
-- An XML document representing a finite map: a collection
-- of key-value pairs
-- (The document is as ugly as typical for xml, cf. fontconfig)
-- This document does NOT have the end tag for the
-- root element
xmldoc_trunc = unlines [
""]
-- This document is both well-formed and valid
xmldoc_full_ok = xmldoc_trunc ++ unlines [
"4v4",
""]
-- testing the stream
run_from_str :: Doc ->
Iteratee (XMLStream ()) Identity a -> Either ErrorMsg a
run_from_str docstr iter = runIdentity $
run . en_handle show =<<
enum_pure_1chunk docstr .| xml_enum default_handlers .| xml_normalize iter
-- First we check the XMLStream produced by the XML parser
tstream_xml1 = run_from_str xmldoc_trunc stream2list
-- Left "XML [43] (EOF) broken for element map\nStream after the error:\n\"\""
tstream_xml2 = run_from_str xmldoc_full_bad stream2list
{-
Right [XMLStart map (fromList []) (fromList []),
XMLStart kv (fromList []) (fromList []),
XMLStart key (fromList []) (fromList []),XMLString "1" "",
XMLEnd key (fromList []) (fromList []),
XMLStart value (fromList []) (fromList []),XMLString "v1" "",
XMLEnd value (fromList []) (fromList []),
XMLEnd kv (fromList []) (fromList []),
XMLStart kv (fromList []) (fromList []),
...
XMLEnd map (fromList []) (fromList [])]
-}
type Key = Int
type Value = String
-- An enumeratee that converts the XMLStream to the stream
-- of key-value pairs. The enumeratee relies on a simple
-- combinator library to parse the XML stream.
kv_enum :: Monad m =>
Enumeratee (XMLStream ()) (Key,Value) m a
kv_enum = parse_element (name "map") map_parser
where
map_parser n iter =
parse_many (name "kv") kv1 iter >>=
finish_element n
-- Parse the body of one "kv" element
kv1 n iter = do
key <- parse_element (name "key") (\n _ -> as_readable n) 0
val <- parse_element (name "value") (\n _ -> as_str n) ""
yield_to iter (key,val) >>= finish_element n
yield_to :: Monad m => Iteratee e m a -> e -> Iteratee eo m (Iteratee e m a)
yield_to (IE_cont Nothing k) v = lift (feedI k (Chunk [v]))
yield_to iter _ = return iter
-- We test the enumeratee kv_enum
tstream_kv1 = run_from_str xmldoc_trunc (id .| kv_enum stream2list)
-- Left "XML [43] (EOF) broken for element map\nStream after the error:\n\"\""
tstream_kv2 = run_from_str xmldoc_full_bad (id .| kv_enum stream2list)
{-
Left "Reading failure\nStream after the error:\n
[XMLStart value (fromList []) (fromList []),XMLString \"v3\" \"\",....
-}
tstream_kv3 = run_from_str xmldoc_full_ok (id .| kv_enum stream2list)
-- Right [(1,"v1"),(2,"v2"),(4,"v4")]
-- Finally, we write an iteratee to
-- lookup in the key-value stream
lkup :: Monad m => Key -> Iteratee (Key,Value) m (Maybe Value)
lkup key = headM >>= check
where
check (Just (k,v)) | k == key = return (Just v) -- premature termination
check Nothing = return Nothing
check _ = lkup key
-- Putting it all together
xml_lookup :: Monad m => Key -> Iteratee Char m (Maybe Value)
xml_lookup key = id .| xml_enum default_handlers .| xml_normalize
.| kv_enum (lkup key)
type Doc = String -- for the purpose of the example,
-- the document comes from a String
type ErrorMsg = String
run_test :: Key -> Doc -> Either ErrorMsg (Maybe Value)
run_test key str =
runIdentity $ run . en_handle show =<<
enum_pure_1chunk str (xml_lookup key)
-- Both well-formedness and validation errors are always reported,
-- even if the desired key has been located early in the stream.
test_trunc = run_test 1 xmldoc_trunc
-- Left "XML [43] (EOF) broken for element map\nStream after the error:\n\"\""
test_full1b = run_test 1 xmldoc_full_bad
-- Left "Reading failure...
test_full1 = run_test 2 xmldoc_full_ok
-- Right (Just "v2")
test_full2 = run_test 10 xmldoc_full_ok
-- Right Nothing
-- Now we wish to stop the validation after we found the element
-- with a given key. We still wish to continue checking for
-- well-formedness.
kv_enum_pterm :: Monad m =>
Enumeratee (XMLStream ()) (Key,Value) m a
kv_enum_pterm = parse_element (name "map") map_parser
where
map_parser n iter =
parse_many (name "kv") kv1 iter >>=
finish_element n
-- Parse the body of one "kv" element, stopping early
-- We could've used exceptions
kv1 n iter@(IE_cont Nothing _) = do
key <- parse_element (name "key") (\n _ -> as_readable n) 0
val <- parse_element (name "value") (\n _ -> as_str n) ""
yield_to iter (key,val) >>= finish_element n
kv1 n iter = finish_element_early n iter
xml_lookup_pterm :: Monad m => Key -> Iteratee Char m (Maybe Value)
xml_lookup_pterm key = id .| xml_enum default_handlers .| xml_normalize
.| kv_enum_pterm (lkup key)
run_test_pterm :: Key -> Doc -> Either ErrorMsg (Maybe Value)
run_test_pterm key str =
runIdentity $ run . en_handle show =<<
enum_pure_1chunk str (xml_lookup_pterm key)
test_trunc_pterm = run_test_pterm 1 xmldoc_trunc
-- Left "XML [43] (EOF) broken for element map\nStream after the error:\n\"\""
-- If the lookup succeeds before encountering the element with the
-- bad key, we now return the lookup result rather than failure.
test_full1b_pterm = run_test_pterm 1 xmldoc_full_bad
-- Right (Just "v1")
test_full2b_pterm = run_test_pterm 2 xmldoc_full_bad
-- Right (Just "v2")
test_full3b_pterm = run_test_pterm 3 xmldoc_full_bad
-- Left "Reading failure
-- ------------------------------------------------------------------------
-- Utility parsers
-- Small parser combinator library for XMLStream
-- Parser for a body of an element (when its start tag has been read)
-- This is like a function to fold over
-- The body parser must handle the end tag for QNameR
type BodyParser m a = QNameR -> a -> Iteratee (XMLStream ()) m a
-- Check that the stream contains the start of the given element
-- Parse the element with a given body parser.
-- (This function is just like a fold)
parse_element :: Monad m =>
QNameR -> BodyParser m a -> a -> Iteratee (XMLStream ()) m a
parse_element n h acc = head >>= \x -> case x of
(XMLStart n' attrs _) | n == n' -> h n acc
x -> parser_error $
"Expected the start of " ++ show n ++ " found: " ++ show x
-- Like a fold, but it is an accumulating fold
parse_many :: Monad m =>
QNameR -> BodyParser m a -> a -> Iteratee (XMLStream ()) m a
parse_many n body acc = peek >>= loop acc
where
loop acc (Just (XMLStart n' _ _)) | n == n' = do
head
acc <- body n acc
peek >>= loop acc
loop acc _ = return acc
-- Get the character content of an element
-- (whose name is passed as the first argument)
get_char_content :: Monad m =>
QNameR -> Iteratee (XMLStream ()) m (String,AttList QNameR)
get_char_content n = loop ""
where
loop str = head >>= \x -> case x of
XMLString str' "" | str == "" -> loop str'
XMLEnd n' attrs _ | n' == n -> return (str,attrs)
x -> parser_error $ "Unexpected in XML Stream for char content of element "
++ show n ++ ": " ++ show x
as_str :: Monad m => QNameR -> Iteratee (XMLStream ()) m String
as_str n = fst `liftM` get_char_content n
as_readable :: (Read a, Monad m) => QNameR -> Iteratee (XMLStream ()) m a
as_readable n = as_str n >>= check . reads
where
check [(v,"")] = return v
check _ = parser_error $ "Reading failure"
-- All our names are in the empty namespace
name :: NCName -> QNameR
name str = QNameR "" str
-- Check to see the stream ends the element with a given name.
-- In this example, we ignore all attributes
-- In the other code, we extract the attributes from XMLEnd and add
-- them to the state.
finish_element :: Monad m => BodyParser m a
finish_element n acc = head >>= \x -> case x of
(XMLEnd n' attrs _) | n == n' -> return acc
x -> parser_error $
"Unexpected while looking for the end of " ++ show n ++
": " ++ show x
-- Skip the stream until found the end of the given element
finish_element_early :: Monad m => BodyParser m a
finish_element_early n acc = IterateeM.dropWhile test >> head >> return acc
where
test (XMLEnd n' attrs _) | n == n' = False
test _ = True