-- Illustrating the incremental XML processing that nevertheless -- always detects ill-formed XML, and, optionally, invalid XML -- An XML document is transformed to an XMLStream and then to -- the stream of (Key,Value) pairs. -- We look up a Value with a given key. Even though the final -- iteratee finishes as soon as it locates the key, we (intentionally!) -- fully parse the document nevertheless, to detect and report -- ill-formedness errors. -- The example demonstrates that it is possible to detect ill-formedness -- errors and process the document without loading it all in memory -- first (using as little memory as needed to hold the state of the -- final processor -- just a single Value in our example). -- -- This code also illustrates the library of parsing combinators, which -- represent the element structure (`DTD'). module XMLookup where import XMLIter import IterateeM import Prelude hiding (head) import qualified Data.Map as M import Control.Monad.Trans import Control.Monad.Identity -- for tests -- An XML document representing a finite map: a collection -- of key-value pairs -- (The document is as ugly as typical for xml, cf. fontconfig) -- This document does NOT have the end tag for the -- root element xmldoc_trunc = unlines [ "", "1v1", "2v2" ] -- This document adds the closing tag and is hence well-formed -- It has a 'bad' key. The document is still well-formed, -- but it is not valid (it does not match the intended content model: -- the key must be parseable as an Int) xmldoc_full_bad = xmldoc_trunc ++ unlines [ "badv3", "4v4", ""] -- This document is both well-formed and valid xmldoc_full_ok = xmldoc_trunc ++ unlines [ "4v4", ""] -- testing the stream run_from_str :: Doc -> Iteratee (XMLStream ()) Identity a -> Either ErrorMsg a run_from_str docstr iter = runIdentity $ run . en_handle show =<< enum_pure_1chunk docstr .| xml_enum default_handlers .| xml_normalize iter -- First we check the XMLStream produced by the XML parser tstream_xml1 = run_from_str xmldoc_trunc stream2list -- Left "XML [43] (EOF) broken for element map\nStream after the error:\n\"\"" tstream_xml2 = run_from_str xmldoc_full_bad stream2list {- Right [XMLStart map (fromList []) (fromList []), XMLStart kv (fromList []) (fromList []), XMLStart key (fromList []) (fromList []),XMLString "1" "", XMLEnd key (fromList []) (fromList []), XMLStart value (fromList []) (fromList []),XMLString "v1" "", XMLEnd value (fromList []) (fromList []), XMLEnd kv (fromList []) (fromList []), XMLStart kv (fromList []) (fromList []), ... XMLEnd map (fromList []) (fromList [])] -} type Key = Int type Value = String -- An enumeratee that converts the XMLStream to the stream -- of key-value pairs. The enumeratee relies on a simple -- combinator library to parse the XML stream. kv_enum :: Monad m => Enumeratee (XMLStream ()) (Key,Value) m a kv_enum = parse_element (name "map") map_parser where map_parser n iter = parse_many (name "kv") kv1 iter >>= finish_element n -- Parse the body of one "kv" element kv1 n iter = do key <- parse_element (name "key") (\n _ -> as_readable n) 0 val <- parse_element (name "value") (\n _ -> as_str n) "" yield_to iter (key,val) >>= finish_element n yield_to :: Monad m => Iteratee e m a -> e -> Iteratee eo m (Iteratee e m a) yield_to (IE_cont Nothing k) v = lift (feedI k (Chunk [v])) yield_to iter _ = return iter -- We test the enumeratee kv_enum tstream_kv1 = run_from_str xmldoc_trunc (id .| kv_enum stream2list) -- Left "XML [43] (EOF) broken for element map\nStream after the error:\n\"\"" tstream_kv2 = run_from_str xmldoc_full_bad (id .| kv_enum stream2list) {- Left "Reading failure\nStream after the error:\n [XMLStart value (fromList []) (fromList []),XMLString \"v3\" \"\",.... -} tstream_kv3 = run_from_str xmldoc_full_ok (id .| kv_enum stream2list) -- Right [(1,"v1"),(2,"v2"),(4,"v4")] -- Finally, we write an iteratee to -- lookup in the key-value stream lkup :: Monad m => Key -> Iteratee (Key,Value) m (Maybe Value) lkup key = headM >>= check where check (Just (k,v)) | k == key = return (Just v) -- premature termination check Nothing = return Nothing check _ = lkup key -- Putting it all together xml_lookup :: Monad m => Key -> Iteratee Char m (Maybe Value) xml_lookup key = id .| xml_enum default_handlers .| xml_normalize .| kv_enum (lkup key) type Doc = String -- for the purpose of the example, -- the document comes from a String type ErrorMsg = String run_test :: Key -> Doc -> Either ErrorMsg (Maybe Value) run_test key str = runIdentity $ run . en_handle show =<< enum_pure_1chunk str (xml_lookup key) -- Both well-formedness and validation errors are always reported, -- even if the desired key has been located early in the stream. test_trunc = run_test 1 xmldoc_trunc -- Left "XML [43] (EOF) broken for element map\nStream after the error:\n\"\"" test_full1b = run_test 1 xmldoc_full_bad -- Left "Reading failure... test_full1 = run_test 2 xmldoc_full_ok -- Right (Just "v2") test_full2 = run_test 10 xmldoc_full_ok -- Right Nothing -- Now we wish to stop the validation after we found the element -- with a given key. We still wish to continue checking for -- well-formedness. kv_enum_pterm :: Monad m => Enumeratee (XMLStream ()) (Key,Value) m a kv_enum_pterm = parse_element (name "map") map_parser where map_parser n iter = parse_many (name "kv") kv1 iter >>= finish_element n -- Parse the body of one "kv" element, stopping early -- We could've used exceptions kv1 n iter@(IE_cont Nothing _) = do key <- parse_element (name "key") (\n _ -> as_readable n) 0 val <- parse_element (name "value") (\n _ -> as_str n) "" yield_to iter (key,val) >>= finish_element n kv1 n iter = finish_element_early n iter xml_lookup_pterm :: Monad m => Key -> Iteratee Char m (Maybe Value) xml_lookup_pterm key = id .| xml_enum default_handlers .| xml_normalize .| kv_enum_pterm (lkup key) run_test_pterm :: Key -> Doc -> Either ErrorMsg (Maybe Value) run_test_pterm key str = runIdentity $ run . en_handle show =<< enum_pure_1chunk str (xml_lookup_pterm key) test_trunc_pterm = run_test_pterm 1 xmldoc_trunc -- Left "XML [43] (EOF) broken for element map\nStream after the error:\n\"\"" -- If the lookup succeeds before encountering the element with the -- bad key, we now return the lookup result rather than failure. test_full1b_pterm = run_test_pterm 1 xmldoc_full_bad -- Right (Just "v1") test_full2b_pterm = run_test_pterm 2 xmldoc_full_bad -- Right (Just "v2") test_full3b_pterm = run_test_pterm 3 xmldoc_full_bad -- Left "Reading failure -- ------------------------------------------------------------------------ -- Utility parsers -- Small parser combinator library for XMLStream -- Parser for a body of an element (when its start tag has been read) -- This is like a function to fold over -- The body parser must handle the end tag for QNameR type BodyParser m a = QNameR -> a -> Iteratee (XMLStream ()) m a -- Check that the stream contains the start of the given element -- Parse the element with a given body parser. -- (This function is just like a fold) parse_element :: Monad m => QNameR -> BodyParser m a -> a -> Iteratee (XMLStream ()) m a parse_element n h acc = head >>= \x -> case x of (XMLStart n' attrs _) | n == n' -> h n acc x -> parser_error $ "Expected the start of " ++ show n ++ " found: " ++ show x -- Like a fold, but it is an accumulating fold parse_many :: Monad m => QNameR -> BodyParser m a -> a -> Iteratee (XMLStream ()) m a parse_many n body acc = peek >>= loop acc where loop acc (Just (XMLStart n' _ _)) | n == n' = do head acc <- body n acc peek >>= loop acc loop acc _ = return acc -- Get the character content of an element -- (whose name is passed as the first argument) get_char_content :: Monad m => QNameR -> Iteratee (XMLStream ()) m (String,AttList QNameR) get_char_content n = loop "" where loop str = head >>= \x -> case x of XMLString str' "" | str == "" -> loop str' XMLEnd n' attrs _ | n' == n -> return (str,attrs) x -> parser_error $ "Unexpected in XML Stream for char content of element " ++ show n ++ ": " ++ show x as_str :: Monad m => QNameR -> Iteratee (XMLStream ()) m String as_str n = fst `liftM` get_char_content n as_readable :: (Read a, Monad m) => QNameR -> Iteratee (XMLStream ()) m a as_readable n = as_str n >>= check . reads where check [(v,"")] = return v check _ = parser_error $ "Reading failure" -- All our names are in the empty namespace name :: NCName -> QNameR name str = QNameR "" str -- Check to see the stream ends the element with a given name. -- In this example, we ignore all attributes -- In the other code, we extract the attributes from XMLEnd and add -- them to the state. finish_element :: Monad m => BodyParser m a finish_element n acc = head >>= \x -> case x of (XMLEnd n' attrs _) | n == n' -> return acc x -> parser_error $ "Unexpected while looking for the end of " ++ show n ++ ": " ++ show x -- Skip the stream until found the end of the given element finish_element_early :: Monad m => BodyParser m a finish_element_early n acc = IterateeM.dropWhile test >> head >> return acc where test (XMLEnd n' attrs _) | n == n' = False test _ = True