-- Deriving Pure Iteratees module IterDeriv where import Control.Monad -- We start with a simple IO program: read lines from the standard -- input until the empty line is read. -- A line is a sequence of non-newline characters. -- We write the program in C-style, using the standard Haskell -- function getChar, or the non-exceptional version of it below. -- The function getchar is like the one familiar from C, returning -- a char or the EOF indicator. -- In Haskell, we can model such `lifted' characters properly, as -- Maybe Char type LChar = Maybe Char -- lifted character -- We add numeric suffix to many of our functions since -- we will be re-implementing them several times. getchar0 :: IO LChar getchar0 = (getChar >>= return . Just) `catch` \_ -> return Nothing -- First we write a function to read one line getline0 :: IO String getline0 = loop "" where loop acc = getchar0 >>= check acc check acc (Just c) | c /= '\n' = loop (c:acc) check acc _ = return (reverse acc) -- We combine line readers to read several lines until the empty line getlines0 :: IO [String] getlines0 = loop [] where loop acc = getline0 >>= check acc check acc "" = return (reverse acc) check acc l = loop (l:acc) -- From the standpoint of process calculus, we may view getlines0 as a process -- that receives lifted characters on a dedicated channel stdin. -- We model the process as a final co-algebra of the functor -- T(X) = A + X ^ LChar -- (that is, a finitely branching tree with finite and infinite branches) -- The two operations that we care about: the process is either -- finished, or wants a character data I a = Done a | GetC (LChar -> I a) -- A sample process: reading a line until '\n' -- The code is almost identical to getline0 getline :: I String getline = loop "" where loop acc = GetC (check acc) check acc (Just c) | c /= '\n' = loop (c:acc) check acc _ = Done (reverse acc) -- The observation function: the interpreter -- It takes a string and feeds it to the process eval :: String -> I a -> a eval "" (GetC k) = eval "" $ k Nothing eval (c:t) (GetC k) = eval t $ k (Just c) eval str (Done x) = x -- The divergence is possible: loop1 = GetC (const loop1) -- That is, upon receiving the EOF, the input process may just -- keep reading. We can regard that as an error (and even prevent -- using a bit more sophisticated session types), -- but we leave it for another time. After all, if a Haskell -- program wants to loop, it can always do so, in many different ways. -- (In Iteratee, we do report an error if an input process -- asks for input after being told that there is none). -- How to build bigger processes? In particular, how -- to combine getline to read several lines? -- It turns out that I has a monadic structure -- (up to the observation: see Hinze ICFP 2000) instance Monad I where return = Done Done x >>= f = f x GetC k >>= f = GetC (k >>> f) -- Kleisli composition -- `Lifting' function composition to monadic functions -- The operator (>>>) here is a special -- cases of that in Control.Category (>>>):: Monad m => (a -> m b) -> (b -> m c) -> (a -> m c) f1 >>> f2 = \x -> f1 x >>= f2 -- We write the reader of lines identically to getlines0 getlines :: I [String] getlines = loop [] where loop acc = getline >>= check acc check acc "" = return (reverse acc) check acc l = loop (l:acc) -- tests t110 = eval "abd\nxxx\nf" getline t111 = eval "abd\nxxx\nf" getlines -- ["abd","xxx","f"] -- ---------------------------------------------------------------------- -- Enumerators -- We factor eval as the composition of run and en_str -- Especially the latter has good algebraic properties -- A small change to eval, returning Iteratee en_str :: String -> I a -> I a en_str "" i = i en_str (c:t) (GetC k) = en_str t $ k (Just c) en_str _ (Done x) = Done x -- The function to run Iteratee, -- telling it that the input is finished and it should yield the result run :: I a -> a run (GetC k) = run $ k Nothing run (Done x) = x -- A more practically useful function -- If the process does not want to finish upon receiving the EOF -- we should report an error rather than diverge. -- Denotationally, run' is the same as run; practically run' -- is quite more helpful. run' :: I a -> a run' (GetC k) = case k Nothing of Done x -> x _ -> error "divergent iteratee" run' (Done x) = x -- ---------------------------------------------------------------------- -- Parsing combinator library -- primitive parsers -- An iteratee that does not recognize anything (never gets Done) failure :: I a failure = GetC (const failure) -- An iteratee (Done v) recognizes the empty string empty :: a -> I a empty v = Done v -- A parser for one lifted character oneL :: I LChar oneL = GetC Done -- Left-biased alternation -- Informally, it races the two processes until one process -- finishes. If both finish at the same time, the first process wins. ( I a -> I a Done x k1 c >= \c -> (i2 c >= i2) >= i3) -- Left distributivity does NOT hold -- (i1 >= k =/= i1 >>= k >= k -- Counter-example: i1 = return true, i2 = return false -- k = \c -> if c then fail else return c -- Derived parsers -- recognizes one character string that satisfies the given pred pSat :: (LChar -> Bool) -> I LChar pSat pred = oneL >>= \c -> if pred c then return c else failure -- The same but for the proper character pSat' :: (Char -> Bool) -> I Char pSat' pred = oneL >>= \c -> case c of Just c | pred c -> return c _ -> failure -- Recognizes one-character string one :: I Char one = oneL >>= maybe failure return -- We redo our getline using parser combinators pGetline :: I String pGetline = nl c == Just '\n' || c == Nothing) return "" tp1 = run $ en_str "abd\nxxx\nf" pGetline -- "abd" tp2 = run $ en_str "ab" pGetline -- "ab" -- pGetline is very inefficient. We can optimize using equational -- laws. First, we inline the definitions of one and pSat and desugar -- the do-notation pGetline1 :: I String pGetline1 = nl >= \c -> if c == Just '\n' || c == Nothing then return c else failure) >> return "" char = (oneL >>= maybe failure return) >>= \c -> liftM (c:) pGetline1 tp11 = run $ en_str "abd\nxxx\nf" pGetline1 -- "abd" -- Associativity of bind pGetline2 :: I String pGetline2 = nl >= (\c -> if c == Just '\n' || c == Nothing then return c >> return "" else failure >> return "") char = oneL >>= (\c -> maybe failure return c >>= \c -> liftM (c:) pGetline2) -- Case-distribution -- Monad laws; failure >>= k === failure pGetline3 :: I String pGetline3 = nl >= (\c -> if c == Just '\n' || c == Nothing then return "" else failure) char = oneL >>= (\c -> case c of Just c -> liftM (c:) pGetline3 Nothing -> failure) tp13 = run $ en_str "abd\nxxx\nf" pGetline3 -- "abd" -- Right-distributivity pGetline4 :: I String pGetline4 = oneL >>= \c -> nl' c liftM (c:) pGetline4 Nothing -> failure tp14 = run $ en_str "abd\nxxx\nf" pGetline4 -- "abd" -- Pulling out the case analysis pGetline5 :: I String pGetline5 = oneL >>= check where check (Just '\n') = return "" >= check where check (Just '\n') = return "" check Nothing = return "" check (Just c) = liftM (c:) pGetline' tp1' = run $ en_str "abd\nxxx\nf" pGetline' -- "abd" tp2' = run $ en_str "ab" pGetline' -- "ab" -- A more interesting example -- Parse a given string pString :: String -> I String pString "" = empty "" pString (h:t) = liftM2 (:) (pSat' (==h)) (pString t) pEOF :: a -> I a pEOF x = pSat (==Nothing) >> return x -- Parse the language ww -- the context-sensitive language pww prefix = (pString prefix >>= pEOF) >= \c -> pww (prefix ++ [c])) tww1 = run' $ en_str "" $ pww [] -- "" tww2 = run' $ en_str "aaaaaa" $ pww [] -- "aaa" tww3 = run' $ en_str "aaaaaaa" $ pww [] -- "*** Exception: divergent iteratee tww4 = run' $ en_str "12211221" $ pww [] -- "1221" tww5 = run' $ en_str "12341234" $ pww [] -- "1234" tww6 = run' $ en_str "123412341" $ pww [] -- "*** Exception: divergent iteratee tww7 = run' $ en_str "1234112341" $ pww [] -- "12341" -- The combinators are efficient: the input stream -- is consumed character-by-character and is never -- backed up. There is no backtracking. -- The combinators also appear powerful, capable -- of parsing context-sensitive languages. -- And yet, expressing something like Kleene star is ugly. -- We essentially emulate CPS to express left-distributed alternation. -- Kleene star: p `many` pk parses "p* pk" -- Think of it as a special-sort of bind many :: I a -> ([a] -> I w) -> I w many p pk = (p >>= \a -> many p (pk . (a:))) ((pString "abab") `many` \s2 -> pEOF [s1,s2])) -- [["abab","abab"],[]] tab4 = run' $ en_str "ababababcd" ((pString "ab") `many` \s1 -> ((pString "abcd") `many` \s2 -> pEOF [s1,s2])) -- [["ab","ab","ab"],["abcd"]] -- A better solution is to permit a look-ahead, which is an `effect', -- to be expressed with effectful iteratees. See IterDerivM.hs