-- Parsing with left-recursive grammars and eps-loops module LeftRecursion where import Control.Monad import Debug.Trace -- to check the time complexity import Data.List -- to implement a poor memo table -- We start with tests -- S = Sa | eps gram0 = inc $ (gram0 >>> char 'a') <+> eps test0 = run "aaaaa" gram0 -- Just "aaaaa" -- Our gram0 is a recognizer. -- We can attach semantic actions via an Applicative interface, for example. -- This is an orthogonal issue. To help investigate the complexity -- (for example, to trace activations of productions and show the intermediate -- parsing state) it helps to attach labels to non-terminals. -- The labels are NOT needed for parsing at all. The labels are ONLY -- for the sake of debugging. -- S = Sa | eps gram1 = inc $ label "S" $ (gram1 >>> char 'a') <+> eps test1 = run "aaaaa" gram1 -- Just "S(S(S(S(S(S()a)a)a)a)a)" test11 = run "" gram1 -- Just "S()" test21 = run "aaaab" gram1 -- Nothing test22 = run "b" gram1 -- Nothing -- The grammar : S -> eps >>> S test30 = run "abc" gram where gram = inc $ label "S" $ eps >>> gram -- Nothing -- The left star combinator: user-defined and left-recursive star p = inc $ (star p >>> p) <+> eps test40 = run "abab" (star $ char 'a' >>> char 'b') -- Just "abab" test41 = run "ababa" (star $ char 'a' >>> char 'b') -- Nothing -- S -> Sa | Sb | eps gram3 = inc $ label "S" $ gram3 >>> char 'a' <+> gram3 >>> char 'b' <+> eps test50 = run "aba" gram3 -- Just "S(S(S(S()a)b)a)" test51 = run "abac" gram3 -- Nothing -- The most complex grammar, which contains eps-cycles in addition to left -- recursion {- S -> S A C | C A -> B | aCa B -> B C -> b | C A -} gram4 = s where s = inc $ label "S" $ s >>> a >>> c <+> c a = inc $ label "A" $ b <+> char 'a' >>> c >>> char 'a' b = inc $ label "B" $ b c = inc $ label "C" $ char 'b' <+> c >>> a test61 = run "abab" gram4 -- Nothing test62 = run "babab" gram4 -- Just "S(S(C(b))A(aC(b)a)C(b))" test63 = run "babababa" gram4 -- Just "S(S(C(b))A(aC(b)a)C(C(b)A(aC(b)a)))" -- We investigate the complexity of the parsing and memoization near -- the end of this file -- Implementation data Stream a = Fail | Done a | Inc (Stream a) -- The crucial part | Plus (Stream a) (Stream a) deriving Show instance Monad Stream where return = Done Fail >>= f = Fail Done a >>= f = f a Inc x >>= f = Inc (x >>= f) Plus x y >>= f = mplus (x >>= f) (y >>= f) instance MonadPlus Stream where mzero = Fail Fail `mplus` m2 = m2 m1@Done{} `mplus` m2 = Plus m1 m2 Plus m11 m12 `mplus` m2 = Plus m11 (mplus m12 m2) -- rotate m1 `mplus` Fail = m1 m1 `mplus` m2@Done{} = Plus m2 m1 m1 `mplus` Plus m21 m22 = Plus m21 (mplus m1 m22) Inc m1 `mplus` Inc m2 = Inc (m1 `mplus` m2) type Input = String type ParseLabel = String -- A parse takes input and yields the remaining input and the trace -- (the sequence of labels of matches productions) type Parser = Input -> Stream (Input,ParseLabel) -- Primitive parsers eps x = return (x,"") fail x = mzero char c (c1:x) | c == c1 = return (x,[c1]) char _ _ = mzero -- Parser combinators -- Assign a label to a parser label :: ParseLabel -> Parser -> Parser label l p = \x -> p x >>= \ (x',lab) -> return (x',l++"("++lab++")") -- sequential composition of parsers infixl 7 >>> p1 >>> p2 = \x -> do (x1,l1) <- p1 x (x2,l2) <- p2 x1 return (x2,l1++l2) -- parallel composition of parsers infixl 6 <+> p1 <+> p2 = \x -> p1 x `mplus` p2 x inc p = \x -> Inc (p x) -- The successful parse should consume all input -- We take advantage of the fact that Inc is introduced by left-recursion. -- In order for parsing to succeed, a production must consume some input, -- at least one character. -- It follows then that in a successful parse, a left-recursive rule -- cannot be executed more times than there are characters in the input. -- That means we can fail the branches of the parse tree that require -- more than |input| eliminations of the Inc constructor. run :: Input -> Parser -> Maybe ParseLabel run inp p = run' (succ (length inp)) (p inp) run' _ Fail = Nothing run' _ (Done ("",l)) = Just l run' _ (Done _) = Nothing -- input remained unconsumed: failed to parse all run' 0 (Inc _) = Nothing -- exhausted the budget of Inc run' n (Inc x) = run' (pred n) x run' n (Plus m1 m2) = run' n m1 `mplus` run' n m2 -- Investigating time complexity and memoization -- We add tracing of non-terminal activations tlabel :: ParseLabel -> Parser -> Parser tlabel l p = \x -> p x >>= \ (x',lab) -> trace ("Trace: " ++ l++"("++lab++")") $ return (x',l++"("++lab++")") -- As we show below, so far parsing is inefficient because of repeated -- reparses on back-tracking -- To show the inefficiency and enable memoization, we re-write -- the grammar using open recursion gram1nr self = tlabel "S" $ (self >>> char 'a') <+> eps fix f = f (fix f) -- The repeated re-evaluations of S(), etc. are obvious tg13 = run "aaa" (fix $ inc . gram1nr) {- Trace: S() Trace: S() Trace: S(S()a) Trace: S() Trace: S(S()a) Trace: S(S(S()a)a) Trace: S() Trace: S(S()a) Trace: S(S(S()a)a) Trace: S(S(S(S()a)a)a) Just "S(S(S(S()a)a)a)" -} -- 10 lines of tracing tg14 = run "aaaa" (fix $ inc . gram1nr) {- Trace: S() Trace: S() Trace: S(S()a) Trace: S() Trace: S(S()a) Trace: S(S(S()a)a) Trace: S() Trace: S(S()a) Trace: S(S(S()a)a) Trace: S(S(S(S()a)a)a) Trace: S() Trace: S(S()a) Trace: S(S(S()a)a) Trace: S(S(S(S()a)a)a) Trace: S(S(S(S(S()a)a)a)a) Just "S(S(S(S(S()a)a)a)a)" -} -- 15 lines of tracing tg15 = run "aaaaa" (fix $ inc . gram1nr) -- 21 lines of tracing -- We now introduce the memoization runfix :: Input -> (Parser -> Parser) -> Maybe ParseLabel runfix inp f = run inp parser where memo = map (inc parser) (tails inp) parser = f (\i -> let n = length inp - length i in memo !! n) tg23 = runfix "aaa" gram1nr {- Trace: S() Trace: S() Trace: S(S()a) Trace: S(S()a) Trace: S(S(S()a)a) Trace: S(S(S()a)a) Trace: S(S(S(S()a)a)a) Just "S(S(S(S()a)a)a)" -} tg24 = runfix "aaaa" gram1nr {- Trace: S() Trace: S() Trace: S(S()a) Trace: S(S()a) Trace: S(S(S()a)a) Trace: S(S(S()a)a) Trace: S(S(S(S()a)a)a) Trace: S(S(S(S()a)a)a) Trace: S(S(S(S(S()a)a)a)a) Just "S(S(S(S(S()a)a)a)a)" -} tg25 = runfix "aaaaa" gram1nr -- similar, 11 trace lines are printed -- We see that memoization does occur and does help. -- Now, a non-predicate is fired exactly twice on each input. -- For the grammar in question, the time complexity is -- 2n+1, where n is the length of the input string. -- S -> Sa | Sb | eps gram3nr self = tlabel "S" $ self >>> char 'a' <+> self >>> char 'b' <+> eps tg33 = run "aaab" (fix $ inc . gram3nr) -- 46 trace lines, very many repeated re-evaluations of S(), for example. -- Let us investigate: -- run "aaa" (fix gram3nr) -- 15 trace lines, 8 evaluations of S() -- run "aaaa" (fix gram3nr) -- 31 trace lines, 16 evaluations of S() -- run "aaab" (fix gram3nr) -- 46 trace lines, 24 evaluations of S() -- The exponential behavior of the original algorithm is apparent -- However, -- runfix "aaa" gram3nr -- 7 trace lines, 2 evaluations of S() -- runfix "aaaa" gram3nr -- 9 trace lines, 2 evaluations of S() -- runfix "aaab" gram3nr -- 9 trace lines, 2 evaluations of S() -- runfix "aaaaa" gram3nr -- 11 trace lines, 2 evaluations of S() -- runfix "aaabbbbaaa" gram3nr -- 21 trace lines, 2 evaluations of S() -- runfix "aaabbbbaaabbbc" gram3nr -- 28 trace lines, 2 evaluations of S() -- Thus the memoization cured the exponential blow-up, at least in our -- particular case. The time complexity is 2n+1. -- A proper CFG, with an EPS loop -- S -> '[' S ']' | S | eps gram7nr self = tlabel "S" $ char '[' >>> self >>> char ']' <+> self <+> eps tg42 = runfix "[[]]" gram7nr {- Trace: S() Trace: S() Trace: S() Trace: S(S()) Trace: S() Trace: S([S()]) Trace: S([S([S()])]) Just "S([S([S()])])" -} -- runfix "[[]]" gram7nr -- 7 trace lines, 4 evaluations of S() -- runfix "[[[]]]" gram7nr -- 12 trace lines, 5 evaluations of S() -- runfix "[[[[]]]]" gram7nr -- 18 trace lines, 6 evaluations of S() -- runfix "[[[[[]]]]]" gram7nr -- 25 trace lines, 7 evaluations of S() -- We observe quadratic behavior on successful parses -- runfix "[[]" gram7nr -- 15 trace lines, 4 evaluations of S() -- runfix "[[[]]" gram7nr -- 39 trace lines, 5 evaluations of S() -- runfix "[[[[]]]" gram7nr -- 59+ trace lines, 6 evaluations of S() -- It looks like an exponential behavior on failed parses. Note -- that the number of re-evaluations does not blow up; -- simply there are more alternatives now. -- We can extend to mutual recursion gram4nr ~[s,a,b,c] = [s1,a1,b1,c1] where s1 = tlabel "S" $ s >>> a >>> c <+> c a1 = tlabel "A" $ b <+> char 'a' >>> c >>> char 'a' b1 = tlabel "B" $ b c1 = tlabel "C" $ char 'b' <+> c >>> a -- The following is a general memoizing fixed-point combinator -- for mutually-recursive functions runfixP :: Input -> ([Parser] -> [Parser]) -> Maybe ParseLabel runfixP inp f = run inp parser_start where parser_start = head parsers memos = map (\parser -> map (inc parser) (tails inp)) parsers parsers = f (map memo_parser memos) memo_parser memo i = let n = length inp - length i in memo !! n {- *LeftRecursion> runfixP "babab" gram4nr Trace: C(b) Trace: S(C(b)) Trace: S(C(b)) Trace: C(b) Trace: A(aC(b)a) Trace: C(C(b)A(aC(b)a)) Trace: S(C(C(b)A(aC(b)a))) Trace: C(b) Trace: S(S(C(b))A(aC(b)a)C(b)) Just "S(S(C(b))A(aC(b)a)C(b))" -- 9 tracing lines -} -- runfixP "babababa" gram4nr -- 17 tracing lines -- Without memoization, the test takes at least 84 lines