-- Parsing with left-recursive grammars and eps-loops

module LeftRecursion where

import Control.Monad
import Debug.Trace			-- to check the time complexity
import Data.List			-- to implement a poor memo table

-- We start with tests

-- S = Sa | eps
gram0 = inc $ (gram0 >>> char 'a') <+> eps
test0 = run "aaaaa" gram0
-- Just "aaaaa" 
-- Our gram0 is a recognizer.
-- We can attach semantic actions via an Applicative interface, for example.
-- This is an orthogonal issue. To help investigate the complexity
-- (for example, to trace activations of productions and show the intermediate
-- parsing state) it helps to attach labels to non-terminals.
-- The labels are NOT needed for parsing at all. The labels are ONLY
-- for the sake of debugging.

-- S = Sa | eps
gram1 = inc $ label "S" $ (gram1 >>> char 'a') <+> eps

test1 = run "aaaaa" gram1
-- Just "S(S(S(S(S(S()a)a)a)a)a)"

test11 = run "" gram1
-- Just "S()"

test21 = run "aaaab" gram1
-- Nothing

test22 = run "b" gram1
-- Nothing

-- The grammar : S -> eps >>> S
test30 = run "abc" gram
  where gram = inc $ label "S" $ eps >>> gram
-- Nothing

-- The left star combinator: user-defined and left-recursive

star p = inc $ (star p >>> p) <+> eps

test40 = run "abab" (star $ char 'a' >>> char 'b')
-- Just "abab"

test41 = run "ababa" (star $ char 'a' >>> char 'b')
-- Nothing

-- S -> Sa | Sb | eps
gram3 = inc $ label "S" $ gram3 >>> char 'a' <+> gram3 >>> char 'b' <+> eps

test50 = run "aba" gram3
-- Just "S(S(S(S()a)b)a)"

test51 = run "abac" gram3
-- Nothing

-- The most complex grammar, which contains eps-cycles in addition to left
-- recursion
{-
   S -> S A C | C
   A -> B | aCa
   B -> B
   C -> b | C A
-}

gram4 = s 
 where
 s = inc $ label "S" $ s >>> a >>> c <+> c
 a = inc $ label "A" $ b <+> char 'a' >>> c >>> char 'a'
 b = inc $ label "B" $ b
 c = inc $ label "C" $ char 'b' <+> c >>> a

test61 = run "abab" gram4
-- Nothing
test62 = run "babab" gram4
-- Just "S(S(C(b))A(aC(b)a)C(b))"
test63 = run "babababa" gram4
-- Just "S(S(C(b))A(aC(b)a)C(C(b)A(aC(b)a)))"

-- We investigate the complexity of the parsing and memoization near
-- the end of this file

-- Implementation

data Stream a = Fail 
	      | Done a 
	      | Inc (Stream a) 		-- The crucial part
	      | Plus (Stream a) (Stream a)
		deriving Show

instance Monad Stream where
    return = Done

    Fail     >>= f = Fail
    Done a   >>= f = f a
    Inc x    >>= f = Inc (x >>= f)
    Plus x y >>= f = mplus (x >>= f) (y >>= f)

instance MonadPlus Stream where
    mzero = Fail

    Fail         `mplus` m2 = m2
    m1@Done{}    `mplus` m2 = Plus m1 m2
    Plus m11 m12 `mplus` m2 = Plus m11 (mplus m12 m2) -- rotate
    m1 `mplus` Fail         = m1
    m1 `mplus` m2@Done{}    = Plus m2 m1
    m1 `mplus` Plus m21 m22 = Plus m21 (mplus m1 m22)
    Inc m1 `mplus` Inc m2 = Inc (m1 `mplus` m2)


type Input = String
type ParseLabel = String

-- A parse takes input and yields the remaining input and the trace
-- (the sequence of labels of matches productions)
type Parser = Input -> Stream (Input,ParseLabel)

-- Primitive parsers
eps x  = return (x,"")
fail x = mzero

char c (c1:x) | c == c1 = return (x,[c1])
char _ _ = mzero

-- Parser combinators

-- Assign a label to a parser

label :: ParseLabel -> Parser -> Parser
label l p = \x -> p x >>= \ (x',lab) -> return (x',l++"("++lab++")")

-- sequential composition of parsers
infixl 7 >>>
p1 >>> p2 = \x -> do
		   (x1,l1) <- p1 x
		   (x2,l2) <- p2 x1
		   return (x2,l1++l2)


-- parallel composition of parsers
infixl 6 <+>
p1 <+> p2 = \x -> p1 x `mplus` p2 x

inc p = \x -> Inc (p x)

-- The successful parse should consume all input

-- We take advantage of the fact that Inc is introduced by left-recursion.
-- In order for parsing to succeed, a production must consume some input,
-- at least one character.
-- It follows then that in a successful parse, a left-recursive rule
-- cannot be executed more times than there are characters in the input.
-- That means we can fail the branches of the parse tree that require
-- more than |input| eliminations of the Inc constructor.

run :: Input -> Parser -> Maybe ParseLabel
run inp p = run' (succ (length inp)) (p inp)

run' _ Fail = Nothing
run' _ (Done ("",l)) = Just l
run' _ (Done _) = Nothing  -- input remained unconsumed: failed to parse all
run' 0 (Inc _) = Nothing   -- exhausted the budget of Inc
run' n (Inc x) = run' (pred n) x
run' n (Plus m1 m2) = run' n m1 `mplus` run' n m2

-- Investigating time complexity and memoization

-- We add tracing of non-terminal activations
tlabel :: ParseLabel -> Parser -> Parser
tlabel l p = \x -> p x >>= \ (x',lab) -> 
	     trace ("Trace: " ++ l++"("++lab++")") $
	     return (x',l++"("++lab++")")

-- As we show below, so far parsing is inefficient because of repeated 
-- reparses on back-tracking
-- To show the inefficiency and enable memoization, we re-write
-- the grammar using open recursion
gram1nr self = tlabel "S" $ (self >>> char 'a') <+> eps

fix f = f (fix f)

-- The repeated re-evaluations of S(), etc. are obvious
tg13 = run "aaa" (fix $ inc . gram1nr)
{-
Trace: S()
Trace: S()
Trace: S(S()a)
Trace: S()
Trace: S(S()a)
Trace: S(S(S()a)a)
Trace: S()
Trace: S(S()a)
Trace: S(S(S()a)a)
Trace: S(S(S(S()a)a)a)
Just "S(S(S(S()a)a)a)"
-}
-- 10 lines of tracing
tg14 = run "aaaa" (fix $ inc . gram1nr)
{-
Trace: S()
Trace: S()
Trace: S(S()a)
Trace: S()
Trace: S(S()a)
Trace: S(S(S()a)a)
Trace: S()
Trace: S(S()a)
Trace: S(S(S()a)a)
Trace: S(S(S(S()a)a)a)
Trace: S()
Trace: S(S()a)
Trace: S(S(S()a)a)
Trace: S(S(S(S()a)a)a)
Trace: S(S(S(S(S()a)a)a)a)
Just "S(S(S(S(S()a)a)a)a)"
-}
-- 15 lines of tracing
tg15 = run "aaaaa" (fix $ inc . gram1nr)
-- 21 lines of tracing

-- We now introduce the memoization
runfix :: Input -> (Parser -> Parser) -> Maybe ParseLabel
runfix inp f = run inp parser
 where 
 memo = map (inc parser) (tails inp)
 parser = f (\i -> let n = length inp - length i
	           in memo !! n)

tg23 = runfix "aaa" gram1nr
{-
Trace: S()
Trace: S()
Trace: S(S()a)
Trace: S(S()a)
Trace: S(S(S()a)a)
Trace: S(S(S()a)a)
Trace: S(S(S(S()a)a)a)
Just "S(S(S(S()a)a)a)"
-}
tg24 = runfix "aaaa" gram1nr
{-
Trace: S()
Trace: S()
Trace: S(S()a)
Trace: S(S()a)
Trace: S(S(S()a)a)
Trace: S(S(S()a)a)
Trace: S(S(S(S()a)a)a)
Trace: S(S(S(S()a)a)a)
Trace: S(S(S(S(S()a)a)a)a)
Just "S(S(S(S(S()a)a)a)a)"
-}
tg25 = runfix "aaaaa" gram1nr
-- similar, 11 trace lines are printed

-- We see that memoization does occur and does help.
-- Now, a non-predicate is fired exactly twice on each input.
-- For the grammar in question, the time complexity is
-- 2n+1, where n is the length of the input string.

-- S -> Sa | Sb | eps
gram3nr self = tlabel "S" $ self >>> char 'a' <+> self >>> char 'b' <+> eps


tg33 = run "aaab" (fix $ inc . gram3nr)
-- 46 trace lines, very many repeated re-evaluations of S(), for example.

-- Let us investigate:
-- run "aaa" (fix gram3nr)
-- 15 trace lines, 8  evaluations of S()
-- run "aaaa" (fix gram3nr)
-- 31 trace lines, 16 evaluations of S()
-- run "aaab" (fix gram3nr)
-- 46 trace lines, 24 evaluations of S()
-- The exponential behavior of the original algorithm is apparent


-- However,
-- runfix "aaa" gram3nr 
-- 7 trace lines, 2 evaluations of S()
-- runfix "aaaa" gram3nr 
-- 9 trace lines, 2 evaluations of S()
-- runfix "aaab" gram3nr 
-- 9 trace lines, 2 evaluations of S()
-- runfix "aaaaa" gram3nr 
-- 11 trace lines, 2 evaluations of S()
-- runfix "aaabbbbaaa" gram3nr
-- 21 trace lines, 2 evaluations of S()
-- runfix "aaabbbbaaabbbc" gram3nr
-- 28 trace lines, 2 evaluations of S()

-- Thus the memoization cured the exponential blow-up, at least in our
-- particular case. The time complexity is 2n+1.

-- A proper CFG, with an EPS loop
-- S -> '[' S ']' | S | eps
gram7nr self = tlabel "S" $ char '[' >>> self >>> char ']' <+> self <+> eps

tg42 = runfix "[[]]" gram7nr 
{-
Trace: S()
Trace: S()
Trace: S()
Trace: S(S())
Trace: S()
Trace: S([S()])
Trace: S([S([S()])])
Just "S([S([S()])])"
-}

-- runfix "[[]]" gram7nr 
-- 7 trace lines, 4 evaluations of S()
-- runfix "[[[]]]" gram7nr 
-- 12 trace lines, 5 evaluations of S()
-- runfix "[[[[]]]]" gram7nr 
-- 18 trace lines, 6 evaluations of S()
-- runfix "[[[[[]]]]]" gram7nr 
-- 25 trace lines, 7 evaluations of S()
-- We observe quadratic behavior on successful parses


-- runfix "[[]" gram7nr 
-- 15 trace lines, 4 evaluations of S()
-- runfix "[[[]]" gram7nr 
-- 39 trace lines, 5 evaluations of S()
-- runfix "[[[[]]]" gram7nr 
-- 59+ trace lines, 6 evaluations of S()
-- It looks like an exponential behavior on failed parses. Note
-- that the number of re-evaluations does not blow up;
-- simply there are more alternatives now.

-- We can extend to mutual recursion
gram4nr ~[s,a,b,c] = [s1,a1,b1,c1]
 where
 s1 = tlabel "S" $ s >>> a >>> c <+> c
 a1 = tlabel "A" $ b <+> char 'a' >>> c >>> char 'a'
 b1 = tlabel "B" $ b
 c1 = tlabel "C" $ char 'b' <+> c >>> a

-- The following is a general memoizing fixed-point combinator
-- for mutually-recursive functions
runfixP :: Input -> ([Parser] -> [Parser]) -> Maybe ParseLabel
runfixP inp f = run inp parser_start
 where
 parser_start = head parsers
 memos = map (\parser -> map (inc parser) (tails inp)) parsers
 parsers = f (map memo_parser memos)
 memo_parser memo i = let n = length inp - length i
		      in memo !! n


{-
*LeftRecursion> runfixP "babab" gram4nr
Trace: C(b)
Trace: S(C(b))
Trace: S(C(b))
Trace: C(b)
Trace: A(aC(b)a)
Trace: C(C(b)A(aC(b)a))
Trace: S(C(C(b)A(aC(b)a)))
Trace: C(b)
Trace: S(S(C(b))A(aC(b)a)C(b))
Just "S(S(C(b))A(aC(b)a)C(b))"
-- 9 tracing lines
-}

-- runfixP "babababa" gram4nr
-- 17 tracing lines

-- Without memoization, the test takes at least 84 lines