{-# LANGUAGE NoMonomorphismRestriction #-}

-- * Adding explicit sharing to our DSL

module ExpLet where

import ExpF

-- * Why do we even need explicit sharing
-- * Help the human reader (cf. Principia Mathematica)

-- * Can't we just use Haskell's let?
-- Haskell's lets is certainly explicit and so clearly
-- helps the reader. Does it helps CSE eliminators?
-- Does it help tagless interpreters?

-- The multiplication-by-4 example written explicitly
-- * exp_mul4 = mul 4 (variable "i1")

exp_mul4 = 
 let x = variable "i1" in
 let y = add x x in
 add y y

-- * Question: Does Haskell guarantee sharing?
-- * Question: can we rely on memoization?

tree_mul4 = case exp_mul4 of ExpI t -> t
{-
  Add (Add (Variable "i1") (Variable "i1")) 
      (Add (Variable "i1") (Variable "i1"))
-}

dag_mul4 = run_expN exp_mul4
-- (2,DAG BiMap[(0,NVar "i1"),(1,NAdd 0 0),(2,NAdd 1 1)])

-- * //
-- * Haskell's let does work, sometimes!
-- The question about Haskell let is non-trivial because
-- it sometimes works.

-- To demonstrate how Haskell let helps some interpreters
-- but not the others, we define two tagless-final
-- interpreters.

-- One computes the size of an expression (the number of the
-- constructors)

-- * Haskell's let does help some DSL interpreters
-- * The size interpreter
newtype Size t = Size Int

instance Exp Size where
    constant _ = Size 1
    variable _ = Size 1
    add (Size x) (Size y) = Size (x+y+1)

Size size_mul4 = exp_mul4  -- 7

Size size_large = mul (2^30) (variable "i1")
-- 2147483647
-- Nearly instantaneous: only 0.01 secs

-- * The print interpreter
-- Another interpreter will print the expression
newtype Print t = Print (IO ())
instance Exp Print where
    constant = Print . putStr . show 
    variable = Print . putStr
    add (Print x) (Print y) = Print (x >> putStr " + " >> y)

-- This interpreter will take a long time to print
-- (mul (2^30) (variable "i1")), _even_ if we redirect
-- the output to /dev/null
Print print_mul4 = exp_mul4
-- i1 + i1 + i1 + i1
-- We see the duplication in the output, thus the duplication
-- of the effort to print. 
-- * Print pm30 = mul (2^30) (variable "i1")
-- Here, running will take a long time, even if we print to /dev/null
-- So, here Haskell's let did not help!

-- We thus need a sharing form in the DSL itself

-- * //
-- * Adding a new form let_ to our DSL
-- We extend the DSL language with a new expression form, let_,
-- to indicate the explicit sharing
-- Tagless-final interpreters are easily extensible
class ExpLet repr where
  let_ :: repr a -> (repr a -> repr b) -> repr b

-- Re-written exp_mul4 using the explicit sharing
exp_mul4' = 
 let_ (variable "i1") (\x ->
 let_ (add x x)       (\y->
 add y y))

-- * Extending the existing interpreters with let_
-- As one might expect, let_ is basically the reverse application
instance ExpLet R where
  let_ x f = f x

-- As expected, exp_mul4 with and without explicit sharing evaluate
-- to the same results. After all, sharing is an optimization,
-- it should not affect the results of DSL programs
val_mul4  = unR exp_mul4 [("i1",5)] -- 20
val_mul4' = unR exp_mul4' [("i1",5)] -- 20

-- * How to see sharing
-- We would like to `see' the sharing.
-- We have to define a show-like function then, an interpreter
-- of tagless-final expressions as strings.
-- Actually, we need a bit more than strings: to show sharing
-- (as let-expressions) we need a way to generate `pointers',
-- or local variable names.

type LetVarCount = Int			-- counter for generating let-var-names
newtype S t = S{unS :: LetVarCount -> String}

instance Exp S where
    constant  = S . const . show
    variable  = S . const
    add e1 e2 = S(\c -> unS e1 c ++ " + " ++ unS e2 c)

instance ExpLet S where
    let_ e f  = S(\c -> let vname = "v" ++ show c
		        in unwords ["let",vname,"=",unS e c,
				    "in",unS (f (S (const vname))) (succ c)])

run_expS :: S t -> String
run_expS (S m) = m 0

sh_mul4  = run_expS exp_mul4
-- "i1 + i1 + i1 + i1"

sh_mul4' = run_expS exp_mul4'
-- "let v0 = i1 in let v1 = v0 + v0 in v1 + v1"

-- * //
-- * Payoff: extending the N interpreter to handle explicit sharing
-- Sharing a computation means performing a computation and sharing 
-- result. The code below says exactly that.
instance ExpLet N where
    let_ e f = N(do
		 x <- unN e
		 unN $ f (N (return x)))

-- Now, we can evaluate exp_mul4' as a DAG
-- The result is the same as dag_mul4.
dag_mul4' = run_expN exp_mul4'
-- (2,DAG BiMap[(0,NVar "i1"),(1,NAdd 0 0),(2,NAdd 1 1)])

-- * Benchmarking
-- To really see the difference, we need bigger multiplications

-- We don't want to write the expressions like |exp_mul4'| by hand,
-- we want to generate them.
-- We rewrite the |mul| generator to use the explicit sharing.
-- The difference from |mul| is on the last-but-one line.
mul' :: (ExpLet repr, Exp repr) => Int -> repr Int -> repr Int
mul' 0 _ = constant 0
mul' 1 x = x
mul' n x | n `mod` 2 == 0 = let_ x (\x' -> mul' (n `div` 2) (add x' x'))
mul' n x = add x (mul' (n-1) x)

-- Is there another way to write mul'?
-- mul' n x | n `mod` 2 == 0 = let_ (add x x) (\y -> mul' (n `div` 2) y)
-- "let v0 = i1 + i1 in let v1 = v0 + v0 in v1"

sh_mul4'' = run_expS (mul' 4 (variable "i1"))
-- "let v0 = i1 in let v1 = v0 + v0 in v1 + v1"

-- * //
-- * Benchmarking. Compare with bench12 and bench13 from ExpF.hs
bench_mul' n = do_bench $
	       run_expN (mul' n (variable "i"))

bench12 = bench_mul' (2^12)
bench13 = bench_mul' (2^13)

-- The exponential speed-up is apparent
{-
*ExpLet> bench12
13
(0.02 secs, 3171764 bytes)
*ExpLet> bench13
14
(0.01 secs, 534548 bytes)
-}

-- The mul (2^30) is the expression tree with 2^30 leaves!
{-
*ExpLet> bench_mul' (2^20)
21
(0.01 secs, 1585752 bytes)
*ExpLet> bench_mul' (2^30)
31
(0.01 secs, 1055932 bytes)
-}

-- * Some sharing is left to discover:
{-
*ExpLet> run_expS (mul' 15 (variable "i"))
 "i + let v0 = i in v0 + v0 + 
      let v1 = v0 + v0 in v1 + v1 + let v2 = v1 + v1 in v2 + v2"
-}

-- It is found and eliminated
{-
*ExpLet> run_expN (mul' 15 (variable "i"))
(6,DAG BiMap[
   (0,NVar "i"),
   (1,NAdd 0 0),
   (2,NAdd 1 1),
   (3,NAdd 2 2),
   (4,NAdd 2 3),
   (5,NAdd 1 4),
   (6,NAdd 0 5)])
-}

-- rather quickly:
{-
*ExpLet> bench_mul' (2^30-1)
59
(0.01 secs, 1078232 bytes)
-}


main = do
       print tree_mul4
       print dag_mul4
       print val_mul4
       print val_mul4'
       print sh_mul4'
       print sh_mul4''

       print bench13