{-# LANGUAGE NoMonomorphismRestriction #-}
-- * Adding explicit sharing to our DSL
module ExpLet where
import ExpF
-- * Why do we even need explicit sharing
-- * Help the human reader (cf. Principia Mathematica)
-- * Can't we just use Haskell's let?
-- Haskell's lets is certainly explicit and so clearly
-- helps the reader. Does it helps CSE eliminators?
-- Does it help tagless interpreters?
-- The multiplication-by-4 example written explicitly
-- * exp_mul4 = mul 4 (variable "i1")
exp_mul4 =
let x = variable "i1" in
let y = add x x in
add y y
-- * Question: Does Haskell guarantee sharing?
-- * Question: can we rely on memoization?
tree_mul4 = case exp_mul4 of ExpI t -> t
{-
Add (Add (Variable "i1") (Variable "i1"))
(Add (Variable "i1") (Variable "i1"))
-}
dag_mul4 = run_expN exp_mul4
-- (2,DAG BiMap[(0,NVar "i1"),(1,NAdd 0 0),(2,NAdd 1 1)])
-- * //
-- * Haskell's let does work, sometimes!
-- The question about Haskell let is non-trivial because
-- it sometimes works.
-- To demonstrate how Haskell let helps some interpreters
-- but not the others, we define two tagless-final
-- interpreters.
-- One computes the size of an expression (the number of the
-- constructors)
-- * Haskell's let does help some DSL interpreters
-- * The size interpreter
newtype Size t = Size Int
instance Exp Size where
constant _ = Size 1
variable _ = Size 1
add (Size x) (Size y) = Size (x+y+1)
Size size_mul4 = exp_mul4 -- 7
Size size_large = mul (2^30) (variable "i1")
-- 2147483647
-- Nearly instantaneous: only 0.01 secs
-- * The print interpreter
-- Another interpreter will print the expression
newtype Print t = Print (IO ())
instance Exp Print where
constant = Print . putStr . show
variable = Print . putStr
add (Print x) (Print y) = Print (x >> putStr " + " >> y)
-- This interpreter will take a long time to print
-- (mul (2^30) (variable "i1")), _even_ if we redirect
-- the output to /dev/null
Print print_mul4 = exp_mul4
-- i1 + i1 + i1 + i1
-- We see the duplication in the output, thus the duplication
-- of the effort to print.
-- * Print pm30 = mul (2^30) (variable "i1")
-- Here, running will take a long time, even if we print to /dev/null
-- So, here Haskell's let did not help!
-- We thus need a sharing form in the DSL itself
-- * //
-- * Adding a new form let_ to our DSL
-- We extend the DSL language with a new expression form, let_,
-- to indicate the explicit sharing
-- Tagless-final interpreters are easily extensible
class ExpLet repr where
let_ :: repr a -> (repr a -> repr b) -> repr b
-- Re-written exp_mul4 using the explicit sharing
exp_mul4' =
let_ (variable "i1") (\x ->
let_ (add x x) (\y->
add y y))
-- * Extending the existing interpreters with let_
-- As one might expect, let_ is basically the reverse application
instance ExpLet R where
let_ x f = f x
-- As expected, exp_mul4 with and without explicit sharing evaluate
-- to the same results. After all, sharing is an optimization,
-- it should not affect the results of DSL programs
val_mul4 = unR exp_mul4 [("i1",5)] -- 20
val_mul4' = unR exp_mul4' [("i1",5)] -- 20
-- * How to see sharing
-- We would like to `see' the sharing.
-- We have to define a show-like function then, an interpreter
-- of tagless-final expressions as strings.
-- Actually, we need a bit more than strings: to show sharing
-- (as let-expressions) we need a way to generate `pointers',
-- or local variable names.
type LetVarCount = Int -- counter for generating let-var-names
newtype S t = S{unS :: LetVarCount -> String}
instance Exp S where
constant = S . const . show
variable = S . const
add e1 e2 = S(\c -> unS e1 c ++ " + " ++ unS e2 c)
instance ExpLet S where
let_ e f = S(\c -> let vname = "v" ++ show c
in unwords ["let",vname,"=",unS e c,
"in",unS (f (S (const vname))) (succ c)])
run_expS :: S t -> String
run_expS (S m) = m 0
sh_mul4 = run_expS exp_mul4
-- "i1 + i1 + i1 + i1"
sh_mul4' = run_expS exp_mul4'
-- "let v0 = i1 in let v1 = v0 + v0 in v1 + v1"
-- * //
-- * Payoff: extending the N interpreter to handle explicit sharing
-- Sharing a computation means performing a computation and sharing
-- result. The code below says exactly that.
instance ExpLet N where
let_ e f = N(do
x <- unN e
unN $ f (N (return x)))
-- Now, we can evaluate exp_mul4' as a DAG
-- The result is the same as dag_mul4.
dag_mul4' = run_expN exp_mul4'
-- (2,DAG BiMap[(0,NVar "i1"),(1,NAdd 0 0),(2,NAdd 1 1)])
-- * Benchmarking
-- To really see the difference, we need bigger multiplications
-- We don't want to write the expressions like |exp_mul4'| by hand,
-- we want to generate them.
-- We rewrite the |mul| generator to use the explicit sharing.
-- The difference from |mul| is on the last-but-one line.
mul' :: (ExpLet repr, Exp repr) => Int -> repr Int -> repr Int
mul' 0 _ = constant 0
mul' 1 x = x
mul' n x | n `mod` 2 == 0 = let_ x (\x' -> mul' (n `div` 2) (add x' x'))
mul' n x = add x (mul' (n-1) x)
-- Is there another way to write mul'?
-- mul' n x | n `mod` 2 == 0 = let_ (add x x) (\y -> mul' (n `div` 2) y)
-- "let v0 = i1 + i1 in let v1 = v0 + v0 in v1"
sh_mul4'' = run_expS (mul' 4 (variable "i1"))
-- "let v0 = i1 in let v1 = v0 + v0 in v1 + v1"
-- * //
-- * Benchmarking. Compare with bench12 and bench13 from ExpF.hs
bench_mul' n = do_bench $
run_expN (mul' n (variable "i"))
bench12 = bench_mul' (2^12)
bench13 = bench_mul' (2^13)
-- The exponential speed-up is apparent
{-
*ExpLet> bench12
13
(0.02 secs, 3171764 bytes)
*ExpLet> bench13
14
(0.01 secs, 534548 bytes)
-}
-- The mul (2^30) is the expression tree with 2^30 leaves!
{-
*ExpLet> bench_mul' (2^20)
21
(0.01 secs, 1585752 bytes)
*ExpLet> bench_mul' (2^30)
31
(0.01 secs, 1055932 bytes)
-}
-- * Some sharing is left to discover:
{-
*ExpLet> run_expS (mul' 15 (variable "i"))
"i + let v0 = i in v0 + v0 +
let v1 = v0 + v0 in v1 + v1 + let v2 = v1 + v1 in v2 + v2"
-}
-- It is found and eliminated
{-
*ExpLet> run_expN (mul' 15 (variable "i"))
(6,DAG BiMap[
(0,NVar "i"),
(1,NAdd 0 0),
(2,NAdd 1 1),
(3,NAdd 2 2),
(4,NAdd 2 3),
(5,NAdd 1 4),
(6,NAdd 0 5)])
-}
-- rather quickly:
{-
*ExpLet> bench_mul' (2^30-1)
59
(0.01 secs, 1078232 bytes)
-}
main = do
print tree_mul4
print dag_mul4
print val_mul4
print val_mul4'
print sh_mul4'
print sh_mul4''
print bench13