{-# LANGUAGE NoMonomorphismRestriction #-}

-- * Implicit sharing in the tagless-final style, or:
-- * seemingly pure hash-consing
-- The imperative details of hash-consing are hidden better in a
-- final-tagless style of the DSL embedding, described next.


module ExpF (Exp(..),
	     ExpI(..),
	     R(..),
	     N(..),NodeId, DAG(..),
	     run_expN,
	     do_bench,
             mul,
	    ) where

import Control.Monad.State
import BiMap
import qualified ExpI (Exp(..), sklansky)


-- * //
-- * Tagless-final EDSL embedding

-- In the tagless-final approach, embedded DSL
-- expressions are built with `constructor functions' such as |constant|,
-- |variable|, |add| rather than the data constructors |Constant|,
-- |Variable|, |Add|. The constructor functions yield a representation
-- for the DSL expression being built. The representation could be a
-- string (for pretty-printing), an integer (for evaluator), etc. Since
-- the same DSL expression may be concretely represented in several ways,
-- the constructor functions are polymorphic, parameterized by the
-- representation |repr|. In other words, the constructor functions are
-- the members of the type class

class Exp repr where
   constant :: Int -> repr Int
   variable :: String -> repr Int
   add      :: repr Int -> repr Int -> repr Int

-- The data type Exp from ExpI.hs becomes a type class

-- Our expressions here are of only one type, Int. We could have dropped
-- Int and made 'repr' to be a type variable of the kind *.
-- We keep the 'repr Int' notation nevertheless, for consistency
-- with the tagless final paper, and to allow for extensions (e.g.,
-- the addition of booleans).

-- Sample expressions from ExpI.hs now look as follows
exp_a = add (constant 10) (variable "i1")
exp_b = add exp_a (variable "i2")

-- * like those in ExpI.hs modulo the case of the identifiers:
-- * everything is in lower case.

-- The multiplication example:
-- the only two differences is the case of the identifiers
-- and the type signature
mul :: Exp repr => Int -> repr Int -> repr Int
mul 0 _ = constant 0
mul 1 x = x
mul n x | n `mod` 2 == 0 = mul (n `div` 2) (add x x)
mul n x = add x (mul (n-1) x)

exp_mul4 = mul 4 (variable "i1")
exp_mul8 = mul 8 (variable "i1")

-- * Interpreters for our DSL: instances of Exp

-- The first interpreter interprets a tagless-final expression
-- as ExpI.Exp data type, converting the DSL expressions here into
-- the so-called `initial form' of ExpI.hs
-- This is one way to print the tagless-final expressions, since
-- we have derived a Show instance for ExpI.Exp

newtype ExpI t = ExpI (ExpI.Exp)
-- * Question: why do we need the wrapper?

instance Exp ExpI where
    constant = ExpI . ExpI.Constant
    variable = ExpI . ExpI.Variable
    add (ExpI x) (ExpI y) = ExpI (ExpI.Add x y)

test_shb = case exp_b of ExpI e -> e
-- Add (Add (Constant 10) (Variable "i1")) (Variable "i2")

test_sh4 = case exp_mul4 of ExpI e -> e
{-
 Add (Add (Variable "i1") (Variable "i1")) 
     (Add (Variable "i1") (Variable "i1"))
-}

-- The conversion to ExpI takes the form of an instance
-- of the class Exp that provides the interpretation for the expression
-- primitives, as the values for the domain ExpI.

-- * Another interpreter: the evaluator
-- That interpretation of tagless-final expressions is not
-- the only one possible. We can write an evaluator, interpreting
-- each expression as an element of the domain R

type REnv = [(String,Int)]
newtype R t = R{unR :: REnv -> t} -- A reader Monad, actually

-- that is, an integer in an environment that gives values for
-- free `variables' that may occur in the expression

instance Exp R where
    constant x = R (\_ -> x)
    variable x = R (\env -> maybe (error $ "no var: " ++ x) id $ 
		     lookup x env)
    add e1 e2  = R (\env -> unR e1 env + unR e2 env)


test_val4 = unR exp_mul4 [("i1",5)] -- 20
-- * Evaluating sample expressions is good for debugging
-- * Write once (exp_mul4), interpret many times
-- We are using exactly the same exp_mul4 as in
-- test_sh4. We are evaluating it differently. The gist of 
-- the final tagless approach is to write an expression once
-- and evaluate it many times.

-- * //
-- ------------------------------------------------------------------------
-- * Detecting implicit sharing (common subexpression elimination)

-- * Goal: detect structurally equal subexpressions and share them, 
-- * converting an expression tree into a DAG
-- * Idea: rather than convert an ASTree to ASDag, build the
-- * ASDag to begin with.

-- * Representing a DAG
-- a collection of Nodes identified by NodeIds,

type NodeId = Int

-- We stress: Node is NOT a recursive data structure, so the comparison
-- of Node values takes constant time!
data Node = NConst Int
	  | NVar   String
	  | NAdd   !NodeId !NodeId
	    deriving (Eq,Ord,Show)

-- we could use several bimaps for different operations (for
-- addition, subtraction, etc).

newtype DAG = DAG (BiMap Node) deriving Show

-- * BiMap -- (partial) bijection a <-> Int
-- The mapping between |Node|s and |NodeId|s is realized
-- through a BiMap interface
-- BiMap a establishes a bijection between the values of the type |a|
-- and integers, with the operations to retrieve the value given its key,
-- to find the key for the existing value, and to extend the bijection
-- with a new association.

-- * //
-- * Bottom-up DAG construction
-- As we construct a node for a subexpression, we check if the DAG already
-- has the equal node. If so, we return its |NodeId|; otherwise, we add the node
-- to the DAG.
-- The computation is stateful (we are using the State monad)

-- Hash-consing proper: insert Node into the DAG if it isn't
-- there already, and return its hash code.
hashcons :: Node -> State DAG NodeId
hashcons e = do
  DAG m <- get
  case lookup_key e m of
    Nothing -> let (k,m') = insert e m
               in put (DAG m') >> return k
    Just k  -> return k

-- * Bottom-up DAG construction: tagless-final helps
-- The bottom-up DAG construction maps well to computing a representation
-- for a tagless-final expression, which is also evaluated bottom-up. The
-- DAG construction can therefore be written as a tagless-final
-- interpreter, an instance of the type class Exp. The interpreter maps
-- a tagless-final expression to the concrete representation 

-- * Interpreting Exp as a Node in the current DAG
newtype N t = N{unN :: State DAG NodeId}

instance Exp N where
  constant x = N(hashcons $ NConst x)
  variable x = N(hashcons $ NVar x)
  add e1 e2  = N(do
		 h1 <- unN e1
		 h2 <- unN e2
		 hashcons $ NAdd h1 h2)

-- * The state is hidden behind the tagless-final veneer

-- run the DAG-construction interpreter and the node, 
-- as a reference within a DAG.
run_expN :: N t -> (NodeId, DAG)
run_expN (N m) = runState m (DAG empty)

-- We re-interpret exp_mul4 differently this time.
-- The DAG-representation makes the sharing patent

-- A DAG is printed as the list of |(NodeId,Node)| associations. The
-- sharing of the left and right summands below is patent

test_sm4 = run_expN exp_mul4
-- (2,DAG BiMap[(0,NVar "i1"),(1,NAdd 0 0),(2,NAdd 1 1)])

test_sm8 = run_expN exp_mul8
-- (3,DAG BiMap[(0,NVar "i1"),(1,NAdd 0 0),(2,NAdd 1 1),(3,NAdd 2 2)])

-- * We have constructed netlists, topologically sorted
-- a netlist is a low-level representation of a circuit listing the gates and
-- their connections, used in circuit manufacturing. Since our BiMap allocated
-- monotonically increasing NodeIds, the resulting netlist comes out
-- topologically sorted. Therefore, we can straightforwardly generate machine 
-- code after the standard register allocation.

-- We retain all the information about exp_mul4. In addition, all
-- sharing is fully explicit. As we can see, the evaluation process finds
-- common subexpressions automatically.

-- * //
-- ------------------------------------------------------------------------
-- * Superficially `effectless' common sub-expression elimination

-- sklansky example by Matthew Naylor, with further credit to
-- Chalmers folk, Mary Sheeran and Emil Axelsson.
-- It is said to be similar to scanl1, but contains more parallelism
-- The function sklansky is defined in ExpI.hs

-- * To remind what sklansky produces:
test_sklansky_o n = ExpI.sklansky addition xs
   where
     addition x y = "(" ++ x ++ "+" ++ y ++ ")"
     xs = Prelude.map (("v"++) . show) [1..n]

-- (v1+v2) is used three times
{-
test_sklansky_o 4
["v1","(v1+v2)","((v1+v2)+v3)","((v1+v2)+(v3+v4))"]
-}

-- (v1+v2) is used seven times
{-
test_sklansky_o 8
["v1","(v1+v2)",
 "((v1+v2)+v3)",
 "((v1+v2)+(v3+v4))",
 "(((v1+v2)+(v3+v4))+v5)",
 "(((v1+v2)+(v3+v4))+(v5+v6))",
 "(((v1+v2)+(v3+v4))+((v5+v6)+v7))",
 "(((v1+v2)+(v3+v4))+((v5+v6)+(v7+v8)))"]
-}

-- * sklansky challenge: sharing across expressions, not within an expression

-- * We re-write in the tagless-final style
-- * scratch that: we use sklansky as it was
-- :t ExpI.sklansky
-- ExpI.sklansky :: (a -> a -> a) -> [a] -> [a]
-- Actually, there is no re-write
-- We use Matthew Naylor's code, in pure Haskell, as it was

-- * //
-- We run it differently though
test_sklansky n = runState sk (DAG empty)
  where
  sk = sequence (map unN (ExpI.sklansky add xs))
  xs = map (variable . show) [1..n]

-- Implicit sharing works: hash code 2 (for v1+v2) is used three times

{-
*ExpF> test_sklansky 4
([0,2,4,7],
 DAG BiMap[(0,NVar "1"),(1,NVar "2"),
  (2,NAdd 0 1),(3,NVar "3"),
  (4,NAdd 2 3),(5,NVar "4"),
  (6,NAdd 3 5),(7,NAdd 2 6)])
-}
-- We indeed obtained the set of nodes all pointing within the same DAG
-- The stateful nature was well-hidden till the very end (running)

-- We see the deep sharing: hash code 2, which is v1+v2, is used twice.
-- Then code 7, which is (v1+v2)+(v3+v4), is used 4 times
-- after being computed
{-
*ExpF> test_sklansky 8
([0,2,4,7,9,12,15,19],
 DAG BiMap[(0,NVar "1"),(1,NVar "2"),
 (2,NAdd 0 1),(3,NVar "3"),
 (4,NAdd 2 3),(5,NVar "4"),
 (6,NAdd 3 5),(7,NAdd 2 6),  -- just as it was for test_sklansky 4
 (8,NVar "5"),(9,NAdd 7 8),
 (10,NVar "6"),(11,NAdd 8 10),
 (12,NAdd 7 11),(13,NVar "7"),
 (14,NAdd 11 13),(15,NAdd 7 14),
 (16,NVar "8"),(17,NAdd 13 16),
 (18,NAdd 11 17),(19,NAdd 7 18)])
-}

-- * //
-- ------------------------------------------------------------------------
-- * Performance is still a problem

-- We have demonstrated the sharing detection technique that represents
-- a DSL program as a DAG, eliminating multiple occurrences of common
-- subexpressions. Alas, to find all these common subexpressions we have
-- to examine the entire expression _tree_, which may take long time for large
-- programs (large circuits). 
-- See ExpLet.hs for the exponential speed-up.

-- To force the evaluation
do_bench :: (a,DAG) -> Int
do_bench (_,DAG d) = size d

bench_mul n = do_bench $ 
	      run_expN (mul n (variable "i"))


-- It takes effort to find the common subexpressions, just because
-- we have to traverse the whole tree.

bench12 = bench_mul (2^12)
bench13 = bench_mul (2^13)

{-
-- Interpreted code
*ExpF> bench_mul (2^11)
12
(0.06 secs, 4688952 bytes)
*ExpF> bench_mul (2^12)
13
(0.09 secs, 5344760 bytes)
*ExpF> bench_mul (2^13)
14
(0.20 secs, 9791092 bytes)


-- compiled with -O2
Prelude ExpF> bench_mul (2^20)
21
(0.44 secs, 80752592 bytes)
Prelude ExpF> bench_mul (2^21)
22
(0.87 secs, 160434744 bytes)
-}


{-
*ExpF> test_sklansky 128
([0,2,4,7,9,12,15,19,21, ... 560,567,575], ...)
(0.37 secs, 13650184 bytes)
-}

-- Not so bad... The implicit sharing detection is quite fast
bench_skl n = do_bench $ test_sklansky n

{-
-- Interpreted code
*ExpF> bench_skl 128
576
(0.33 secs, 10854396 bytes)
*ExpF> bench_skl 256
1280
(1.79 secs, 38952968 bytes)

-- compiled with -O2
Prelude ExpF> bench_skl 128
576
(0.02 secs, 2163436 bytes)
Prelude ExpF> bench_skl 256
1280
(0.08 secs, 4333380 bytes)
Prelude ExpF> bench_skl 512
2816
(0.37 secs, 13908376 bytes)
-}


main = do
       print test_shb
       print test_sh4

       print test_val4
       print test_sm4
       print test_sm8

       print $ test_sklansky_o 4
       print $ test_sklansky 4