{-# LANGUAGE NoMonomorphismRestriction #-} -- * Implicit sharing in the tagless-final style, or: -- * seemingly pure hash-consing -- The imperative details of hash-consing are hidden better in a -- final-tagless style of the DSL embedding, described next. module ExpF (Exp(..), ExpI(..), R(..), N(..),NodeId, DAG(..), run_expN, do_bench, mul, ) where import Control.Monad.State import BiMap import qualified ExpI (Exp(..), sklansky) -- * // -- * Tagless-final EDSL embedding -- In the tagless-final approach, embedded DSL -- expressions are built with `constructor functions' such as |constant|, -- |variable|, |add| rather than the data constructors |Constant|, -- |Variable|, |Add|. The constructor functions yield a representation -- for the DSL expression being built. The representation could be a -- string (for pretty-printing), an integer (for evaluator), etc. Since -- the same DSL expression may be concretely represented in several ways, -- the constructor functions are polymorphic, parameterized by the -- representation |repr|. In other words, the constructor functions are -- the members of the type class class Exp repr where constant :: Int -> repr Int variable :: String -> repr Int add :: repr Int -> repr Int -> repr Int -- The data type Exp from ExpI.hs becomes a type class -- Our expressions here are of only one type, Int. We could have dropped -- Int and made 'repr' to be a type variable of the kind *. -- We keep the 'repr Int' notation nevertheless, for consistency -- with the tagless final paper, and to allow for extensions (e.g., -- the addition of booleans). -- Sample expressions from ExpI.hs now look as follows exp_a = add (constant 10) (variable "i1") exp_b = add exp_a (variable "i2") -- * like those in ExpI.hs modulo the case of the identifiers: -- * everything is in lower case. -- The multiplication example: -- the only two differences is the case of the identifiers -- and the type signature mul :: Exp repr => Int -> repr Int -> repr Int mul 0 _ = constant 0 mul 1 x = x mul n x | n `mod` 2 == 0 = mul (n `div` 2) (add x x) mul n x = add x (mul (n-1) x) exp_mul4 = mul 4 (variable "i1") exp_mul8 = mul 8 (variable "i1") -- * Interpreters for our DSL: instances of Exp -- The first interpreter interprets a tagless-final expression -- as ExpI.Exp data type, converting the DSL expressions here into -- the so-called `initial form' of ExpI.hs -- This is one way to print the tagless-final expressions, since -- we have derived a Show instance for ExpI.Exp newtype ExpI t = ExpI (ExpI.Exp) -- * Question: why do we need the wrapper? instance Exp ExpI where constant = ExpI . ExpI.Constant variable = ExpI . ExpI.Variable add (ExpI x) (ExpI y) = ExpI (ExpI.Add x y) test_shb = case exp_b of ExpI e -> e -- Add (Add (Constant 10) (Variable "i1")) (Variable "i2") test_sh4 = case exp_mul4 of ExpI e -> e {- Add (Add (Variable "i1") (Variable "i1")) (Add (Variable "i1") (Variable "i1")) -} -- The conversion to ExpI takes the form of an instance -- of the class Exp that provides the interpretation for the expression -- primitives, as the values for the domain ExpI. -- * Another interpreter: the evaluator -- That interpretation of tagless-final expressions is not -- the only one possible. We can write an evaluator, interpreting -- each expression as an element of the domain R type REnv = [(String,Int)] newtype R t = R{unR :: REnv -> t} -- A reader Monad, actually -- that is, an integer in an environment that gives values for -- free `variables' that may occur in the expression instance Exp R where constant x = R (\_ -> x) variable x = R (\env -> maybe (error $ "no var: " ++ x) id $ lookup x env) add e1 e2 = R (\env -> unR e1 env + unR e2 env) test_val4 = unR exp_mul4 [("i1",5)] -- 20 -- * Evaluating sample expressions is good for debugging -- * Write once (exp_mul4), interpret many times -- We are using exactly the same exp_mul4 as in -- test_sh4. We are evaluating it differently. The gist of -- the final tagless approach is to write an expression once -- and evaluate it many times. -- * // -- ------------------------------------------------------------------------ -- * Detecting implicit sharing (common subexpression elimination) -- * Goal: detect structurally equal subexpressions and share them, -- * converting an expression tree into a DAG -- * Idea: rather than convert an ASTree to ASDag, build the -- * ASDag to begin with. -- * Representing a DAG -- a collection of Nodes identified by NodeIds, type NodeId = Int -- We stress: Node is NOT a recursive data structure, so the comparison -- of Node values takes constant time! data Node = NConst Int | NVar String | NAdd !NodeId !NodeId deriving (Eq,Ord,Show) -- we could use several bimaps for different operations (for -- addition, subtraction, etc). newtype DAG = DAG (BiMap Node) deriving Show -- * BiMap -- (partial) bijection a <-> Int -- The mapping between |Node|s and |NodeId|s is realized -- through a BiMap interface -- BiMap a establishes a bijection between the values of the type |a| -- and integers, with the operations to retrieve the value given its key, -- to find the key for the existing value, and to extend the bijection -- with a new association. -- * // -- * Bottom-up DAG construction -- As we construct a node for a subexpression, we check if the DAG already -- has the equal node. If so, we return its |NodeId|; otherwise, we add the node -- to the DAG. -- The computation is stateful (we are using the State monad) -- Hash-consing proper: insert Node into the DAG if it isn't -- there already, and return its hash code. hashcons :: Node -> State DAG NodeId hashcons e = do DAG m <- get case lookup_key e m of Nothing -> let (k,m') = insert e m in put (DAG m') >> return k Just k -> return k -- * Bottom-up DAG construction: tagless-final helps -- The bottom-up DAG construction maps well to computing a representation -- for a tagless-final expression, which is also evaluated bottom-up. The -- DAG construction can therefore be written as a tagless-final -- interpreter, an instance of the type class Exp. The interpreter maps -- a tagless-final expression to the concrete representation -- * Interpreting Exp as a Node in the current DAG newtype N t = N{unN :: State DAG NodeId} instance Exp N where constant x = N(hashcons $ NConst x) variable x = N(hashcons $ NVar x) add e1 e2 = N(do h1 <- unN e1 h2 <- unN e2 hashcons $ NAdd h1 h2) -- * The state is hidden behind the tagless-final veneer -- run the DAG-construction interpreter and the node, -- as a reference within a DAG. run_expN :: N t -> (NodeId, DAG) run_expN (N m) = runState m (DAG empty) -- We re-interpret exp_mul4 differently this time. -- The DAG-representation makes the sharing patent -- A DAG is printed as the list of |(NodeId,Node)| associations. The -- sharing of the left and right summands below is patent test_sm4 = run_expN exp_mul4 -- (2,DAG BiMap[(0,NVar "i1"),(1,NAdd 0 0),(2,NAdd 1 1)]) test_sm8 = run_expN exp_mul8 -- (3,DAG BiMap[(0,NVar "i1"),(1,NAdd 0 0),(2,NAdd 1 1),(3,NAdd 2 2)]) -- * We have constructed netlists, topologically sorted -- a netlist is a low-level representation of a circuit listing the gates and -- their connections, used in circuit manufacturing. Since our BiMap allocated -- monotonically increasing NodeIds, the resulting netlist comes out -- topologically sorted. Therefore, we can straightforwardly generate machine -- code after the standard register allocation. -- We retain all the information about exp_mul4. In addition, all -- sharing is fully explicit. As we can see, the evaluation process finds -- common subexpressions automatically. -- * // -- ------------------------------------------------------------------------ -- * Superficially `effectless' common sub-expression elimination -- sklansky example by Matthew Naylor, with further credit to -- Chalmers folk, Mary Sheeran and Emil Axelsson. -- It is said to be similar to scanl1, but contains more parallelism -- The function sklansky is defined in ExpI.hs -- * To remind what sklansky produces: test_sklansky_o n = ExpI.sklansky addition xs where addition x y = "(" ++ x ++ "+" ++ y ++ ")" xs = Prelude.map (("v"++) . show) [1..n] -- (v1+v2) is used three times {- test_sklansky_o 4 ["v1","(v1+v2)","((v1+v2)+v3)","((v1+v2)+(v3+v4))"] -} -- (v1+v2) is used seven times {- test_sklansky_o 8 ["v1","(v1+v2)", "((v1+v2)+v3)", "((v1+v2)+(v3+v4))", "(((v1+v2)+(v3+v4))+v5)", "(((v1+v2)+(v3+v4))+(v5+v6))", "(((v1+v2)+(v3+v4))+((v5+v6)+v7))", "(((v1+v2)+(v3+v4))+((v5+v6)+(v7+v8)))"] -} -- * sklansky challenge: sharing across expressions, not within an expression -- * We re-write in the tagless-final style -- * scratch that: we use sklansky as it was -- :t ExpI.sklansky -- ExpI.sklansky :: (a -> a -> a) -> [a] -> [a] -- Actually, there is no re-write -- We use Matthew Naylor's code, in pure Haskell, as it was -- * // -- We run it differently though test_sklansky n = runState sk (DAG empty) where sk = sequence (map unN (ExpI.sklansky add xs)) xs = map (variable . show) [1..n] -- Implicit sharing works: hash code 2 (for v1+v2) is used three times {- *ExpF> test_sklansky 4 ([0,2,4,7], DAG BiMap[(0,NVar "1"),(1,NVar "2"), (2,NAdd 0 1),(3,NVar "3"), (4,NAdd 2 3),(5,NVar "4"), (6,NAdd 3 5),(7,NAdd 2 6)]) -} -- We indeed obtained the set of nodes all pointing within the same DAG -- The stateful nature was well-hidden till the very end (running) -- We see the deep sharing: hash code 2, which is v1+v2, is used twice. -- Then code 7, which is (v1+v2)+(v3+v4), is used 4 times -- after being computed {- *ExpF> test_sklansky 8 ([0,2,4,7,9,12,15,19], DAG BiMap[(0,NVar "1"),(1,NVar "2"), (2,NAdd 0 1),(3,NVar "3"), (4,NAdd 2 3),(5,NVar "4"), (6,NAdd 3 5),(7,NAdd 2 6), -- just as it was for test_sklansky 4 (8,NVar "5"),(9,NAdd 7 8), (10,NVar "6"),(11,NAdd 8 10), (12,NAdd 7 11),(13,NVar "7"), (14,NAdd 11 13),(15,NAdd 7 14), (16,NVar "8"),(17,NAdd 13 16), (18,NAdd 11 17),(19,NAdd 7 18)]) -} -- * // -- ------------------------------------------------------------------------ -- * Performance is still a problem -- We have demonstrated the sharing detection technique that represents -- a DSL program as a DAG, eliminating multiple occurrences of common -- subexpressions. Alas, to find all these common subexpressions we have -- to examine the entire expression _tree_, which may take long time for large -- programs (large circuits). -- See ExpLet.hs for the exponential speed-up. -- To force the evaluation do_bench :: (a,DAG) -> Int do_bench (_,DAG d) = size d bench_mul n = do_bench $ run_expN (mul n (variable "i")) -- It takes effort to find the common subexpressions, just because -- we have to traverse the whole tree. bench12 = bench_mul (2^12) bench13 = bench_mul (2^13) {- -- Interpreted code *ExpF> bench_mul (2^11) 12 (0.06 secs, 4688952 bytes) *ExpF> bench_mul (2^12) 13 (0.09 secs, 5344760 bytes) *ExpF> bench_mul (2^13) 14 (0.20 secs, 9791092 bytes) -- compiled with -O2 Prelude ExpF> bench_mul (2^20) 21 (0.44 secs, 80752592 bytes) Prelude ExpF> bench_mul (2^21) 22 (0.87 secs, 160434744 bytes) -} {- *ExpF> test_sklansky 128 ([0,2,4,7,9,12,15,19,21, ... 560,567,575], ...) (0.37 secs, 13650184 bytes) -} -- Not so bad... The implicit sharing detection is quite fast bench_skl n = do_bench $ test_sklansky n {- -- Interpreted code *ExpF> bench_skl 128 576 (0.33 secs, 10854396 bytes) *ExpF> bench_skl 256 1280 (1.79 secs, 38952968 bytes) -- compiled with -O2 Prelude ExpF> bench_skl 128 576 (0.02 secs, 2163436 bytes) Prelude ExpF> bench_skl 256 1280 (0.08 secs, 4333380 bytes) Prelude ExpF> bench_skl 512 2816 (0.37 secs, 13908376 bytes) -} main = do print test_shb print test_sh4 print test_val4 print test_sm4 print test_sm8 print $ test_sklansky_o 4 print $ test_sklansky 4