-- Haskell98!

-- Contrasting Lazy and Iteratee IO on the example of Unix wc:
-- counting lines and words in a file and in a sequence of files
--
-- See the message posted on Haskell-Cafe on Sep 19, 2008:
--   http://www.haskell.org/pipermail/haskell-cafe/2008-September/047738.html
-- for more discussion.
-- The message above refers to an older version of the code.
-- The new numbers are given below.

-- To compile this code
--   ghc --make -O2 -main-is Main.main_nl_iter Wc.hs

module Main where

import System.Environment
import IterateeM

-- Return the first command-line argument (the name of the file to process).
-- When the program is started without arguments, fail with a usage message
-- rather than with an opaque pattern-match failure.
require_file_arg :: IO String
require_file_arg = do
  args <- getArgs
  case args of
    name:_ -> return name
    []     -> do
      prog <- getProgName
      ioError (userError ("usage: " ++ prog ++ " FILE"))

-- Counting lines in a file

{-
time wc -l /usr/share/dict/words
  235882 /usr/share/dict/words

real    0m0.026s
user    0m0.026s
sys     0m0.001s
-}

-- Counting lines with Lazy IO: the baseline
-- http://www.haskell.org/pipermail/haskell-cafe/2008-September/047729.html
-- Lazy readFile is used here on purpose: it is the thing being measured.
main_nl_lazy :: IO ()
main_nl_lazy = do
  name <- require_file_arg
  file <- readFile name
  print $ length $ lines file

{-
time ./Wc /usr/share/dict/words
235882

real    0m0.295s
user    0m0.271s
sys     0m0.024s
-}

-- Count NL in a stream of characters.
-- The running total is forced (with $!) before the next chunk is awaited.
-- Any stream value other than a Chunk finishes the iteratee with the total
-- accumulated so far.
-- No type signature is given on purpose, so that the inferred type of the
-- counter stays exactly what it was.
count_nl = ie_cont $ step 0
 where
 step acc (Chunk str) = ie_cont (step $! acc + count str)
 step acc stream      = ie_done acc stream
 -- the number of newline characters in one chunk
 count []         = 0
 count ('\n':str) = succ $! count str
 count (_:str)    = count str

-- Iteratee-based solution. It seems faster than lazy IO
main_nl_iter :: IO ()
main_nl_iter = do
  name <- require_file_arg
  counter <- run (enum_file name $$ count_nl)
  print counter

{-
time ./Wc /usr/share/dict/words
opened file /usr/share/dict/words
closed file /usr/share/dict/words
235882

real    0m0.137s
user    0m0.135s
sys     0m0.003s
-}

-- Counting words in a sequence of files, whose names are given
-- on the command line

-- Count the stream. This could have been in the IterateeM library
-- Empty and one-element chunks are handled by separate clauses; longer
-- chunks contribute their length. The total is forced before the next
-- chunk is awaited.
stream_count :: Monad m => Iteratee el m Int
stream_count = ie_cont $ step 0
 where
 step acc (Chunk [])  = ie_cont (step acc)
 step acc (Chunk [_]) = ie_cont (step $! succ acc)
 step acc (Chunk ls)  = ie_cont (step $! acc + length ls)
 step acc stream      = ie_done acc stream

-- For warm-up, we count words in one file, and in two files
count_words_1file :: IO ()
count_words_1file = do
  name <- require_file_arg
  counter <- run $ joinI $ enum_file name $$ enum_words $$ stream_count
  print counter

-- The two file names are fixed; they are bound by name rather than picked
-- out of a list by index, which keeps the code free of partial (!!).
count_words_2files :: IO ()
count_words_2files = do
  let file0 = "/etc/motd"
      file1 = "/etc/resolv.conf"
  let r = enum_file file1 $$ enum_file file0 $$ (enum_words $$ stream_count)
  counter <- run $ joinI $ r
  print counter

-- Counting words in all files given on the command-line
-- Example usage:
--   find /usr/local/share/doc/ghc6 -name \*.html -print | time xargs Wc
-- That counts words in 1174 files.

-- Iteratee-based solution
-- With no file names at all, the fold yields just enum_eof.
main_word_iter :: IO ()
main_word_iter = do
  names <- getArgs
  let enumerators = foldr (\name -> (enum_file name >.)) enum_eof names
  counter <- run $ joinI $ enumerators $$ enum_words $$ stream_count
  print counter

{-
The composition of enumerators corresponds to the `concatenation' of
their sources. Declaratively, the meaning of the above code is:
  -- all given files are concatenated
  -- the resulting stream of characters is converted to a stream of words
  -- the stream of words is counted.

Operationally, the code does not open more than one file at a time.
More importantly, the code *never* reads more than 4096 characters at
a time. A block of the file is read, split into words, counted, and
only then another chunk is read. After one file is done, it is closed,
and another file is processed. One can see that only one file is being
opened at a time by enabling traces. The processing is fully
incremental.

-- Sample output:
opened file /usr/local/share/doc/ghc6/Cabal/authors.html
closed file /usr/local/share/doc/ghc6/Cabal/authors.html
opened file /usr/local/share/doc/ghc6/Cabal/bugs.html
closed file /usr/local/share/doc/ghc6/Cabal/bugs.html
..
3057829
       21.64 real        19.73 user         0.53 sys

We emphasize that we do not open more than one file at a time: we open
the next file only after we are done with the previous one.
-}

-- The lazy IO solution, for contrast
--
-- On the above example, it aborts with an error:
--   openFile: resource exhausted (Too many open files)
-- Indeed, one of the main drawbacks of Lazy IO is resource mismanagement.
-- On GHC 6.8.3, we get a worse error:
--   Wc: internal error: awaitEvent: descriptor out of range
--     (GHC version 6.8.3 for i386_unknown_freebsd)
--     Please report this as a GHC bug:  http://www.haskell.org/ghc/reportabug
--        18.69 real        16.62 user         0.73 sys
main_word_lazy :: IO ()
main_word_lazy = do
  names <- getArgs
  files <- mapM readFile names
  print $ length $ words (concat files)

main :: IO ()
main = main_word_iter