-- Search the stream for a given string module SearchString (search_string) where import IterateeM import Prelude hiding (dropWhile) import Control.Monad.Identity -- for tests -- Search the stream for a given string -- If the string is found, the result is True and -- the stream contains the first element after the found string. -- This iteratee does no buffering and never reads (ahead) more than -- really necessary to locate the string. search_string :: (Eq e, Monad m) => [e] -> Iteratee e m Bool {- This is the Iteratee version of the Scheme function find-string-from-port? STR IN-PORT MAX-NO-CHARS written back in 1995. The algorithm is a simple variation on KMP, which works well when the string to search is short (as is often the case when this function is used). Here are the old notes on the algorithm ; A special care should be taken in a situation when one had achieved a partial ; match with (a head of) STR, and then some unexpected character appeared in ; the stream. It'll be rash to discard all already read characters. Consider ; an example of string "acab" and the stream "bacacab...", specifically when ; a c a _b_ ; b a c a c a b ... ; that is, when 'aca' had matched, but then 'c' showed up in the stream ; while we were looking for 'b'. In that case, discarding all already read ; characters and starting the matching process from scratch, that is, ; from 'c a b ...', would miss a certain match. ; We don't actually need to keep already read characters, or at least ; strlen(str) characters in some kind of buffer. If there has been no match, ; we can safely discard read characters. If there was some partial match, ; we already know the characters before, they are in the STR itself, so ; we don't need a special buffer for that. -} search_string [] = return True -- Optimization for a one-element string search_string [e] = do dropWhile (/= e) headM >>= maybe (return False) (const (return True)) search_string (h:str) = loop where loop = do dropWhile (/= h) headM >>= maybe (return False) (const (loop_suffix 0)) -- loop_suffix k assumes that the prefix of size (k+1) of (h:str) -- has matched already. -- loop_suffix tries to match the suffix of str then. loop_suffix k = heads (Prelude.drop k str) >>= check . (+k) -- Now we have matched the (n+1) prefix of (h:str) check n | n == strl = return True -- nothing matched at all for non-empty str. check 0 = loop -- partial match, of the prefix of (h:str) of size (n+1) -- Check to see if there was a smaller partial match: if there is -- such i>0 that substr(str,0,j) = substr(str,i,n) -- where j = n - i check n = check_loop n self_matches check_loop _ [] = loop check_loop n ((b,_):_) | b >= n = loop check_loop n ((_,e):t) | e < n = check_loop n t check_loop n ((b,_):_) = loop_suffix (n-b-1) strl = length str -- all the locations where a non-empty prefix of -- (h:str) matches within a suffix of (h:str) -- (that is, within str or its suffix) self_matches = sl_loop 0 str -- The length of the common prefix of two strings common (h1:t1) (h2:t2) | h1 == h2 = 1 + common t1 t2 common _ _ = 0 sl_loop _ [] = [] sl_loop k (h1:t) | h == h1 = (k,k+1+common t str) : sl_loop (k+1) t sl_loop k (_:t) = sl_loop (k+1) t -- self_matches when (h:str) is "abracadabra" -- [(2,3),(4,5),(6,10),(9,10)] -- tests tests = and [True == go "acab" "bacacabd", False == go "acad" "bacacabd", True == go "bd" "bacacabd", False == go "be" "bacacabd", True == go "b" "aaaba", False == go "b" "a", False == go "ab" "a", True == go "ab" "aab", True == go "ab" "aaaaaab", True == go "ab" "abracadabra", True == go "abracadabraX" "abracadabracadabraX" ] where go pat str = runIdentity $ run =<< enum_pure_1chunk str (search_string pat)