{-# OPTIONS -fglasgow-exts #-} -- Haskell98 plus higher-ranked types only -- Knuth-Morris-Pratt (packed) string matching -- The code is fast and safe: _all_ string/array access operations -- need no index bound check because the safety of access operations -- is guaranteed statically. module KMP where {- The following ML code is downloaded from \url{http://www-2.cs.cmu.edu/~hwxi/DML/examples/kmpHard.mini} KMP is an imperative algorithm, which also uses indices in a creative way: Index -1 (which is deliberately out-of-bounds) is used as a special value. The code below is actually Dependent ML. The stuff after <| are DML declarations (of dependent types). The function 'sub' is a DML array access operation with no bound check. The function 'length' is a dependently-typed built-in function to obtain the run-time value of the length of a string or any other array. The functions arrayShift, subShift and updateShift are creator, accessor and setter for the shiftArray, whose element type is dependent on the run-time value (the length of the pattern string). The kmpHard.mini file downloaded from the above URL had three small issues in the algorithm, which are corrected below. Incidentally, even in the DML code the dependent types did not -- and were not intended to -- assure the total correctness. Only the safety of array access was at stake. Thus, writing test code is still a good idea; we have to write less test code now. The Haskell code has a simple test suite, see below. (* * This is an implementation of the Knuth-Morris-Pratt string matching * algorithm in standard ML. Also this demonstrates some interesting questions * involved with array bounds checks. *) structure KMP = struct assert sub <| {size:nat, i:int | 0 <= i < size} 'a array(size) * int(i) -> 'a and length <| {n:nat} 'a array(n) -> int(n) fun{slen:nat, plen:nat} kmpMatch(str, pat) = let type intShift = [i:int| 0 <= i+1 < plen ] int(i) assert arrayShift <| {size:nat} int(size) * intShift -> intShift array(size) and subShift <| {size:nat, i:int | 0 <= i < size} intShift array(size) * int(i) -> intShift and updateShift <| {size:nat, i:int | 0 <= i < size} intShift array(size) * int(i) * intShift -> unit val slen = length(str) and plen = length(pat) val shiftArray = arrayShift(plen, ~1) fun loopShift(i, j) = (* calculate the shift array *) if (j = plen) then () else if sub(pat, j) <> sub(pat, i+1) then if (i >= 0) then loopShift(subShift(shiftArray, i), j) else loopShift(~1, j+1) else ((if (i+1 < j) then updateShift(shiftArray, j, i+1) else ()) <| unit; loopShift(subShift(shiftArray, j), j+1)) where loopShift <| {j:int | 0 < j <= plen} intShift * int(j) -> unit val _ = loopShift(~1, 1) fun loop(s, p) = (* this the main search function *) if p < plen then if s < slen then if sub(str, s) = sub(pat, p) then loop(s+1, p+1) else if (p = 0) then loop(s+1, p) else loop(s, subShift(shiftArray, p-1)+1) else ~1 else s - plen where loop <| {s:nat, p:nat | s <= slen /\ p <= plen} int(s) * int(p) -> int in loop(0, 0) end where kmpMatch <| int array(slen) * int array(plen) -> int end -} -- To properly model the DML code above, we will be using -- packed strings. We will model the shiftArray with a STArray. import Data.PackedString import Data.Array.Unboxed import Data.Array.ST import Control.Monad.ST -- The function below is not actually present in the DML code, because -- the DML code does not define how empty strings or patterns are handled. -- Our Haskell approach however forces us to confront the issue, -- and resolve what happens when both the string and the pattern are empty. -- BTW, I don't understand what would happen if the DML function -- kmpMatch is invoked with an empty pattern. Probably some obscure -- compiler error because the type intShift becomes unpopulated. kmpMatch str pat = brandPS str (\bstrlen -> brandPS pat (\bpatlen -> runST (kmpMatch' bstrlen bpatlen)) 0 -- empty pattern, matches the beginning of (non-empty)string ) (-1) -- empty string, doesn't match any pattern -- The KMP algorithm itself, for non-empty str and pat kmpMatch' (bstr,slen) (bpat,plen) = do shiftArray <- arrayShift plen index_m1 let -- loopShift :: IntShift r -> BIndexP1 r -> ST s () loopShift i j = -- calculate the shift array index_p_lt j plen (Else $ return ()) (\j' -> let i1 = intshift_succ i in if bpat !. j' /= bpat !. i1 then index_m_gt i index_m1 (Else $ loopShift index_m1 (index_succ j')) (\i' -> do i'' <- subShift shiftArray i' loopShift i'' j) else do index_lt i1 j' (Else $ return ()) (updateShift shiftArray j') i'' <- subShift shiftArray j' loopShift i'' (index_succ j') ) loopShift index_m1 index_p1 let -- loop :: Nat -> Nat -> ST s Int loop s p = -- this the main search function nat_p_lt p plen (Else $ return $ (unNat s) - (unP1 plen)) (\p' -> nat_p_lt s slen (Else $ return (-1)) (\s' -> if bstr !. s' == bpat !. p' then loop (nat_succ s) (nat_succ p) else index_pred p' (Else $ loop (nat_succ s) p) (\p1 -> do i <- subShift shiftArray p1 loop s $ i2n (intshift_succ i)) ) ) loop nat_0 nat_0 {- Notes on the code: We could have replaced index_p_lt, nat_p_lt, etc. with one (typeclass) overloaded function, e.g., <. However, we prefer being explicit for now (and also avoid multi-parameter classes). It is instructive to compare the _inferred_ type of loopShift with the compare it with the DML annotation where loopShift <| {j:int | 0 < j <= plen} intShift * int(j) -> unit Ditto for the _inferred_ type of 'loop' vs. DML annotation: where loop <| {s:nat, p:nat | s <= slen /\ p <= plen} int(s) * int(p) -> int The DML annotations must be specified by the programmer; they can't be inferred. -} -- Tests test_kmp s1 s2 expected = if found == expected then True else error $ unwords ["Expected",show expected, "; found", show found] where found = kmpMatch (packString s1) (packString s2) test = all id [test_kmp "" "" (-1), test_kmp "1" "" 0, test_kmp "" "1" (-1), test_kmp "123" "123" 0, test_kmp "123" "1234" (-1), test_kmp "pirate" "rat" 2, test_kmp "pirate" "e" 5, test_kmp "pirate" "k" (-1), test_kmp "pirate" "pi" 0, test_kmp "pirate" "te" 4, test_kmp "outrage" "rat" (-1), test_kmp "pirate" "pit" (-1), test_kmp "pirate" "rate" 2, test_kmp "aaaaaaa" "aa" 0, test_kmp "pirate" "pirate" 0, test_kmp "pirate" "pirates" (-1), test_kmp "pirates" "pirate" 0, test_kmp "outrage" "ages" (-1), test_kmp "outrage" "" 0, test_kmp "bacacabd" "acab" 3, test_kmp "bacacabd" "acad" (-1), test_kmp "bacacabd" "bd" 6 ] -- ---------------------------------------------------------------------- -- The following is the implementation of the security kernel -- That is, the functions that correspond to DML's built-in -- dependently-typed functions sub, length, arrayShift, subShift and -- updateShift -- We should stress that all the tagging and untagging below -- involves 'newtype's and so has no run-time cost! -- -- The phantom type 'r' below is a type proxy for a positive natural -- number 'plen'. -- We don't know, at compile time, what that value is. However, our types -- specify propositions about the integers and plen; the implementation -- of the security kernel must be verified to make sure the propositions -- hold. -- The following newtypes must *not* be exported! -- If r is a type proxy for a natural (positive) 'plen', -- BIndex r is a type proxy for an integer i such that 0 <= i < plen -- Compare with the DML declaration {j:int | 0 <= j < plen} newtype BIndex r = BIndex Int -- |BPackedString r| is a non-empty packed string of the -- positive integer size represented by r. newtype BPackedString r = BPackedString PackedString -- brand the packed string brandPS:: PackedString -> (forall r. (BPackedString r, BIndexP1 r) -> w) -> w -> w brandPS str k kempty = let l = lengthPS str in if l > 0 then k (BPackedString str, BIndexP1 l) else kempty -- Access the packed string using the branded index -- The BIndex r type assures that the index is definitely within the bounds -- of the string BPackedString r. This, we could _safely_ use unsafeIndexPS. infixl 5 !. (!.):: BPackedString r -> BIndex r -> Char (BPackedString s) !. (BIndex i) = indexPS s i -- If r is a type proxy for a natural (positive) 'plen', -- BIndexP1 r is a type proxy for an integer i such that 0 < i <= plen -- Compare with the DML declaration {j:int | 0 < j <= plen} newtype BIndexP1 r = BIndexP1 Int -- If r is a type proxy for a natural (positive) 'plen', -- IntShift r is a type proxy for an integer i such -- that 0<=(i+1) < plen -- in DML: type intShift = [i:int| 0 <= i+1 < plen ] int(i) newtype IntShift r = IntShift Int -- I wish STUArray permitted newtype of Int... newtype BShiftArray r s = BShiftArray (STArray s Int (IntShift r)) -- Compare with the DML assertions about the three functions below -- in the DML code. arrayShift :: BIndexP1 r -> IntShift r -> ST s (BShiftArray r s) arrayShift (BIndexP1 r) e = newArray (0,r) e >>= return . BShiftArray -- Actually, here we should use unsafeReadArray, if available -- Compare the type signature below with the DML signature subShift :: BShiftArray r s -> BIndex r -> ST s (IntShift r) subShift (BShiftArray arr) (BIndex i) = readArray arr i -- again, we should use unsafeWriteArray updateShift :: BShiftArray r s -> BIndex r -> IntShift r -> ST s () updateShift (BShiftArray arr) (BIndex i) v = writeArray arr i v -- The index operation library. It is quite general purpose, actually newtype Else w = Else w -- Just a syntactic sugar unP1 (BIndexP1 i) = i -- forget the branding -- Safety proposition: -1 is within [i:int| 0 <= i+1 < plen ] -- for any positive plen, represented by r index_m1 :: IntShift r index_m1 = IntShift (-1) -- Safety proposition: 1 is within {j:int | 0 < j <= plen} -- for any positive plen, represented by r index_p1 :: BIndexP1 r index_p1 = BIndexP1 1 -- Safety proposition: just the definition of IntShift and BIndex intshift_succ :: IntShift r -> BIndex r intshift_succ (IntShift i) = BIndex (succ i) -- Safety proposition: definition of BIndex and BIndexP1 index_succ :: BIndex r -> BIndexP1 r index_succ (BIndex i) = BIndexP1 (succ i) -- Compare two BIndexP1. If the first is less than the second, -- invoke the onless continuation, passing the index converted to -- BIndex. -- Safety proposition: -- {i:int | 0 < i <= plen}, {j:int | 0 < j <= plen}, i < j -- ===> {i:int | 0 <= i < plen}. See the definition of BIndex then. index_p_lt :: BIndexP1 r -> BIndexP1 r -> Else w -> (BIndex r -> w) -> w index_p_lt (BIndexP1 i) (BIndexP1 j) (Else onother) onless = if i < j then onless (BIndex i) else onother -- Compare two IntShift. If the first is greater than the second, -- invoke the ongreater continuation, passing the index converted to -- BIndex. -- Safety proposition: -- {i:int | 0 <= (i+1) < plen}, {j:int | 0 <= (j+1) < plen}, i > j -- ===> {i:int | 0 <= i < plen}. See the definition of BIndex then. index_m_gt :: IntShift r -> IntShift r -> Else w -> (BIndex r -> w) -> w index_m_gt (IntShift i) (IntShift j) (Else onother) ongreater = if i > j then ongreater (BIndex i) else onother -- Compare two Indices. If the first is less than the second, -- invoke the onless continuation, passing the index converted to -- IntShift. -- Safety proposition: -- {i:int | 0 <= i < plen}, {j:int | 0 <= j < plen}, i < j -- ===> {i:int | -1 <= i < plen-1}. See the definition of IntShift then. index_lt:: BIndex r -> BIndex r -> Else w -> (IntShift r -> w) -> w index_lt (BIndex i) (BIndex j) (Else onother) onless = if i < j then onless (IntShift i) else onother -- Decrement the index, if possible. If the index is already zero, -- invoke the onzero continuation -- Safety proposition: -- {i:int | 0 <= i < plen}, i /= 0 ==> -- ===> {i:int | 0 <= (i-1) < plen}. See the definition of BIndex then. index_pred :: BIndex r -> Else w -> (BIndex r -> w) -> w index_pred (BIndex i) (Else onzero) onfurther = if i == 0 then onzero else onfurther (BIndex (pred i)) -- A type proxy for a non-negative integer newtype Nat = Nat Int unNat (Nat i) = i nat_0 = Nat 0 -- obviously, non-negative integer nat_succ :: Nat -> Nat nat_succ (Nat i) = Nat (succ i) -- obviously, Nat is closed over succ i2n :: BIndex r -> Nat i2n (BIndex i) = Nat i -- see the definition of Nat -- Compare a Nat with BIndexP1. If the first is less than the second, -- invoke the onless continuation passing the Nat converted to -- Index. -- Safety proposition: -- {i:int | 0 <= i}, {j:int | 0 < j <= plen}, i < j -- ===> {i:int | 0 <= i < plen}. See the definition of BIndex then. nat_p_lt:: Nat -> BIndexP1 r -> Else w -> (BIndex r -> w) -> w nat_p_lt (Nat i) (BIndexP1 j) (Else onother) onless = if i < j then onless (BIndex i) else onother