{-
 The colored ball example, example 1.1 from

 http://people.csail.mit.edu/milch/papers/blog-chapter.pdf

``An urn contains an unknown number of balls--say, a number chosen from
a Poisson or a uniform distributions. Balls are equally likely to be blue or
green. We draw some balls from the urn, observing the color of each
and replacing it. We cannot tell two identically colored balls apart;
furthermore, observed colors are wrong with probability 0.2.  How many
balls are in the urn? Was the same ball drawn twice?''

-}

module ColoredBalls where

import Syntax
import Util
import qualified Data.IntMap as M
import Data.IntMap ((!))
import Control.Applicative

import qualified Debug.Trace as TR


-- ---------------  The colored-ball model

data Color = Blue | Green deriving (Eq, Show)

opposite_color :: Color -> Color
opposite_color Blue  = Green
opposite_color Green = Blue

-- Distribution for the observing of a ball, with the failure rate 20%
observed_color color =
  categorical [(color,0.8),(opposite_color color,0.2)]

maxBalls :: Int
maxBalls = 8

-- Create the prior for the max amount of balls that may be in the urn
balls_prior n = do
  balls <- sequence . replicate n $ dist uniformly (pure [Blue,Green])
  return $ M.fromList $ zip [1..] balls

{-
Problem with the following model: nballs and b below are strongly correlated.
Hence if nballs is drawn as 5 and b is drawn as, say, 3, all the
proposals to resample nbalss as 1 or 2 will be rejected.

-- The model takes an array of observations as an argument
cballs_model :: [Color] -> Model Int
cballs_model obs = do
  balls  <- balls_prior maxBalls
  nballs <- dist uniformly (pure [1..maxBalls])
  -- The same ball can be drawn twice as we return the drawn balls
  -- into the urn. A ball keeps its true color, no matter how many
  -- times it is drawn.
  let observe_ball color = do
        b <- dist uniformly ((\n -> [1..n]) <$> nballs)
        -- It is more efficient to create if statements, rather than
        -- create an SExp(IntMap Color).
        let observe i =
              let true_color = balls ! i in
              if_ ((== i) <$> b) $
                dist (color `condition` categorical)
                     ((\bc -> [(bc,0.8),(opposite_color bc,0.2)]) <$>
                      true_color)
        foldr observe (return (pure color)) [1..maxBalls]
  mapM_ observe_ball obs
  return nballs
-}

-- Submodel: Given balls color and the observations, perform conditioning
balls_observe :: M.IntMap (SExp Color) -> [Color] -> Int -> Model ()
balls_observe balls obs nballs = mapM_ observe_ball obs >> return (pure ())
 where
  -- The same ball can be drawn twice as we return the drawn balls
  -- into the urn. A ball keeps its true color, no matter how many
  -- times it is drawn.
   observe_ball color = do
        b <- dist uniformly (pure [1..nballs])
        -- It is more efficient to create if statements, rather than
        -- create an SExp(IntMap Color).
        let observe i =
              let true_color = balls ! i in
              if_ ((== i) <$> b) $
                dist (color `condition` observed_color) true_color
        foldr observe (return (pure color)) [1..maxBalls]

-- The model takes a list of observations as an argument
cballs_model :: [Color] -> Model Int
cballs_model obs = do
  balls  <- balls_prior maxBalls
  nballs <- dist uniformly (pure [1..maxBalls])
  let obs_number i =
        if_ ((== i) <$> nballs) $ balls_observe balls obs i
  foldr obs_number (return (pure())) [1..maxBalls]
  return nballs

cbr = hist $ mcmC 10000 (cballs_model (replicate 10 Blue))

-- fromList [(1,4265),(2,2203),(3,1244),(4,805),(5,486),(6,291),(7,565),(8,141)]
-- Good agreement

{-
   Sec 1.6.3, Fig 1.7.
   Ten balls were drawn, and all appeared blue

let t1 = sample_importance (random_selector 17) 10000 (model_nballs 
		 [|Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;|]);;

let
    [(0.031942788306915014, V 8); (0.0422088248548995876, V 7);
     (0.0481489259210673634, V 6); (0.0611501366775565255, V 5);
     (0.0843776503618006435, V 4); (0.123159075103266638, V 3);
     (0.203898291338526577, V 2); (0.405114307435967658, V 1)] =
 snd (Inference.normalize t1);;

(* Do a more serious run, accumulating the statistics *)
let dist_add l1 l2 =
  List.map2 (fun (p1,v1) (p2,v2) ->
    assert (v1 = v2); (p1 +. p2, v1)) l1 l2;;

let average_it ntrials nsamples model =
  let rec loop acc = function 0 -> acc
    | n -> loop (dist_add acc (sample_importance 
				 (random_selector (17+n)) nsamples model))
	        (pred n)
  in snd (Inference.normalize 
	    (loop (sample_importance (random_selector (17+ntrials)) 
		     nsamples model) 
	       (pred ntrials)))
;;

(* It took about a minute *)

(*
let t2 = average_it 5 5000 (model_nballs 
		 [|Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;|]);;

let 
  [(0.0318386565368576138, V 8); (0.0384083436381660176, V 7);
   (0.0445344709122082669, V 6); (0.055953476873295431, V 5);
   (0.0777907061915841, V 4); (0.123751456856916756, V 3);
   (0.211354336837841883, V 2); (0.416368552153129956, V 1)]
 = t2;;
*)

(* The results seem to be in an excellent agreement with Fig 1.7a;
   It seems, 5,000 samples are sufficient
*)

(* We now try the second query: Was the same ball drawn twice? *)

let model_duplicate obs () =
  let nballs = nballs_prior () in
  (* The same ball can be drawn twice as we return the drawn balls
     into the urn. A ball keeps its true color, no matter how many
     times it is drawn.
   *)
  let ball_color = memo (fun b -> uniformly [|Blue; Green|]) in
  (* perform observations *)
  let drawn = Array.fold_left (fun acc obs_color -> 
    let b = uniform_range 1 nballs in
    if observed_color (ball_color b) = obs_color 
    then PMap.insert_with (+) b 1 acc else fail ()) 
    PMap.empty obs
  in PMap.fold (fun x y -> min x y) drawn (1 + (Array.length obs))
;;


let td = sample_importance (random_selector 17) 20000 (model_duplicate
		 [|Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;Blue;|]);;

let   [(0.00673060540800008485, V 10); (0.000852212937600005, V 5);
       (0.00140560366560001852, V 4); (0.00132793777440001672, V 3);
       (0.00193966835520000447, V 2); (0.00410314812000006201, V 1)] = td;;

let [(0.411426914210096362, V 10); (0.0520938783233282424, V 5);
     (0.0859214206871847103, V 4); (0.0811738777814881435, V 3);
     (0.118567605378018456, V 2); (0.250816303619884162, V 1)] =
  snd (Inference.normalize td);;

(* There is 25% chance some ball was drawn only once *)


-}