####################################################################################################

# simulating (using the *monte carlo* approach to studying)
# single-number betting in american roulette,
# in the freeware data science environment R

# unbuffer the output, if relevant 

# set the working directory to the place where you want to store
# any output from this R session

setwd( 'C:/DD/Teaching/AMS-STAT-131/Spring-2020' )

# create the population data set and print it

print( population.data.set <- c( rep( -1, 37 ), 35 ) )

#  [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
# [28] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 35

# set the number n of $1 plays of a single number (the sample size)

n <- 1000

# set the number M of simulation replications in the repeated-sampling data set

M <- 1000000

# set a pseudo-random number generator seed, to make the results replicable

random.number.generator.seed <- 4391557

set.seed( random.number.generator.seed )

# create an empty version of the repeated-sampling data set,
# to be filled below

repeated.sampling.data.set <- rep( NA, M )

# fill the repeated-sampling data set with sums of n IID draws from the population
# data set, and print out how much clock time this took

system.time( {

  for ( m in 1:M ) {

    simulated.sample.data.set <- sample( population.data.set, n, replace = T )

    repeated.sampling.data.set[ m ] <- sum( simulated.sample.data.set )

    if ( ( m %% 10000 ) == 0 ) print( m )

  }

} 

)

# on my home desktop, single-threaded on an Intel Core i7 3770 CPU
# at 3.40GHz, 100000 simulation replications takes about 3 seconds
# and 10 * 100000 = 1000000 replications takes almost exactly
# 10 times as long

#  user  system elapsed       n = 1000   M = 100000
#   2.7     0.0     2.7

#  user  system elapsed       n = 1000   M = 1000000 
# 26.31    0.03   26.36

# my home desktop has 4 hyper-threaded cores, each with 2 threads,
# for an effective number of CPUs of 4 * 2 = 8

# with a little more effort (not shown here),
# the loop above could be parallelized, so that with M = 1000000
# each of the 8 threads can do 1000000 / 8 = 125000 simulation replications
# simultaneously; this will cut the clock time
# by a factor of 7.5 or so (not 8.0, because a bit
# of overhead is needed to keep track of the parallel processing)

# create a new output variable that's 1 every time the simulated sum
# is positive and 0 otherwise

is.the.sum.positive <- 0 + ( repeated.sampling.data.set > 0 )

# create the monte carlo data set, with M rows and 2 columns

monte.carlo.data.set <- cbind( repeated.sampling.data.set, is.the.sum.positive )

# look at the first 5 rows of the monte carlo data set

monte.carlo.data.set[ 1:5, ]

#       repeated.sampling.data.set is.the.sum.positive
#  [1,]                       -496                   0
#  [2,]                        332                   1
#  [3,]                       -136                   0
#  [4,]                          8                   1
#  [5,]                       -208                   0

# the estimated probability p.hat of coming out ahead is now
# just the mean of the 1s and 0s in the second column of
# the monte carlo data set:

estimated.probability.of.coming.out.ahead <- mean( is.the.sum.positive )

# later in the course we'll see that one important feature
# of the monte carlo method is that we can get two kinds of results:
# the estimated probability (p.hat), and an estimate of how accurate
# p.hat is (this accuracy estimate is called the standard error of p.hat)

monte.carlo.standard.error.of.estimated.probability.of.coming.out.ahead <-
  sqrt( estimated.probability.of.coming.out.ahead *
  ( 1 - estimated.probability.of.coming.out.ahead ) / M )

# now print out the numerical results

paste( 'n =', n, 'M =', M, 'seed =', random.number.generator.seed )

# [1] "n = 1000 M = 1e+06 seed = 4391557"

paste( 'estimated probability of coming out ahead = ',
  estimated.probability.of.coming.out.ahead, ', give or take ', 
  signif( monte.carlo.standard.error.of.estimated.probability.of.coming.out.ahead, 4 ),
  sep = '' )

# [1] "estimated probability of coming out ahead = 0.397075, give or take 0.0004893"

# in lecture, soon we'll talk about an extremely useful
# graphical summary of a numerical data vector (such as
# the M = 1000000 sums in the repeated sampling data set)
# called a histogram

hist( repeated.sampling.data.set, prob = T, main = '', xlab = 'Your Net Gain',
  breaks = 'FD' )

abline( v = 0, lwd = 2, col = 'red' )

# and here's a numerical summary of the M = 1000000 randomly generated sums;
# this table is called a *relative frequency distribution* of the sums:

table( repeated.sampling.data.set ) / M

# repeated.sampling.data.set

#     -856     -784     -748     -712     -676     -640     -604     -568     -532     -496     -460 
# 0.000001 0.000001 0.000004 0.000015 0.000059 0.000129 0.000365 0.000763 0.001604 0.003070 0.005237 

#     -424     -388     -352     -316     -280     -244     -208     -172     -136     -100      -64 
# 0.009049 0.013958 0.020687 0.028949 0.038244 0.048570 0.058727 0.067242 0.073689 0.077270 0.078549 

#      -28        8       44       80      116      152      188      224      260      296      332 
# 0.076743 0.072423 0.065783 0.056999 0.048386 0.039781 0.031229 0.024368 0.018191 0.013029 0.009366 

#      368      404      440      476      512      548      584      620      656      692      728 
# 0.006326 0.004170 0.002764 0.001686 0.001038 0.000665 0.000360 0.000224 0.000139 0.000071 0.000034 

#      764      800      836      872      908 
# 0.000024 0.000013 0.000003 0.000001 0.000002

# the single most probable outcome is $-64 (i.e., we lose $64),
# but all of the possibilities in the set 

#   $ { -208, -172, -136, -100, -64, -28, +8, +44, +80 }

# have at least a 5% chance of occurring

# let's finish this case study with a surprise

# so far we've shown that if we make n = 1000 $1 bets on a single number,
# we have about a 40% chance of coming out ahead

# since the casino is in business to make money,
# you would think that this probability can never be made
# bigger than 50%, no matter how many $1 bets we make

# but watch what happens with n = 35:

n <- 35

M <- 1000000

random.number.generator.seed <- 4391557

set.seed( random.number.generator.seed )

repeated.sampling.data.set <- rep( NA, M )

system.time( {

  for ( m in 1:M ) {

    simulated.sample.data.set <- sample( population.data.set, n, replace = T )

    repeated.sampling.data.set[ m ] <- sum( simulated.sample.data.set )

  }

} 

)

#   user  system elapsed 
#   6.36    0.02    6.37

is.the.sum.positive <- 0 + ( repeated.sampling.data.set > 0 )

estimated.probability.of.coming.out.ahead <- mean( is.the.sum.positive )

monte.carlo.standard.error.of.estimated.probability.of.coming.out.ahead <-
  sqrt( estimated.probability.of.coming.out.ahead *
  ( 1 - estimated.probability.of.coming.out.ahead ) / M )

paste( 'n =', n, 'M =', M, 'seed =', random.number.generator.seed )

# [1] "n = 35 M = 1e+06 seed = 4391557"

paste( 'estimated probability of coming out ahead = ',
  estimated.probability.of.coming.out.ahead, ', give or take ', 
  signif( monte.carlo.standard.error.of.estimated.probability.of.coming.out.ahead, 4 ),
  sep = '' )

# [1] "estimated probability of coming out ahead = 0.607516, give or take 0.0004883"

# if we play n = 35 times, we have a 60.8% chance of coming out ahead (!)

# when we study what's called the *binomial* distribution
# a bit later in the course, we'll see that the
# probability of coming out ahead as a function of n
# can be computed in an environment such as R
# without using simulation:

winning.probability.with.single.number.bets <- function( n ) {

  return( 1 - pbinom( n / 36, n, 1 / 38 ) )

}

cbind( 1:100, winning.probability.with.single.number.bets( 1:100 ) )

#        [,1]       [,2]
#   [1,]    1 0.02631579
#   [2,]    2 0.05193906
#   [3,]    3 0.07688803
#   [4,]    4 0.10118045
#   [5,]    5 0.12483360

#    .
#    .
#    .

#  [33,]   33 0.58523872
#  [34,]   34 0.59615349
#  [35,]   35 0.60678103
#  [36,]   36 0.24460566
#  [37,]   37 0.25440891

# the binomial calculation can easily be generalized
# to be correct for *any* bet in roulette

# with n as the number of $1 plays, p as our chance of winning
# on any single play, and w as our net gain
# if we do win:

winning.probability.roulette <- function( n, p, w ) {

  return( 1 - pbinom( n / ( w + 1 ), n, p ) )

}

plot.winning.probability.roulette <- function( n, p, w, title ) {

  plot( 1:n, winning.probability.roulette( 1:n, p, w ),
    type = 'l', lwd = 2, col = 'red', xlab = 'Number of Plays',
    ylab = 'P( coming out ahead )', main = title )

  maximum.winning.probability <- 
    max( winning.probability.roulette( 1:n, p, w ) )

  n.maximum.winning.probability <- ( 1:n )[ 
    winning.probability.roulette( 1:n, p, w ) ==
    maximum.winning.probability ]

  abline( v = n.maximum.winning.probability, lwd = 2, col = 'darkcyan', lty = 2 )

  abline( h = maximum.winning.probability, lwd = 2, col = 'darkcyan', lty = 2 )

  abline( h = 0.5, lwd = 2, col = 'black', lty = 1 )

}

plot.winning.probability.roulette( 500, 1 / 38, 35, 'Single Number' )

# this seems like a paradox: how can the casino stay in business
# offering us a gambling strategy with the property that 
# we have almost a 61% chance of winning?

# the secret that unlocks the paradox will become clear
# later in the quarter, when we cover the concept of
# *expected value* of a *random variable*:

# yes, with n = 1000 $1 plays on a single number,
# we have a 60.8% chance of coming out ahead,
# but *on average*, across many gamblers
# all using our strategy, the casino *expects*
# to collect a bit less than $2 from each of us

hist( repeated.sampling.data.set, prob = T, main = '', xlab = 'Your Net Gain',
  breaks = 'FD' )

abline( v = 0, lwd = 2, col = 'red' )

table( repeated.sampling.data.set ) / M

# repeated.sampling.data.set

#      -35        1       37       73      109      145      181      217      253 
# 0.392484 0.372695 0.170882 0.050914 0.010897 0.001849 0.000247 0.000030 0.000002

mean( repeated.sampling.data.set )

# [1] -1.817288

# this number is a monte carlo approximation to what we will soon call
# the *expected value* of our net gain after n = 1000 $1 bets
# on a single number

# crucially from the casino's point of view,
# this number has a minus sign in front of it;
# in other words, on average we expect to lose about $1.82,
# even though we have a 60.8% chance of coming out ahead

# probability theory is full of surprises ...

####################################################################################################