#################################################################################################### # simulating (using the *monte carlo* approach to studying) # single-number betting in american roulette, # in the freeware data science environment R # unbuffer the output, if relevant # set the working directory to the place where you want to store # any output from this R session setwd( 'C:/DD/Teaching/AMS-STAT-131/Spring-2020' ) # create the population data set and print it print( population.data.set <- c( rep( -1, 37 ), 35 ) ) # [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 # [28] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 35 # set the number n of $1 plays of a single number (the sample size) n <- 1000 # set the number M of simulation replications in the repeated-sampling data set M <- 1000000 # set a pseudo-random number generator seed, to make the results replicable random.number.generator.seed <- 4391557 set.seed( random.number.generator.seed ) # create an empty version of the repeated-sampling data set, # to be filled below repeated.sampling.data.set <- rep( NA, M ) # fill the repeated-sampling data set with sums of n IID draws from the population # data set, and print out how much clock time this took system.time( { for ( m in 1:M ) { simulated.sample.data.set <- sample( population.data.set, n, replace = T ) repeated.sampling.data.set[ m ] <- sum( simulated.sample.data.set ) if ( ( m %% 10000 ) == 0 ) print( m ) } } ) # on my home desktop, single-threaded on an Intel Core i7 3770 CPU # at 3.40GHz, 100000 simulation replications takes about 3 seconds # and 10 * 100000 = 1000000 replications takes almost exactly # 10 times as long # user system elapsed n = 1000 M = 100000 # 2.7 0.0 2.7 # user system elapsed n = 1000 M = 1000000 # 26.31 0.03 26.36 # my home desktop has 4 hyper-threaded cores, each with 2 threads, # for an effective number of CPUs of 4 * 2 = 8 # with a little more effort (not shown here), # the loop above could be parallelized, so that with M = 1000000 # each of the 8 threads can do 1000000 / 8 = 125000 simulation replications # simultaneously; this will cut the clock time # by a factor of 7.5 or so (not 8.0, because a bit # of overhead is needed to keep track of the parallel processing) # create a new output variable that's 1 every time the simulated sum # is positive and 0 otherwise is.the.sum.positive <- 0 + ( repeated.sampling.data.set > 0 ) # create the monte carlo data set, with M rows and 2 columns monte.carlo.data.set <- cbind( repeated.sampling.data.set, is.the.sum.positive ) # look at the first 5 rows of the monte carlo data set monte.carlo.data.set[ 1:5, ] # repeated.sampling.data.set is.the.sum.positive # [1,] -496 0 # [2,] 332 1 # [3,] -136 0 # [4,] 8 1 # [5,] -208 0 # the estimated probability p.hat of coming out ahead is now # just the mean of the 1s and 0s in the second column of # the monte carlo data set: estimated.probability.of.coming.out.ahead <- mean( is.the.sum.positive ) # later in the course we'll see that one important feature # of the monte carlo method is that we can get two kinds of results: # the estimated probability (p.hat), and an estimate of how accurate # p.hat is (this accuracy estimate is called the standard error of p.hat) monte.carlo.standard.error.of.estimated.probability.of.coming.out.ahead <- sqrt( estimated.probability.of.coming.out.ahead * ( 1 - estimated.probability.of.coming.out.ahead ) / M ) # now print out the numerical results paste( 'n =', n, 'M =', M, 'seed =', random.number.generator.seed ) # [1] "n = 1000 M = 1e+06 seed = 4391557" paste( 'estimated probability of coming out ahead = ', estimated.probability.of.coming.out.ahead, ', give or take ', signif( monte.carlo.standard.error.of.estimated.probability.of.coming.out.ahead, 4 ), sep = '' ) # [1] "estimated probability of coming out ahead = 0.397075, give or take 0.0004893" # in lecture, soon we'll talk about an extremely useful # graphical summary of a numerical data vector (such as # the M = 1000000 sums in the repeated sampling data set) # called a histogram hist( repeated.sampling.data.set, prob = T, main = '', xlab = 'Your Net Gain', breaks = 'FD' ) abline( v = 0, lwd = 2, col = 'red' ) # and here's a numerical summary of the M = 1000000 randomly generated sums; # this table is called a *relative frequency distribution* of the sums: table( repeated.sampling.data.set ) / M # repeated.sampling.data.set # -856 -784 -748 -712 -676 -640 -604 -568 -532 -496 -460 # 0.000001 0.000001 0.000004 0.000015 0.000059 0.000129 0.000365 0.000763 0.001604 0.003070 0.005237 # -424 -388 -352 -316 -280 -244 -208 -172 -136 -100 -64 # 0.009049 0.013958 0.020687 0.028949 0.038244 0.048570 0.058727 0.067242 0.073689 0.077270 0.078549 # -28 8 44 80 116 152 188 224 260 296 332 # 0.076743 0.072423 0.065783 0.056999 0.048386 0.039781 0.031229 0.024368 0.018191 0.013029 0.009366 # 368 404 440 476 512 548 584 620 656 692 728 # 0.006326 0.004170 0.002764 0.001686 0.001038 0.000665 0.000360 0.000224 0.000139 0.000071 0.000034 # 764 800 836 872 908 # 0.000024 0.000013 0.000003 0.000001 0.000002 # the single most probable outcome is $-64 (i.e., we lose $64), # but all of the possibilities in the set # $ { -208, -172, -136, -100, -64, -28, +8, +44, +80 } # have at least a 5% chance of occurring # let's finish this case study with a surprise # so far we've shown that if we make n = 1000 $1 bets on a single number, # we have about a 40% chance of coming out ahead # since the casino is in business to make money, # you would think that this probability can never be made # bigger than 50%, no matter how many $1 bets we make # but watch what happens with n = 35: n <- 35 M <- 1000000 random.number.generator.seed <- 4391557 set.seed( random.number.generator.seed ) repeated.sampling.data.set <- rep( NA, M ) system.time( { for ( m in 1:M ) { simulated.sample.data.set <- sample( population.data.set, n, replace = T ) repeated.sampling.data.set[ m ] <- sum( simulated.sample.data.set ) } } ) # user system elapsed # 6.36 0.02 6.37 is.the.sum.positive <- 0 + ( repeated.sampling.data.set > 0 ) estimated.probability.of.coming.out.ahead <- mean( is.the.sum.positive ) monte.carlo.standard.error.of.estimated.probability.of.coming.out.ahead <- sqrt( estimated.probability.of.coming.out.ahead * ( 1 - estimated.probability.of.coming.out.ahead ) / M ) paste( 'n =', n, 'M =', M, 'seed =', random.number.generator.seed ) # [1] "n = 35 M = 1e+06 seed = 4391557" paste( 'estimated probability of coming out ahead = ', estimated.probability.of.coming.out.ahead, ', give or take ', signif( monte.carlo.standard.error.of.estimated.probability.of.coming.out.ahead, 4 ), sep = '' ) # [1] "estimated probability of coming out ahead = 0.607516, give or take 0.0004883" # if we play n = 35 times, we have a 60.8% chance of coming out ahead (!) # when we study what's called the *binomial* distribution # a bit later in the course, we'll see that the # probability of coming out ahead as a function of n # can be computed in an environment such as R # without using simulation: winning.probability.with.single.number.bets <- function( n ) { return( 1 - pbinom( n / 36, n, 1 / 38 ) ) } cbind( 1:100, winning.probability.with.single.number.bets( 1:100 ) ) # [,1] [,2] # [1,] 1 0.02631579 # [2,] 2 0.05193906 # [3,] 3 0.07688803 # [4,] 4 0.10118045 # [5,] 5 0.12483360 # . # . # . # [33,] 33 0.58523872 # [34,] 34 0.59615349 # [35,] 35 0.60678103 # [36,] 36 0.24460566 # [37,] 37 0.25440891 # the binomial calculation can easily be generalized # to be correct for *any* bet in roulette # with n as the number of $1 plays, p as our chance of winning # on any single play, and w as our net gain # if we do win: winning.probability.roulette <- function( n, p, w ) { return( 1 - pbinom( n / ( w + 1 ), n, p ) ) } plot.winning.probability.roulette <- function( n, p, w, title ) { plot( 1:n, winning.probability.roulette( 1:n, p, w ), type = 'l', lwd = 2, col = 'red', xlab = 'Number of Plays', ylab = 'P( coming out ahead )', main = title ) maximum.winning.probability <- max( winning.probability.roulette( 1:n, p, w ) ) n.maximum.winning.probability <- ( 1:n )[ winning.probability.roulette( 1:n, p, w ) == maximum.winning.probability ] abline( v = n.maximum.winning.probability, lwd = 2, col = 'darkcyan', lty = 2 ) abline( h = maximum.winning.probability, lwd = 2, col = 'darkcyan', lty = 2 ) abline( h = 0.5, lwd = 2, col = 'black', lty = 1 ) } plot.winning.probability.roulette( 500, 1 / 38, 35, 'Single Number' ) # this seems like a paradox: how can the casino stay in business # offering us a gambling strategy with the property that # we have almost a 61% chance of winning? # the secret that unlocks the paradox will become clear # later in the quarter, when we cover the concept of # *expected value* of a *random variable*: # yes, with n = 1000 $1 plays on a single number, # we have a 60.8% chance of coming out ahead, # but *on average*, across many gamblers # all using our strategy, the casino *expects* # to collect a bit less than $2 from each of us hist( repeated.sampling.data.set, prob = T, main = '', xlab = 'Your Net Gain', breaks = 'FD' ) abline( v = 0, lwd = 2, col = 'red' ) table( repeated.sampling.data.set ) / M # repeated.sampling.data.set # -35 1 37 73 109 145 181 217 253 # 0.392484 0.372695 0.170882 0.050914 0.010897 0.001849 0.000247 0.000030 0.000002 mean( repeated.sampling.data.set ) # [1] -1.817288 # this number is a monte carlo approximation to what we will soon call # the *expected value* of our net gain after n = 1000 $1 bets # on a single number # crucially from the casino's point of view, # this number has a minus sign in front of it; # in other words, on average we expect to lose about $1.82, # even though we have a 60.8% chance of coming out ahead # probability theory is full of surprises ... ####################################################################################################