###############################################################################
###############################################################################

# R code to explore the concepts of expected value, variance,
# and standard deviation

# unbuffer the output

# set the working directory to the place you want to store,
# e.g., any pdf versions of plots

###############################################################################

# first, let's look at the expected value of the
# binomial distribution that arose in the tay-sachs case study:

# let Y record the number of tay-sachs babies in the family
# of n = 5 children, with both parents carriers, so that
# the probability of any single child having the disease
# is p = 0.25

# the support of the discrete random variable Y is

#   S_Y = { 0, 1, ..., n } = { 0, 1, 2, 3, 4, 5 }

# and Y has a pmf, not a pdf; recall that the pmf of the
# binomial( n, p ) distribution, for y in S_Y, is

#   f_Y ( y ) = ( n choose y ) * p^y * ( 1 - p )^( n - y )

n <- 5
p <- 0.25

binomial.pmf <- dbinom( 0:n, n, p )

cbind( 0:n, binomial.pmf, cumsum( binomial.pmf ) )

# here is the relative frequency distribution of Y
# (the pmf) and Y's cdf:

#        pmf             cdf
#   y    P( Y = y )      F_Y ( y ) = P( Y <= y )

#   0    0.2373046875    0.2373047
#   1    0.3955078125    0.6328125
#   2    0.2636718750    0.8964844
#   3    0.0878906250    0.9843750
#   4    0.0146484375    0.9990234
#   5    0.0009765625    1.0000000

# here's a spike plot of this pmf:

plot( 0, 0, xlim = c( -0.5, 5.5 ), ylim = c( 0, 0.4 ), xlab = 'y',
  ylab = 'P( Y = y )', type = 'n',
  main = 'Spike plot of the Binomial( 5, 0.25 ) distribution' )

# 'segments' is vectorized, so one call draws all 6 spikes at once:

segments( 0:n, 0, 0:n, binomial.pmf, lwd = 2, col = 'red' )

# in this class we concentrate on three measures of
# the *center* of a probability distribution: mean,
# median, and mode

# the mode (the y having the biggest pmf) of this distribution
# is clearly 1

# and the median is also 1, because if we sweep up
# all the probability from - infinity to 0 (inclusive)
# we've only accumulated about 23.7% of the probability,
# but if we sweep from - infinity to 1 (inclusive)
# we've caught about 63.3%

# the median is the 50%/50% point, which is in between
# 23.7% and 63.3%; with discrete distributions people agree
# to set the median equal to the y value for which

#   F_Y ( y - 1 ) < 0.5   and   F_Y ( y ) >= 0.5
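# before moving on: here's a small sketch (my addition, not part of the
# original script) that finds the mode and median programmatically;
# because the cdf is non-decreasing, the median rule above is equivalent
# to taking the smallest y in S_Y with F_Y ( y ) >= 0.5; the object names
# 'mode.of.Y' and 'median.of.Y' are my own illustrative choices

binomial.cdf <- cumsum( binomial.pmf )

# 'which.max' returns the index of the largest pmf value:

print( mode.of.Y <- ( 0:n )[ which.max( binomial.pmf ) ] )

# [1] 1

# smallest y whose cdf value reaches or passes 0.5:

print( median.of.Y <- min( ( 0:n )[ binomial.cdf >= 0.5 ] ) )

# [1] 1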
# what about the mean, otherwise known as the expected value
# of the random variable Y? it's the balance point, or center of mass,
# of the distribution

# as usual, this can be computed in two ways: math and simulation

# for a discrete random variable, the math looks like this:

#   E( Y ) = sum, over all y values in the support of Y,
#            of terms of the form y * P( Y = y )

# R can compute this for the specific values of n and p
# in the tay-sachs case study:

print( expected.value <- sum( ( 0:n ) * binomial.pmf ) )

# [1] 1.25

# wolfram alpha can compute it for general n and p; the code is
# as follows:

# first, let's verify that the pmf does indeed sum to 1
# over all y values in S_Y:

# wolfram alpha code begins below this line

# sum of ( n choose y ) * p^y * ( 1 - p )^( n - y ) for y from 0 to n

# now let's get the expected value of Y for general n and p:

# sum of y * ( n choose y ) * p^y * ( 1 - p )^( n - y ) for y from 0 to n

# wolfram alpha code ends above this line

# so the expected value of the binomial( n, p ) distribution is n * p

# here's the simulation approach in R:

set.seed( 123454321 )

M <- 1000000

y.star <- rbinom( M, n, p )

mean( y.star )

# [1] 1.250012

# you can see that this is not as satisfying as the math approach:
# we could *conjecture* that with n = 5 and p = 0.25 the expected value
# is 5 * 0.25 = 1.25, but we've only demonstrated this up to
# monte carlo noise in the 6th and 7th significant figures

# here's a histogram of the y.star values, which (as you know)
# is a bar-graph version of the spike plot:

hist( y.star, prob = T, breaks = ( 0 - 0.5 ):( n + 0.5 ), xlab = 'y',
  main = 'Histogram of the Binomial( 5, 0.25 ) pmf' )

# and here you can compare the two plots:

par( mfrow = c( 2, 1 ) )

plot( 0, 0, xlim = c( -0.5, 5.5 ), ylim = c( 0, 0.4 ), xlab = 'y',
  ylab = 'P( Y = y )', type = 'n',
  main = 'Spike plot of the Binomial( 5, 0.25 ) distribution' )

segments( 0:n, 0, 0:n, binomial.pmf, lwd = 2, col = 'red' )

hist( y.star, prob = T, breaks = ( 0 - 0.5 ):( n + 0.5 ), xlab = 'y',
  ylab = 'PMF', main = 'Histogram of the Binomial( 5, 0.25 ) pmf' )

par( mfrow = c( 1, 1 ) )
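# before leaving the expected value: here's a small numerical sanity
# check (my addition, not part of the original script) that the sum
# formula for E( Y ) agrees with the wolfram alpha answer n * p for a
# few arbitrary ( n, p ) pairs; the helper function name 'binomial.mean'
# and the extra ( n, p ) pairs are my own illustrative choices

binomial.mean <- function( n, p ) {

  return( sum( ( 0:n ) * dbinom( 0:n, n, p ) ) )

}

print( c( binomial.mean( 5, 0.25 ), 5 * 0.25 ) )

# [1] 1.25 1.25

print( c( binomial.mean( 10, 0.3 ), 10 * 0.3 ) )

# [1] 3 3

print( c( binomial.mean( 17, 0.62 ), 17 * 0.62 ) )

# [1] 10.54 10.54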
###############################################################################

# now, what about the variance and standard deviation of this distribution?

# the second thing you would think of doing, after summarizing
# the center of a distribution, is to quantify how *spread out* it is

# in this class we focus on three measures of spread:
# the interquartile range, the variance, and the standard deviation (SD)

# here let's just look at the variance and the SD

# the variance of a random variable Y with mean mu_Y is defined to be

#   variance of Y = expected value of the derived random variable
#                   ( Y - mu_Y )^2

# and the SD is just the square root of the variance

# for a discrete random variable, such as Y in the tay-sachs case study,
# the variance expression becomes

#   V( Y ) = sum, over all y values in the support of Y,
#            of terms of the form ( y - mu_Y )^2 * P( Y = y )

# notation: i use V( Y ) for the variance of Y;
# degroot and schervish (and many other people)
# use Var( Y ); but 'Var' is inconsistent with the notation
# E( Y ) that everybody uses for the expected value,
# so i stick with V( Y ) for the variance

# R can compute this for the specific values of n and p
# in the tay-sachs case study:

mu.Y <- expected.value

print( variance <- sum( ( 0:n - mu.Y )^2 * binomial.pmf ) )

# [1] 0.9375

# wolfram alpha can compute this for general n and p; the code is
# as follows:

# wolfram alpha code begins below this line

# sum of ( y - n * p )^2 * ( n choose y ) * p^y * ( 1 - p )^( n - y ) for y from 0 to n

# wolfram alpha code ends above this line

# so the variance of the binomial( n, p ) distribution is n * p * ( 1 - p ),
# and the SD of that distribution is sqrt( n * p * ( 1 - p ) )

# R can of course do this numerically and exactly for the specific
# values of n and p:

print( SD <- sqrt( sum( ( 0:n - mu.Y )^2 * binomial.pmf ) ) )

# [1] 0.9682458

# the simulation approach is easy: having already generated 1,000,000
# random draws from the binomial( 5, 0.25 ) distribution and stored them
# in the R object y.star, we can just use the built-in functions 'var'
# and 'sd':

print( c( var( y.star ), sd( y.star ) ) )

# [1] 0.9383789 0.9686996

# these are only monte-carlo accurate to about 3 or 4 significant figures,
# even with 1,000,000 simulated IID draws, whereas the monte-carlo
# estimate of the mean was accurate to about 5 sigfigs (why are the
# variance and SD less accurately estimated via monte carlo than the mean?)

# technical note: 'var' and 'sd' in R divide by ( M - 1 ) rather than M,
# a difference that vanishes as the sample size M increases; here, with
# M = 1000000, it would only show up in about the 6th significant figure

###############################################################################

# in this class, the interpretation of the expected value (mean)
# and standard deviation (SD) of a random variable is as follows:

# let mu_Y and sigma_Y be the mean and SD of Y, respectively; then

#   each time a Y value is drawn at random from the pmf of Y,
#   we expect that Y value to be around mu_Y, give or take about sigma_Y

# in other words, when i say 'give or take' i mean
# 'plus or minus 1 SD around the mean'
# (a quick numerical check of this interpretation appears
# at the end of this script)

# this terminology is standard in some books but not in others;
# for example, degroot and schervish never mention the phrase 'give or take'

###############################################################################

# if time permits: how about the expected value, variance, and standard
# deviation of the continuous Uniform( a, b ) distribution, for real a and b
# with a < b?
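# before tackling that, here's the quick numerical check (my addition,
# not part of the original script) of the 'give or take' interpretation
# above: what fraction of the 1,000,000 simulated binomial( 5, 0.25 )
# draws fall within 1 SD of the mean?

mean( abs( y.star - mu.Y ) <= SD )

# only y = 1 and y = 2 lie in ( mu.Y - SD, mu.Y + SD ), so this should
# come out close to P( Y = 1 ) + P( Y = 2 ), which is about 0.659, i.e.,
# roughly two-thirds of the draws land within mu_Y give or take sigma_Y

# and here's one possible simulation sketch of the uniform( a, b )
# question (also my addition, not part of the original script; the
# values a = 2 and b = 10 are arbitrary illustrative choices)

# the math answers, from integrating y * f_Y ( y ) and
# ( y - mu_Y )^2 * f_Y ( y ) over ( a, b ) with f_Y ( y ) = 1 / ( b - a ), are

#   E( Y ) = ( a + b ) / 2   and   V( Y ) = ( b - a )^2 / 12

a <- 2
b <- 10

# M = 1000000 iid draws from the Uniform( a, b ) distribution:

u.star <- runif( M, a, b )

print( c( mean( u.star ), ( a + b ) / 2 ) )

# these should agree up to monte carlo noise, and similarly for the
# variance and SD:

print( c( var( u.star ), ( b - a )^2 / 12 ) )

print( c( sd( u.star ), ( b - a ) / sqrt( 12 ) ) )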
###############################################################################
###############################################################################