###############################################################################
###############################################################################

# R code to explore the concepts of expected value, variance,
# and standard deviation

# unbuffer the output

# set the working directory to the place you want to store,
# e.g., any pdf versions of plots

###############################################################################

# first, let's look at the expected value of the
# binomial distribution that arose in the tay-sachs case study:

# let Y record the number of tay-sachs babies in the family
# of n = 5 children, with both parents carriers, so that
# the probability of any single child having the disease
# is p = 0.25

# the support of the discrete random variable Y is

#   S_Y = { 0, 1, ..., n } = { 0, 1, 2, 3, 4, 5 }

# and Y has a pmf, not a pdf; recall that the pmf of the
# binomial( n, p ) distribution, for y in S_Y, is

#   f_Y ( y ) = ( n choose y ) * p^y * ( 1 - p )^( n - y )

n <- 5
p <- 0.25

binomial.pmf <- dbinom( 0:n, n, p )

cbind( 0:n, binomial.pmf, cumsum( binomial.pmf ) )

# here is the relative frequency distribution of Y
# (the pmf) and Y's cdf:

#        pmf             cdf
#   y    P( Y = y )      F_Y ( y ) = P( Y <= y )

#   0    0.2373046875    0.2373047
#   1    0.3955078125    0.6328125
#   2    0.2636718750    0.8964844
#   3    0.0878906250    0.9843750
#   4    0.0146484375    0.9990234
#   5    0.0009765625    1.0000000

# here's a spike plot of this pmf:

plot( 0, 0, xlim = c( -0.5, 5.5 ), ylim = c( 0, 0.4 ), xlab = 'y',
  ylab = 'P( Y = y )', type = 'n',
  main = 'Spike plot of the Binomial( 5, 0.25 ) distribution' )

# 'segments' is vectorized, so one call draws all 6 spikes at once:

segments( 0:n, 0, 0:n, binomial.pmf, lwd = 2, col = 'red' )

# in this class we concentrate on three measures of
# the *center* of a probability distribution: mean,
# median, and mode

# the mode (the y having the biggest pmf) of this distribution
# is clearly 1

# and the median is also 1, because if we sweep up
# all the probability from - infinity to 0 (inclusive)
# we've only accumulated about 23.7% of the probability,
# but if we sweep from - infinity to 1 (inclusive)
# we've caught about 63.3%

# the median is the 50%/50% point, which is in between
# 23.7% and 63.3%; with discrete distributions people agree
# to set the median equal to the y value for which

#   F_Y ( y - 1 ) < 0.5   and   F_Y ( y ) >= 0.5
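# before moving on: here's a small sketch (my addition, not part of the
# original script) that finds the mode and median programmatically;
# because the cdf is non-decreasing, the median rule above is equivalent
# to taking the smallest y in S_Y with F_Y ( y ) >= 0.5; the object names
# 'mode.of.Y' and 'median.of.Y' are my own illustrative choices

binomial.cdf <- cumsum( binomial.pmf )

# 'which.max' returns the index of the largest pmf value:

print( mode.of.Y <- ( 0:n )[ which.max( binomial.pmf ) ] )

# [1] 1

# smallest y whose cdf value reaches or passes 0.5:

print( median.of.Y <- min( ( 0:n )[ binomial.cdf >= 0.5 ] ) )

# [1] 1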
# what about the mean, otherwise known as the expected value
# of the random variable Y? it's the balance point, or center of mass,
# of the distribution

# as usual, this can be computed in two ways: math and simulation

# for a discrete random variable, the math looks like this:

#   E( Y ) = sum, over all y values in the support of Y,
#            of terms of the form y * P( Y = y )

# R can compute this for the specific values of n and p
# in the tay-sachs case study:

print( expected.value <- sum( ( 0:n ) * binomial.pmf ) )

# [1] 1.25

# wolfram alpha can compute it for general n and p; the code is
# as follows:

# first, let's verify that the pmf does indeed sum to 1
# over all y values in S_Y:

# wolfram alpha code begins below this line

# sum of ( n choose y ) * p^y * ( 1 - p )^( n - y ) for y from 0 to n

# now let's get the expected value of Y for general n and p:

# sum of y * ( n choose y ) * p^y * ( 1 - p )^( n - y ) for y from 0 to n

# wolfram alpha code ends above this line

# so the expected value of the binomial( n, p ) distribution is n * p

# here's the simulation approach in R:

set.seed( 123454321 )

M <- 1000000

y.star <- rbinom( M, n, p )

mean( y.star )

# [1] 1.250012

# you can see that this is not as satisfying as the math approach:
# we could *conjecture* that with n = 5 and p = 0.25 the expected value
# is 5 * 0.25 = 1.25, but we've only demonstrated this up to
# monte carlo noise in the 6th and 7th significant figures

# here's a histogram of the y.star values, which (as you know)
# is a bar-graph version of the spike plot:

hist( y.star, prob = T, breaks = ( 0 - 0.5 ):( n + 0.5 ), xlab = 'y',
  main = 'Histogram of the Binomial( 5, 0.25 ) pmf' )

# and here you can compare the two plots:

par( mfrow = c( 2, 1 ) )

plot( 0, 0, xlim = c( -0.5, 5.5 ), ylim = c( 0, 0.4 ), xlab = 'y',
  ylab = 'P( Y = y )', type = 'n',
  main = 'Spike plot of the Binomial( 5, 0.25 ) distribution' )

segments( 0:n, 0, 0:n, binomial.pmf, lwd = 2, col = 'red' )

hist( y.star, prob = T, breaks = ( 0 - 0.5 ):( n + 0.5 ), xlab = 'y',
  ylab = 'PMF', main = 'Histogram of the Binomial( 5, 0.25 ) pmf' )

par( mfrow = c( 1, 1 ) )
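# before leaving the expected value: here's a small numerical sanity
# check (my addition, not part of the original script) that the sum
# formula for E( Y ) agrees with the wolfram alpha answer n * p for a
# few arbitrary ( n, p ) pairs; the helper function name 'binomial.mean'
# and the extra ( n, p ) pairs are my own illustrative choices

binomial.mean <- function( n, p ) {

  return( sum( ( 0:n ) * dbinom( 0:n, n, p ) ) )

}

print( c( binomial.mean( 5, 0.25 ), 5 * 0.25 ) )

# [1] 1.25 1.25

print( c( binomial.mean( 10, 0.3 ), 10 * 0.3 ) )

# [1] 3 3

print( c( binomial.mean( 17, 0.62 ), 17 * 0.62 ) )

# [1] 10.54 10.54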
###############################################################################

# now, what about the variance and standard deviation of this distribution?

# the second thing you would think of doing, after summarizing
# the center of a distribution, is to quantify how *spread out* it is

# in this class we focus on three measures of spread:
# the interquartile range, the variance, and the standard deviation (SD)

# here let's just look at the variance and the SD

# the variance of a random variable Y with mean mu_Y is defined to be

#   variance of Y = expected value of the derived random variable
#                   ( Y - mu_Y )^2

# and the SD is just the square root of the variance

# for a discrete random variable, such as Y in the tay-sachs case study,
# the variance expression becomes

#   V( Y ) = sum, over all y values in the support of Y,
#            of terms of the form ( y - mu_Y )^2 * P( Y = y )

# notation: i use V( Y ) for the variance of Y;
# degroot and schervish (and many other people)
# use Var( Y ); but 'Var' is inconsistent with the notation
# E( Y ) that everybody uses for the expected value,
# so i stick with V( Y ) for the variance

# R can compute this for the specific values of n and p
# in the tay-sachs case study:

mu.Y <- expected.value

print( variance <- sum( ( 0:n - mu.Y )^2 * binomial.pmf ) )

# [1] 0.9375

# wolfram alpha can compute this for general n and p; the code is
# as follows:

# wolfram alpha code begins below this line

# sum of ( y - n * p )^2 * ( n choose y ) * p^y * ( 1 - p )^( n - y ) for y from 0 to n

# wolfram alpha code ends above this line

# so the variance of the binomial( n, p ) distribution is n * p * ( 1 - p ),
# and the SD of that distribution is sqrt( n * p * ( 1 - p ) )

# R can of course do this numerically and exactly for the specific
# values of n and p:

print( SD <- sqrt( sum( ( 0:n - mu.Y )^2 * binomial.pmf ) ) )

# [1] 0.9682458

# the simulation approach is easy: having already generated 1,000,000
# random draws from the binomial( 5, 0.25 ) distribution and stored them
# in the R object y.star, we can just use the built-in functions 'var'
# and 'sd':

print( c( var( y.star ), sd( y.star ) ) )

# [1] 0.9383789 0.9686996

# these are only monte-carlo accurate to about 3 or 4 significant figures,
# even with 1,000,000 simulated IID draws, whereas the monte-carlo
# estimate of the mean was accurate to about 5 sigfigs (why are the
# variance and SD less accurately estimated via monte carlo than the mean?)

# technical note: 'var' and 'sd' in R divide by ( M - 1 ) rather than M,
# a difference that vanishes as the sample size M increases; here, with
# M = 1000000, it would only show up in about the 6th significant figure

###############################################################################

# in this class, the interpretation of the expected value (mean)
# and standard deviation (SD) of a random variable is as follows:

# let mu_Y and sigma_Y be the mean and SD of Y, respectively; then

#   each time a Y value is drawn at random from the pmf of Y,
#   we expect that Y value to be around mu_Y, give or take about sigma_Y

# in other words, when i say 'give or take' i mean
# 'plus or minus 1 SD around the mean'
# (a quick numerical check of this interpretation appears
# at the end of this script)

# this terminology is standard in some books but not in others;
# for example, degroot and schervish never mention the phrase 'give or take'

###############################################################################

# if time permits: how about the expected value, variance, and standard
# deviation of the continuous Uniform( a, b ) distribution, for real a and b
# with a < b?
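# before tackling that, here's the quick numerical check (my addition,
# not part of the original script) of the 'give or take' interpretation
# above: what fraction of the 1,000,000 simulated binomial( 5, 0.25 )
# draws fall within 1 SD of the mean?

mean( abs( y.star - mu.Y ) <= SD )

# only y = 1 and y = 2 lie in ( mu.Y - SD, mu.Y + SD ), so this should
# come out close to P( Y = 1 ) + P( Y = 2 ), which is about 0.659, i.e.,
# roughly two-thirds of the draws land within mu_Y give or take sigma_Y

# and here's one possible simulation sketch of the uniform( a, b )
# question (also my addition, not part of the original script; the
# values a = 2 and b = 10 are arbitrary illustrative choices)

# the math answers, from integrating y * f_Y ( y ) and
# ( y - mu_Y )^2 * f_Y ( y ) over ( a, b ) with f_Y ( y ) = 1 / ( b - a ), are

#   E( Y ) = ( a + b ) / 2   and   V( Y ) = ( b - a )^2 / 12

a <- 2
b <- 10

# M = 1000000 iid draws from the Uniform( a, b ) distribution:

u.star <- runif( M, a, b )

print( c( mean( u.star ), ( a + b ) / 2 ) )

# these should agree up to monte carlo noise, and similarly for the
# variance and SD:

print( c( var( u.star ), ( b - a )^2 / 12 ) )

print( c( sd( u.star ), ( b - a ) / sqrt( 12 ) ) )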
###############################################################################
###############################################################################