#simulations to check the predicted values for various statistics 
#derived from McClelland's mathematical derivations related to 
#Gelman & Park (2008)

options(digits = 3)
sampleSize <- 10000

# sample x from a standord normal distribution
x <- rnorm(sampleSize,0,1)
# let y = x + e where e is from a standard normal distribution
y <- 1*x + rnorm(sampleSize,0,1)

#this should be 0.707
cor(x,y)

#use the empirical quantiles to split into upper and lower groups
quant <- quantile(x, probs=c(.27,1-.27))
#should be {-0.613, 0.613}
quant


#do the splitting.  the middle obs will have NAs
#the upper and lower groups will have xc values equal to the mean x in the respective groups
xc <- rep(NA,sampleSize)
xc[x < quant[1]] <- mean(x[x < quant[1]])
xc[x > quant[2]] <- mean(x[x > quant[2]])


#create a dataset with middle observations excluded
ends <- na.omit(data.frame(xc, x, y))

#the cell means should be {-1.225,1.225}
tapply(ends$xc, ends$xc, mean)

#the variance of the original x in the end groups should be 1.75
var(ends$x)

# variance of the discrete values of x should be 1.5
var(ends$xc)

# correlation between the discrete values of x and the remainig values of y
# should be 0.738
cor(ends$xc, ends$y)

#grabbing the variance of the estimate of the slope is a bit complicated
#we get it from the variance-covariance matrix of the parameter estimates
#it should be 0.0001
vb <- vcov(alllm <- lm(y ~ x))[2,2]
vb

#and for the split analysis it should be 0.000154
vbsplit <- vcov(cutlm <- lm(ends$y ~ ends$xc))[2,2]
vbsplit

#and the relative efficiency should be 0.647
vb/vbsplit

#for interest, here are the two model summaries
#note that both analyses are unbiased so the slope estimates in both should be 1.0
#and note that the residual error is greater in the cut analysis.  

summary(alllm)

summary(cutlm)