#simulations to check the predicted values for various statistics #derived from McClelland's mathematical derivations related to #Gelman & Park (2008) options(digits = 3) sampleSize <- 10000 # sample x from a standord normal distribution x <- rnorm(sampleSize,0,1) # let y = x + e where e is from a standard normal distribution y <- 1*x + rnorm(sampleSize,0,1) #this should be 0.707 cor(x,y) #use the empirical quantiles to split into upper and lower groups quant <- quantile(x, probs=c(.27,1-.27)) #should be {-0.613, 0.613} quant #do the splitting. the middle obs will have NAs #the upper and lower groups will have xc values equal to the mean x in the respective groups xc <- rep(NA,sampleSize) xc[x < quant[1]] <- mean(x[x < quant[1]]) xc[x > quant[2]] <- mean(x[x > quant[2]]) #create a dataset with middle observations excluded ends <- na.omit(data.frame(xc, x, y)) #the cell means should be {-1.225,1.225} tapply(ends$xc, ends$xc, mean) #the variance of the original x in the end groups should be 1.75 var(ends$x) # variance of the discrete values of x should be 1.5 var(ends$xc) # correlation between the discrete values of x and the remainig values of y # should be 0.738 cor(ends$xc, ends$y) #grabbing the variance of the estimate of the slope is a bit complicated #we get it from the variance-covariance matrix of the parameter estimates #it should be 0.0001 vb <- vcov(alllm <- lm(y ~ x))[2,2] vb #and for the split analysis it should be 0.000154 vbsplit <- vcov(cutlm <- lm(ends$y ~ ends$xc))[2,2] vbsplit #and the relative efficiency should be 0.647 vb/vbsplit #for interest, here are the two model summaries #note that both analyses are unbiased so the slope estimates in both should be 1.0 #and note that the residual error is greater in the cut analysis. summary(alllm) summary(cutlm)