Experimental Design

Intro to Statistics and Experimental Design Video

Stats Intro Video (14 mins)

This video introduces different types of cause-and-effect relationships, null hypothesis testing, and types of variables. The lecture is geared towards biology but applies to most disciplines. Dr. Gotelli discusses the differences between explanatory (predictor) and response variables, continuous and discrete variables, and summarizes how to choose analyses and visualizations based on data types.

Example Code

Uniform distribution

library(tidyverse)

# uniform
# params specify minimum and maximum

# runif generates random draws
qplot(x=runif(n=100,min=0,max=5),color=I("black"),fill=I("goldenrod"))

# a larger sample gives a smoother, flatter histogram
qplot(runif(n=1000,min=0,max=5),color=I("black"),fill=I("goldenrod"))
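Beyond runif, every distribution in R comes with a d/p/q/r family of functions; a quick sketch for the uniform (the values in the comments are the exact analytic answers):

dunif(x=2.5,min=0,max=5) # density at x: 1/(max-min) = 0.2
punif(q=2.5,min=0,max=5) # cumulative probability P(X <= 2.5) = 0.5
qunif(p=0.5,min=0,max=5) # quantile: the value with half the probability below it = 2.5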

Normal distribution

# normal 
myNorm <- rnorm(n=100,mean=100,sd=2)
qplot(myNorm,color=I("black"),fill=I("goldenrod"))

# problems arise with a normal when the mean is small but values below zero are not allowed
myNorm <- rnorm(n=100,mean=2,sd=2)
qplot(myNorm,color=I("black"),fill=I("goldenrod"))

summary(myNorm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.9719  0.9542  2.1434  2.2089  3.5570  6.7063
tossZeroes <- myNorm[myNorm>0] # discard non-positive values; note this truncation inflates the mean
qplot(tossZeroes,color=I("black"),fill=I("goldenrod"))

summary(tossZeroes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1295  1.4300  2.7076  2.7412  3.7277  6.7063
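Note in the two summaries above how discarding the non-positive values shifts the mean upward. A minimal sketch of one standard alternative, inverse-CDF sampling from a normal truncated at zero (reusing mean=2 and sd=2 from above):

# sketch: sample directly from a zero-truncated normal via the inverse CDF
lowerP <- pnorm(q=0,mean=2,sd=2) # probability mass below zero
truncNorm <- qnorm(runif(n=100,min=lowerP,max=1),mean=2,sd=2)
summary(truncNorm) # all values > 0 by construction
qplot(truncNorm,color=I("black"),fill=I("goldenrod"))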

Gamma distribution

# gamma distribution, continuous positive values, but bounded at 0

myGamma <- rgamma(n=100,shape=1,scale=10)
qplot(myGamma,color=I("black"),fill=I("goldenrod"))

# a gamma with shape = 1 is an exponential distribution with mean = scale

# shape <= 1 gives a mode at zero; a very small shape piles nearly all draws at values indistinguishable from zero
myGamma <- rgamma(n=100,shape=0.1,scale=1)
qplot(myGamma,color=I("black"),fill=I("goldenrod"))

# a large shape parameter moves the distribution towards a normal
myGamma <- rgamma(n=100,shape=20,scale=1)
qplot(myGamma,color=I("black"),fill=I("goldenrod"))

# the scale parameter changes the mean, and the variance too!
qplot(rgamma(n=100,shape=2,scale=100),color=I("black"),fill=I("goldenrod"))

qplot(rgamma(n=100,shape=2,scale=10),color=I("black"),fill=I("goldenrod"))

qplot(rgamma(n=100,shape=2,scale=1),color=I("black"),fill=I("goldenrod"))

qplot(rgamma(n=100,shape=2,scale=0.1),color=I("black"),fill=I("goldenrod"))

# unlike the normal, the two parameters affect both mean and variance

# mean = shape*scale
# variance= shape*scale^2
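A minimal empirical check of those two formulas (a large random sample will only approximate the analytic values):

# sketch: check mean = shape*scale and variance = shape*scale^2
checkGamma <- rgamma(n=1e5,shape=2,scale=10)
mean(checkGamma) # should be close to 2*10 = 20
var(checkGamma)  # should be close to 2*10^2 = 200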

Beta distribution

# beta distribution 
# bounded at 0 and 1
# analogous to a binomial, but the result is a continuous distribution of probabilities
# parameter shape1 = number of successes + 1
# parameter shape2 = number of failures + 1
# interpret these in terms of a coin you are tossing

# shape1 = 1, shape2 = 1 = "no data"
myBeta <- rbeta(n=1000,shape1=1,shape2=1)
qplot(myBeta,xlim=c(0,1), color=I("black"),fill=I("goldenrod"), ylim = c(0,50))

# shape1 = 2, shape2 = 1 = "1 coin toss, comes up heads!"
myBeta <- rbeta(n=1000,shape1=2,shape2=1)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"))

# two tosses, 1 head and 1 tail
myBeta <- rbeta(n=1000,shape1=2,shape2=2)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"))

# two tosses, both heads (2 successes + 1 = 3, 0 failures + 1 = 1)
myBeta <- rbeta(n=1000,shape1=3,shape2=1)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"))

# let's get more data
myBeta <- rbeta(n=1000,shape1=20,shape2=20)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"))

myBeta <- rbeta(n=1000,shape1=500,shape2=500)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"))

# if the coin is biased
myBeta <- rbeta(n=1000,shape1=1000,shape2=500)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"))

myBeta <- rbeta(n=1000,shape1=10,shape2=5)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"))

# shape parameters less than 1.0 give us a u-shaped distribution
myBeta <- rbeta(n=1000,shape1=0.1,shape2=0.1)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"), ylim=c(0,50))

myBeta <- rbeta(n=1000,shape1=0.5,shape2=0.2)
qplot(myBeta,xlim=c(0,1),color=I("black"),fill=I("goldenrod"), ylim=c(0,100))
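Under this coin-toss reading, the distribution piles up around the observed proportion of heads; the analytic mean of a beta is shape1/(shape1 + shape2), which a one-line sketch can confirm:

# sketch: beta mean = shape1/(shape1 + shape2)
mean(rbeta(n=1e5,shape1=10,shape2=5)) # should be close to 10/15 = 0.667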

Estimating parameters from data

# estimating parameters from data
# maximum likelihood: choose the theta that maximizes P(data|theta)

# use the fitdistr function, feeding it data and a distribution name
library(MASS)
x <- rnorm(1000,mean=92.5,sd=2.5)
qplot(x,color=I("black"),fill=I("goldenrod"))

fitdistr(x,"normal")
##       mean           sd     
##   92.65104796    2.42752454 
##  ( 0.07676507) ( 0.05428110)
# compare to true parameters
# compare to parameters estimated from simple means and standard deviations
mean(x)
## [1] 92.65105
sd(x)
## [1] 2.428739
# but how do we "know" what distribution to fit?
fitdistr(x,"gamma")
##       shape           rate    
##   1455.2543384     15.7068309 
##  (  65.2678636) (   0.7045679)
z <- fitdistr(x,"gamma")

# find components of z
#str(z)
# rate = 1/scale
# so here is the estimate of the mean
z$estimate[1]/z$estimate[2]
##    shape 
## 92.65105
# and here is the estimate of the variance
z$estimate[1]/z$estimate[2]^2
##    shape 
## 5.898774
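One way to address the "which distribution?" question is to compare fits on likelihood grounds. fitdistr objects carry a log-likelihood, so AIC() works on them directly; a sketch, not a complete model-selection workflow:

# sketch: compare candidate distributions by AIC (lower is better)
AIC(fitdistr(x,"normal"))
AIC(fitdistr(x,"gamma"))
# because x really was drawn from a normal, the normal fit should
# usually (though not always) win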

Archetype Experimental Designs

  • independent versus dependent variables
  • discrete versus continuous variables
  • continuous variables may be integer or real
  • direction of cause and effect: the independent variable belongs on the x axis
  • discrete categories may come from natural, arbitrary, or statistical bins

Set-up

library(tidyverse)

Linear regression analysis in R

Regression (dependent: continuous, independent: continuous)

  • linear model of \(y = a + bx\)
  • statistical tests of the null hypothesis that the slope and/or intercept = 0
  • confidence and prediction intervals of uncertainty (sketched with predict() after the model fit below)
  • goodness of fit tests for linearity

Data Frame construction

n <- 50  # number of observations (rows)

varA <- runif(n) # random uniform values (independent)
varB <- runif(n) # a second, unrelated random column (dependent)
varC <- 5.5 + varA*10 # a deterministic linear function of varA (no noise term)
ID <- seq_len(n) # the sequence 1:n (seq_len is safe even when n = 0)
regData <- data.frame(ID,varA,varB,varC)
head(regData)
##   ID       varA       varB      varC
## 1  1 0.57559749 0.04938020 11.255975
## 2  2 0.01291939 0.77790333  5.629194
## 3  3 0.36984295 0.68628128  9.198430
## 4  4 0.84100871 0.70253375 13.910087
## 5  5 0.95281213 0.06491577 15.028121
## 6  6 0.73136244 0.27618608 12.813624
#str(regData)

Basic regression analysis in R

# model
regModel <- lm(varB~varA,data=regData)

# model output
regModel # printed output is sparse
## 
## Call:
## lm(formula = varB ~ varA, data = regData)
## 
## Coefficients:
## (Intercept)         varA  
##     0.45275     -0.03961
#str(regModel) # complicated, but has "coefficients"
head(regModel$residuals) # contains residuals
##          1          2          3          4          5          6 
## -0.3805714  0.3256660  0.2481805  0.2830942 -0.3500956 -0.1475962
# the 'summary' of the model has many more elements
summary(regModel)
## 
## Call:
## lm(formula = varB ~ varA, data = regData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40363 -0.23145 -0.08821  0.23799  0.56524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.45275    0.08207   5.517 1.36e-06 ***
## varA        -0.03961    0.14543  -0.272    0.787    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.294 on 48 degrees of freedom
## Multiple R-squared:  0.001543,   Adjusted R-squared:  -0.01926 
## F-statistic: 0.07417 on 1 and 48 DF,  p-value: 0.7865
summary(regModel)$coefficients
##                Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)  0.45274903 0.08206654  5.5168531 1.359235e-06
## varA        -0.03960656 0.14543097 -0.2723392 7.865293e-01
#str(summary(regModel))

# best to examine the entire matrix of coefficients:
summary(regModel)$coefficients[] # shows all
##                Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)  0.45274903 0.08206654  5.5168531 1.359235e-06
## varA        -0.03960656 0.14543097 -0.2723392 7.865293e-01
# can pull results from this, but a little wordy
summary(regModel)$coefficients[1,4]   #p value for intercept
## [1] 1.359235e-06
summary(regModel)$coefficients["(Intercept)","Pr(>|t|)"] # uggh
## [1] 1.359235e-06
# alternatively, unfurl the summary with unlist(); elements get flattened names like coefficients1..coefficients8
z <- unlist(summary(regModel))
# str(z)
# z
z$coefficients7
## [1] 1.359235e-06
# grab what we need and put it into a tidy list

regSum <- list(intercept=z$coefficients1,
               slope=z$coefficients2,
               interceptP=z$coefficients7,
               slopeP=z$coefficients8,
               r2=z$r.squared)

# much easier to query and use
print(regSum)
## $intercept
## [1] 0.452749
## 
## $slope
## [1] -0.03960656
## 
## $interceptP
## [1] 1.359235e-06
## 
## $slopeP
## [1] 0.7865293
## 
## $r2
## [1] 0.001542796
#regSum$r2
#regSum[[5]]
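The bullet list at the top of this section mentions confidence and prediction intervals; a minimal sketch with base R's predict() (newData and the 0.95 level are illustrative choices):

# sketch: confidence versus prediction intervals from the fitted model
newData <- data.frame(varA=seq(0,1,by=0.25))
predict(regModel,newdata=newData,interval="confidence",level=0.95)
predict(regModel,newdata=newData,interval="prediction",level=0.95)
# the prediction interval is wider: it covers new observations,
# not just the mean response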

Basic ggplot of regression model

regPlot <- ggplot(data=regData,aes(x=varA,y=varB)) +
           geom_point() +
           stat_smooth(method=lm,level=0.99) # 99% confidence band; the default level is 0.95
print(regPlot)

# ggsave(filename="Plot1.pdf",plot=regPlot,device="pdf")

One-way ANOVA in R

Data frame construction

nGroup <- 3 # number of treatment groups
nName <- c("Control","Treat1", "Treat2") # names of groups
nSize <- c(12,17,9) # number of observations in each group
nMean <- c(40,41,60) # mean of each group
nSD <- c(5,5,5) # standard deviation of each group

ID <- seq_len(sum(nSize)) # id vector for each row
resVar <- c(rnorm(n=nSize[1],mean=nMean[1],sd=nSD[1]),
            rnorm(n=nSize[2],mean=nMean[2],sd=nSD[2]),
            rnorm(n=nSize[3],mean=nMean[3],sd=nSD[3]))
TGroup <- rep(nName,nSize)
ANOdata <- data.frame(ID,TGroup,resVar)
#str(ANOdata)

Basic ANOVA in R

ANOmodel <- aov(resVar~TGroup,data=ANOdata)
print(ANOmodel)
## Call:
##    aov(formula = resVar ~ TGroup, data = ANOdata)
## 
## Terms:
##                    TGroup Residuals
## Sum of Squares  2803.8489  626.7327
## Deg. of Freedom         2        35
## 
## Residual standard error: 4.231625
## Estimated effects may be unbalanced
print(summary(ANOmodel))
##             Df Sum Sq Mean Sq F value  Pr(>F)    
## TGroup       2 2803.8  1401.9   78.29 1.2e-13 ***
## Residuals   35  626.7    17.9                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
z <- summary(ANOmodel)
#str(z)
aggregate(resVar~TGroup,data=ANOdata,FUN=mean)
##    TGroup   resVar
## 1 Control 41.60425
## 2  Treat1 40.20936
## 3  Treat2 60.94171
unlist(z)
##          Df1          Df2      Sum Sq1      Sum Sq2     Mean Sq1     Mean Sq2 
## 2.000000e+00 3.500000e+01 2.803849e+03 6.267327e+02 1.401924e+03 1.790665e+01 
##     F value1     F value2      Pr(>F)1      Pr(>F)2 
## 7.829072e+01           NA 1.202280e-13           NA
unlist(z)[7]
## F value1 
## 78.29072
ANOsum <- list(Fval=unlist(z)[7],probF=unlist(z)[9])
ANOsum
## $Fval
## F value1 
## 78.29072 
## 
## $probF
##     Pr(>F)1 
## 1.20228e-13
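Indexing the unlisted summary by position works but is fragile; a sketch of pulling the same numbers by name from the ANOVA table instead:

# sketch: extract F and p by name rather than by unlist() position
ANOtable <- summary(ANOmodel)[[1]]
ANOtable[["F value"]][1] # F statistic for TGroup
ANOtable[["Pr(>F)"]][1]  # p value for TGroup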

Basic ggplot of ANOVA data

ANOPlot <- ggplot(data=ANOdata,aes(x=TGroup,y=resVar,fill=TGroup)) +
           geom_boxplot()
print(ANOPlot)

# ggsave(filename="Plot2.pdf",plot=ANOPlot,device="pdf")

Logistic regression analysis in R

Data frame construction

xVar <- sort(rgamma(n=200,shape=5,scale=5))
# weighted sampling without replacement: the 0s carry the larger weights,
# so they tend to be drawn first and get paired with the smaller xVar values
yVar <- sample(rep(c(1,0),each=100),prob=seq_len(200))
lRegData <- data.frame(xVar,yVar)

Logistic regression analysis in R

lRegModel <- glm(yVar ~ xVar,
                 data=lRegData,
                 family=binomial(link=logit))
summary(lRegModel)
## 
## Call:
## glm(formula = yVar ~ xVar, family = binomial(link = logit), data = lRegData)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.64924    0.47554  -5.571 2.53e-08 ***
## xVar         0.10539    0.01841   5.724 1.04e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 277.26  on 199  degrees of freedom
## Residual deviance: 230.57  on 198  degrees of freedom
## AIC: 234.57
## 
## Number of Fisher Scoring iterations: 4
summary(lRegModel)$coefficients
##               Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -2.6492393 0.47554456 -5.570959 2.533407e-08
## xVar         0.1053913 0.01841079  5.724430 1.037815e-08
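The coefficients are on the logit scale; a short sketch back-transforms them to probabilities (the xVar values 10, 25, and 50 are arbitrary illustrations):

# sketch: convert logit-scale coefficients into predicted probabilities
b <- coef(lRegModel)
plogis(b[1] + b[2]*c(10,25,50)) # P(y = 1) at xVar = 10, 25, 50
# equivalently:
predict(lRegModel,newdata=data.frame(xVar=c(10,25,50)),type="response")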

Basic ggplot of logistic regression

lRegPlot <- ggplot(data=lRegData, aes(x=xVar,y=yVar)) +
            geom_point() +
            stat_smooth(method=glm, method.args=list(family=binomial))
print(lRegPlot)

Contingency table analysis in R

Data

# integer counts of different data groups
vec1 <- c(50,66,22)
vec2 <- c(120,22,30)
dataMatrix <- rbind(vec1,vec2)
rownames(dataMatrix) <- c("Cold","Warm")
colnames(dataMatrix) <-c("Aphaenogaster",
                         "Camponotus",
                         "Crematogaster")
#str(dataMatrix)

Basic contingency table analysis in R

print(chisq.test(dataMatrix))
## 
##  Pearson's Chi-squared test
## 
## data:  dataMatrix
## X-squared = 48.914, df = 2, p-value = 2.391e-11
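The object returned by chisq.test holds more than the test statistic; a sketch of the components that show where the association comes from:

# sketch: expected counts and residuals from the test object
chiRes <- chisq.test(dataMatrix)
chiRes$expected  # counts expected under independence
chiRes$residuals # Pearson residuals: (observed - expected)/sqrt(expected)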

Plotting contingency table analyses

# some simple plots using base R
mosaicplot(x=dataMatrix,
           col=c("goldenrod","grey","black"),
           shade=FALSE)

barplot(height=dataMatrix,
        beside=TRUE,
        col=c("cornflowerblue","tomato"))

dFrame <- as.data.frame(dataMatrix)
dFrame <- cbind(dFrame,list(Treatment=c("Cold","Warm")))
dFrame <- gather(dFrame,key=Species,value=Counts,Aphaenogaster:Crematogaster)

p <- ggplot(data=dFrame,aes(x=Species,y=Counts,fill=Treatment)) +
  geom_bar(stat="identity",position="dodge",color=I("black")) +
  scale_fill_manual(values=c("cornflowerblue","coral"))
print(p)
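gather() still runs but is superseded in current tidyverse releases; a sketch of the same reshape with pivot_longer():

# sketch: modern equivalent of the gather() call above
dFrame2 <- as.data.frame(dataMatrix)
dFrame2 <- cbind(dFrame2,list(Treatment=c("Cold","Warm")))
dFrame2 <- pivot_longer(dFrame2,
                        cols=Aphaenogaster:Crematogaster,
                        names_to="Species",
                        values_to="Counts")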