#install.packages("lme4_1.1-17.zip", repos = NULL, type = "win.binary")
library(lme4)
## Warning: package 'lme4' was built under R version 3.5.3
## Loading required package: Matrix
Why “generalized”
- Distribution assumption (not necessarily “normal distribution”)
- Binomial, Poisson, Gamma distributions etc.
- Regression covariates are linked with a “link function”
GLM in R
- “glm”" (almost same syntax as “lm”)
- Estimate, CI, testing, AIC, …
Assumption
- You are analysing “age at sexual maturity (ASM)” of a fish species
- You observe the numbers of mature and immature individuals by age
- Your primary interest is the estimation of “age at 50% maturity (ASM50)”
Structural model
- Maturity is a funciton of age, \(p(a)\)
- \(p(a)\) should be a monotonically increasing function of \(a\) like a sigmoid function
Logistic function
\(p(a)=\displaystyle \frac{e^{\beta_0 + \beta_1 a}}{1+e^{\beta_0 + \beta_1 a}} \ \Longrightarrow \ \log \frac{p(a)}{1-p(a)} = \beta_0 + \beta_1 a\)
Probit function
\(p(a)=\displaystyle \int_{-\infty}^a f(z) dz\)
Exercise 1.
- Let \(ASM50\) be the age at 50% maturity
- Let \(ASM95\) be the age at 95% maturity
\(p(ASM50) = 0.5\)
\(p(ASM95) = 0.95\)
So, \(ASM50\) and \(ASM95\) are functions of \(a\) and \(b\) as
\(ASM50 = \displaystyle -\frac{\beta_0}{\beta_1}\)
\(ASM95 = \displaystyle ...\)
Therefore, the parameters \(a\) and \(b\) are expressed with \(ASM50\) and \(ASM95\) as
\(a = \displaystyle ...\)
\(b = \displaystyle ...\)
Assume \(ASM50\) and \(ASM95\) are 5 and 7, resepectively.
LogisticFn <- function(x,beta0,beta1) {1.0/(1.0+exp(-beta0-beta1*x))}
Age <- seq(0,10,0.01)
plot(Age, LogisticFn(Age,-5,1), type="l", ylab="Maturity")
points(Age, LogisticFn(Age,-2.5,0.5), type="l")
points(Age, LogisticFn(Age,-10,2), type="l")
Distributional assumption
- Two state (mature/immmature) -> Binomial distribution
- \(N_a\): number of sampled individuals of age \(a\)
- \(Y_a\): number of mature individuals among \(N_a\) individuals
- \(p(a)\): maturity (probability) in age \(a\)
- \(Y_a \sim Bin(N_a, p(a))\)
Generation of example data - Assume that \(ASM50\) and \(ASM95\) are 5 and 7, resepectively - Assume that Sample size for each age is 10
Exercise 2.
HOW TO DRAW YOUR DATA ??? Do by yourself!
Amin <- 1
Amax <- 10
AgeRange <- Age <- Amin:Amax
NAgeClass<- length(AgeRange)
Nvec <- rep(10, NAgeClass)
pvec <- LogisticFn(AgeRange, -5, 1)
pvec
## [1] 0.01798621 0.04742587 0.11920292 0.26894142 0.50000000 0.73105858
## [7] 0.88079708 0.95257413 0.98201379 0.99330715
Yvec <- rbinom(NAgeClass, Nvec, pvec)
Yvec
## [1] 1 0 0 0 5 8 10 9 9 10
Maturity.obs <- Yvec/Nvec
Maturity.obs
## [1] 0.1 0.0 0.0 0.0 0.5 0.8 1.0 0.9 0.9 1.0
plot(Age, LogisticFn(Age,-5,1), type="l", ylab="Maturity")
points(Age, Maturity.obs)
Res.glm.logit <- glm(cbind(Yvec,Nvec-Yvec)~Age, family=binomial)
Res.glm.logit
##
## Call: glm(formula = cbind(Yvec, Nvec - Yvec) ~ Age, family = binomial)
##
## Coefficients:
## (Intercept) Age
## -5.776 1.090
##
## Degrees of Freedom: 9 Total (i.e. Null); 8 Residual
## Null Deviance: 95.09
## Residual Deviance: 15.83 AIC: 30.72
names(Res.glm.logit)
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "call" "formula" "terms"
## [25] "data" "offset" "control"
## [28] "method" "contrasts" "xlevels"
Res.glm.logit$coef
## (Intercept) Age
## -5.776169 1.090184
beta0.est <- Res.glm.logit$coef[1]
beta1.est <- Res.glm.logit$coef[2]
plot(Age, LogisticFn(Age,-5,1), type="l", ylab="Maturity")
points(Age, Maturity.obs)
points(Age, LogisticFn(Age,beta0.est,beta1.est), type="l", col="red")
pred.se<-predict (Res.glm.logit, se.fit=TRUE) #Linear predictor
#Linear predictor's CI
Lower <- pred.se$fit - qnorm(0.975, 0, 1)* pred.se$se.fit
Upper <- pred.se$fit + qnorm(0.975, 0, 1)* pred.se$se.fit
#Logistic transformation of CI
logit.t<-function(x){ 1/(1+exp(-x)) }
pred.logit<- sapply(pred.se$fit, logit.t)
Lower.logit <- sapply(Lower, logit.t)
Upper.logit<- sapply(Upper, logit.t)
pcon.logit<-cbind(pred.logit, Lower.logit, Upper.logit)
plot(AgeRange, Maturity.obs, xlim=c(0,10), ylim=c(0,1))
matlines(AgeRange, pcon.logit, lty=c(1,2,2), col="blue", lwd=1.5)
Method 1. Delta method
Method 2. Profile likelihood
#Maturity function with respect to ASM50 and ASM95
MaturityFn <- function(a,ASM50,ASM95){
beta0 <- -log(19)*ASM50/(ASM95-ASM50)
beta1 <- log(19)/(ASM95-ASM50)
LogisticFn(a,beta0,beta1)
}
ProfileASM50 <- function(ASM50)
{
NLL <- function(ASM95)
{
pp <- MaturityFn(AgeRange,ASM50,ASM95)
obj <- (-1.0)*sum(dbinom(Yvec,Nvec,pp,log=T))
obj
}
tmp <- optim(11,NLL,method="BFGS")
as.numeric((-1.0)*tmp$value)
}
Age.axis <- seq(0,10,0.1)
Res.prof <- unlist(lapply(Age.axis, ProfileASM50))
Likeprof <- -2*(Res.prof - max(Res.prof))
plot(Age.axis, Likeprof, type="l")
Age.axis <- seq(4,6,0.01)
Res.prof <- unlist(lapply(Age.axis, ProfileASM50))
Likeprof <- -2*(Res.prof - max(Res.prof))
plot(Age.axis, Likeprof, type="l")
abline(qchisq(0.95,1),0, lty=2, col="gray")
Assumption
- There is a traditional knowledge about ASM50 (e.g. H0: ASM50=6)
- You have to check if this knowldege is still valid or not
- So, you are now testing H0 against H1: ASM50 \(\ne\) 6
Likelihood ratio test
- Generic method for statistical test
- Applicable in theory if likelihood functions for H0 and H1 are eplicitly written
In general, if H0 is relevant to 1-dimentional quantity (say H0: \(\theta=\theta_0\)), \[ T(\theta_0) = -2 \log \frac{L(\theta_0)}{L(\hat{\theta})} \longrightarrow \chi^2 (1) \\ T \leq \chi^2(1, 1-\alpha) \Rightarrow \mbox{H0 is accepted} \\ \]
ProfileASM50(6)
## [1] -15.89234
max(Res.prof)
## [1] -13.35835
Tstat <- -2*(ProfileASM50(6)-max(Res.prof))
Tstat
## [1] 5.067978
AIC
- A sort of estimate of distance from the true model to estimated model
- c-AIC
- …
- AIC = -2maxloglikelihood + 2(#paramters)
Res.glm.logit <- glm(cbind(Yvec,Nvec-Yvec)~Age, family=binomial)
Res.glm.logit$aic
## [1] 30.71667
Res.glm.probit <- glm(cbind(Yvec,Nvec-Yvec)~Age, family=binomial(link=probit))
Res.glm.probit$aic
## [1] 33.21207
Assumption
- You are analysing “age at sexual maturity (ASM)” of a fish species
- “Every year”“, you observe the numbers of mature and immature individuals by age
- Your primary interest is if age at 50% maturity (ASM50) changes over years
Structural model
- Maturity is a funciton of age and y, \(p(y,a)\)
- \(p(y,a)\) is a monotonically increasing function of \(a\) given year \(y\)
Logistic function
\(p(y,a)= \displaystyle \frac{e^{\beta_{y,0} + \beta_1 a}}{1+e^{\beta_{y,0} + \beta_1 a}} \ \Longrightarrow \ \log \frac{p(y,a)}{1-p(y,a)} = \beta_{y,0} + \beta_1 a\)
Assumption
- Intercept \(\beta_0\) randomly changes across years
- This implies that ASM50 also randomly changes accordingly
\[ \frac{p(y,a)}{1-p(y,a)} = \beta_{y,0} + \beta_1 \\ \beta_{y,0} \sim N(\mu, \sigma^2) \]
\(ASM50_y = \displaystyle -\frac{\beta_{y,0}}{\beta_1}\)
Nyear <- 15
Nmat <- array(10, c(Nyear,NAgeClass))
pmat <- array(NA, c(Nyear,NAgeClass))
Ymat <- array(NA, c(Nyear,NAgeClass))
mu <- (-1.0)*5
sigma <- 0.5
beta0.vec <- rnorm(Nyear, mu, sigma)
ASM.true <- -beta0.vec/1
ASM.true
## [1] 5.008443 4.538578 5.179211 5.254151 4.400310 4.940621 5.088998
## [8] 5.425360 4.130369 5.115666 4.730154 4.882767 4.911967 4.366774
## [15] 5.246979
for(y in 1:Nyear){
pmat[y,]<-LogisticFn(AgeRange, beta0.vec[y], 1)
Ymat[y,]<-rbinom(NAgeClass, Nmat[y,], pmat[y,])
}
Data <- data.frame(
Mature=as.vector(Ymat),
Immature=as.vector(Nmat-Ymat),
Age=rep(AgeRange,Nyear),
Year=sort(rep(1:15,NAgeClass))
)
head(Data)
## Mature Immature Age Year
## 1 0 10 1 1
## 2 1 9 2 1
## 3 0 10 3 1
## 4 0 10 4 1
## 5 2 8 5 1
## 6 1 9 6 1
tail(Data)
## Mature Immature Age Year
## 145 10 0 5 15
## 146 10 0 6 15
## 147 10 0 7 15
## 148 10 0 8 15
## 149 10 0 9 15
## 150 10 0 10 15
plot(AgeRange, pmat[1,], xlab="Age", ylab="Maturity", type="l")
for(y in 1:Nyear) points(AgeRange, pmat[y,], type="l", col=y)
abline(0.5, 0, lty=2)
#library(lme4)
Res.glmer1.logit <-
glmer(cbind(Mature, Immature) ~ Age + (1|Year), family=binomial, data=Data)
summary(Res.glmer1.logit)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(Mature, Immature) ~ Age + (1 | Year)
## Data: Data
##
## AIC BIC logLik deviance df.resid
## 415.2 424.2 -204.6 409.2 147
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7333 -0.7062 0.2399 0.6205 2.4881
##
## Random effects:
## Groups Name Variance Std.Dev.
## Year (Intercept) 8.869 2.978
## Number of obs: 150, groups: Year, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.41835 0.79741 0.525 0.59984
## Age 0.07328 0.02822 2.597 0.00941 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Age -0.191
Res.glmer2.logit <-
glmer(cbind(Mature, Immature) ~ 1 + (Age|Year), family=binomial, data=Data)
summary(Res.glmer2.logit)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(Mature, Immature) ~ 1 + (Age | Year)
## Data: Data
##
## AIC BIC logLik deviance df.resid
## 421.9 433.9 -206.9 413.9 146
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0307 -0.6605 0.2186 0.6227 2.3958
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Year (Intercept) 10.750461 3.27879
## Age 0.004871 0.06979 -0.68
## Number of obs: 150, groups: Year, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2530 0.8818 1.421 0.155
Res.glmer3.logit <-
glmer(cbind(Mature, Immature) ~ (1+Age|Year), family=binomial, data=Data)
summary(Res.glmer3.logit)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(Mature, Immature) ~ (1 + Age | Year)
## Data: Data
##
## AIC BIC logLik deviance df.resid
## 421.9 433.9 -206.9 413.9 146
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0307 -0.6605 0.2186 0.6227 2.3958
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Year (Intercept) 10.750461 3.27879
## Age 0.004871 0.06979 -0.68
## Number of obs: 150, groups: Year, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2530 0.8818 1.421 0.155