#install.packages("lme4_1.1-17.zip", repos = NULL, type = "win.binary")
library(lme4)
## Warning: package 'lme4' was built under R version 3.5.3
## Loading required package: Matrix

A. Simple GLM Binomial

A1. Generalized linear model (GLM)

Why “generalized”
- Distribution assumption (not necessarily “normal distribution”)
- Binomial, Poisson, Gamma distributions etc.
- The mean of the response is related to the regression covariates (linear predictor) through a “link function”

GLM in R
- “glm” (almost the same syntax as “lm”)
- Estimate, CI, testing, AIC, …
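
A minimal sketch of the syntax (a toy simulated data set, just for illustration; the maturity analysis starts below):

# Toy example: binary response y and covariate x (illustrative names only)
set.seed(1)
x <- rnorm(30)
y <- rbinom(30, 1, plogis(0.5 + x))
res <- glm(y ~ x, family=binomial)   # same formula syntax as lm(), plus a "family" argument
summary(res)                         # estimates, standard errors, Wald tests
AIC(res)                             # model-selection criterion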

Maturity analysis

Assumption
- You are analysing “age at sexual maturity (ASM)” of a fish species
- You observe the numbers of mature and immature individuals by age
- Your primary interest is the estimation of “age at 50% maturity (ASM50)”

Structural model
- Maturity is a function of age, \(p(a)\)
- \(p(a)\) should be a monotonically increasing function of \(a\) like a sigmoid function

  1. Logistic function
    \(p(a)=\displaystyle \frac{e^{\beta_0 + \beta_1 a}}{1+e^{\beta_0 + \beta_1 a}} \ \Longrightarrow \ \log \frac{p(a)}{1-p(a)} = \beta_0 + \beta_1 a\)

  2. Probit function
    \(p(a)=\displaystyle \int_{-\infty}^{\beta_0 + \beta_1 a} \phi(z)\, dz = \Phi(\beta_0 + \beta_1 a)\), where \(\phi\) is the standard normal density
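
A quick sketch contrasting the two inverse-link curves on the linear-predictor scale (plogis and pnorm are the built-in logistic and standard normal CDFs):

eta <- seq(-4, 4, 0.01)                                               # linear predictor beta0 + beta1*a
plot(eta, plogis(eta), type="l", xlab="linear predictor", ylab="p")   # logit
points(eta, pnorm(eta), type="l", lty=2)                              # probit
legend("topleft", c("logit", "probit"), lty=1:2)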

More about logistic function

Exercise 1.
- Let \(ASM50\) be the age at 50% maturity
- Let \(ASM95\) be the age at 95% maturity

\(p(ASM50) = 0.5\)
\(p(ASM95) = 0.95\)

So, \(ASM50\) and \(ASM95\) are functions of \(\beta_0\) and \(\beta_1\) as

\(ASM50 = \displaystyle -\frac{\beta_0}{\beta_1}\)
\(ASM95 = \displaystyle ...\)

Therefore, the parameters \(\beta_0\) and \(\beta_1\) are expressed with \(ASM50\) and \(ASM95\) as

\(\beta_0 = \displaystyle ...\)
\(\beta_1 = \displaystyle ...\)

Drawing logistic function

Assume \(ASM50\) and \(ASM95\) are 5 and 7, respectively.

LogisticFn <- function(x,beta0,beta1) {1.0/(1.0+exp(-beta0-beta1*x))}
Age <- seq(0,10,0.01)
plot(Age, LogisticFn(Age,-5,1), type="l", ylab="Maturity")
points(Age, LogisticFn(Age,-2.5,0.5), type="l")
points(Age, LogisticFn(Age,-10,2), type="l")

A2. Statistical model and generation of example data

Distributional assumption
- Two states (mature/immature) -> Binomial distribution
- \(N_a\): number of sampled individuals of age \(a\)
- \(Y_a\): number of mature individuals among \(N_a\) individuals
- \(p(a)\): maturity (probability) at age \(a\)
- \(Y_a \sim Bin(N_a, p(a))\)

Generation of example data
- Assume that \(ASM50\) and \(ASM95\) are 5 and 7, respectively
- Assume that the sample size for each age is 10

Exercise 2.
How can you generate (draw) your data? Do it by yourself!

Amin <- 1
Amax <- 10
AgeRange <- Age <- Amin:Amax
NAgeClass<- length(AgeRange)
Nvec <- rep(10, NAgeClass)
pvec <- LogisticFn(AgeRange, -5, 1)
pvec
##  [1] 0.01798621 0.04742587 0.11920292 0.26894142 0.50000000 0.73105858
##  [7] 0.88079708 0.95257413 0.98201379 0.99330715
Yvec <- rbinom(NAgeClass, Nvec, pvec)
Yvec
##  [1]  1  0  0  0  5  8 10  9  9 10
Maturity.obs <- Yvec/Nvec
Maturity.obs
##  [1] 0.1 0.0 0.0 0.0 0.5 0.8 1.0 0.9 0.9 1.0
plot(Age, LogisticFn(Age,-5,1), type="l", ylab="Maturity")
points(Age, Maturity.obs)

Statistical estimation using “glm” function

Res.glm.logit <- glm(cbind(Yvec,Nvec-Yvec)~Age, family=binomial)
Res.glm.logit
## 
## Call:  glm(formula = cbind(Yvec, Nvec - Yvec) ~ Age, family = binomial)
## 
## Coefficients:
## (Intercept)          Age  
##      -5.776        1.090  
## 
## Degrees of Freedom: 9 Total (i.e. Null);  8 Residual
## Null Deviance:       95.09 
## Residual Deviance: 15.83     AIC: 30.72
names(Res.glm.logit)
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"
Res.glm.logit$coef
## (Intercept)         Age 
##   -5.776169    1.090184
beta0.est <- Res.glm.logit$coef[1]
beta1.est <- Res.glm.logit$coef[2]

plot(Age, LogisticFn(Age,-5,1), type="l", ylab="Maturity")
points(Age, Maturity.obs)
points(Age, LogisticFn(Age,beta0.est,beta1.est), type="l", col="red")

pred.se <- predict(Res.glm.logit, se.fit=TRUE)  # Linear predictor and its SE
#Linear predictor's CI
Lower <- pred.se$fit - qnorm(0.975, 0, 1)* pred.se$se.fit
Upper <- pred.se$fit + qnorm(0.975, 0, 1)* pred.se$se.fit
#Logistic transformation of CI
logit.t<-function(x){ 1/(1+exp(-x)) }
pred.logit<- sapply(pred.se$fit, logit.t)
Lower.logit <- sapply(Lower, logit.t)
Upper.logit<- sapply(Upper, logit.t)
pcon.logit<-cbind(pred.logit, Lower.logit, Upper.logit)
plot(AgeRange, Maturity.obs, xlim=c(0,10), ylim=c(0,1))
matlines(AgeRange, pcon.logit, lty=c(1,2,2), col="blue", lwd=1.5)
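
For reference, point predictions on the maturity (response) scale can also be obtained directly with type="response"; the confidence band itself is still best built on the link scale and back-transformed, as above.

pred.resp <- predict(Res.glm.logit, type="response")   # predictions on the response scale
all.equal(as.numeric(pred.resp), as.numeric(pred.logit))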

A3. Confidence interval for your parameter of interest (ASM)

Method 1. Delta method
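
A minimal sketch of the delta method for \(\widehat{ASM50} = -\hat{\beta}_0/\hat{\beta}_1\), using the glm fit Res.glm.logit from above: the variance is approximated from the gradient of \(-\beta_0/\beta_1\) and the estimated covariance matrix of the coefficients.

b <- unname(coef(Res.glm.logit))                  # (beta0, beta1)
V <- vcov(Res.glm.logit)                          # estimated covariance matrix
ASM50.hat <- -b[1]/b[2]
grad <- c(-1/b[2], b[1]/b[2]^2)                   # gradient of -beta0/beta1 w.r.t. (beta0, beta1)
ASM50.se <- sqrt(as.numeric(t(grad) %*% V %*% grad))
ASM50.hat + c(-1, 1)*qnorm(0.975)*ASM50.se        # approximate 95% CI for ASM50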

Method 2. Profile likelihood

#Maturity function with respect to ASM50 and ASM95
MaturityFn <- function(a,ASM50,ASM95){
  beta0 <- -log(19)*ASM50/(ASM95-ASM50)
  beta1 <- log(19)/(ASM95-ASM50)
  LogisticFn(a,beta0,beta1)
}

ProfileASM50 <- function(ASM50)
{
 NLL <- function(ASM95)
 {
  pp <- MaturityFn(AgeRange,ASM50,ASM95) 
  obj <- (-1.0)*sum(dbinom(Yvec,Nvec,pp,log=T))
  obj
 }
 tmp <- optim(11,NLL,method="BFGS")
 as.numeric((-1.0)*tmp$value)
}

Age.axis <- seq(0,10,0.1)
Res.prof <- unlist(lapply(Age.axis, ProfileASM50))
Likeprof <- -2*(Res.prof - max(Res.prof)) 
plot(Age.axis, Likeprof, type="l")

Age.axis <- seq(4,6,0.01)
Res.prof <- unlist(lapply(Age.axis, ProfileASM50))
Likeprof <- -2*(Res.prof - max(Res.prof)) 
plot(Age.axis, Likeprof, type="l")
abline(qchisq(0.95,1),0, lty=2, col="gray")
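
The approximate 95% confidence interval for ASM50 can then be read off as the range of ages whose profile statistic stays below the chi-square threshold:

inside <- Age.axis[Likeprof <= qchisq(0.95, 1)]   # ages not rejected by the profile test
range(inside)                                     # approximate 95% profile-likelihood CI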

A4. Statistical test

Assumption
- There is traditional knowledge about ASM50 (e.g. H0: ASM50=6)
- You have to check whether this knowledge is still valid or not
- So, you are now testing H0 against H1: ASM50 \(\ne\) 6

Likelihood ratio test
- Generic method for statistical test
- Applicable in theory if the likelihood functions for H0 and H1 are explicitly written

In general, if H0 is relevant to a one-dimensional quantity (say H0: \(\theta=\theta_0\)), \[ T(\theta_0) = -2 \log \frac{L(\theta_0)}{L(\hat{\theta})} \longrightarrow \chi^2 (1) \\ T \leq \chi^2(1, 1-\alpha) \Rightarrow \mbox{H0 is accepted} \\ \]

Example

ProfileASM50(6)
## [1] -15.89234
max(Res.prof)
## [1] -13.35835
Tstat <- -2*(ProfileASM50(6)-max(Res.prof))
Tstat 
## [1] 5.067978
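
Comparing the statistic with the chi-square critical value completes the test; here \(T \approx 5.07 > \chi^2(1, 0.95) \approx 3.84\), so H0: ASM50 = 6 is rejected at the 5% level.

Tstat > qchisq(0.95, 1)   # TRUE -> reject H0 at the 5% level
1 - pchisq(Tstat, 1)      # approximate p-value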

A5. Model selection

AIC
- A sort of estimate of the distance from the true model to the estimated model
- c-AIC
- …
- AIC = -2 (maximum log-likelihood) + 2 (number of parameters)

Res.glm.logit <- glm(cbind(Yvec,Nvec-Yvec)~Age, family=binomial)
Res.glm.logit$aic
## [1] 30.71667
Res.glm.probit <- glm(cbind(Yvec,Nvec-Yvec)~Age, family=binomial(link=probit))
Res.glm.probit$aic
## [1] 33.21207
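
The two fits can also be compared side by side with AIC(); the logit link gives the smaller value here and is therefore preferred.

AIC(Res.glm.logit, Res.glm.probit)   # side-by-side AIC comparison of the two link functions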

B. GLMM

Maturity analysis

Assumption
- You are analysing “age at sexual maturity (ASM)” of a fish species
- Every year, you observe the numbers of mature and immature individuals by age
- Your primary interest is whether age at 50% maturity (ASM50) changes over years

Structural model
- Maturity is a function of age \(a\) and year \(y\), \(p(y,a)\)
- \(p(y,a)\) is a monotonically increasing function of \(a\) given year \(y\)

Logistic function
\(p(y,a)= \displaystyle \frac{e^{\beta_{y,0} + \beta_1 a}}{1+e^{\beta_{y,0} + \beta_1 a}} \ \Longrightarrow \ \log \frac{p(y,a)}{1-p(y,a)} = \beta_{y,0} + \beta_1 a\)

Random intercept: Data generation with random year effect

Assumption
- Intercept \(\beta_0\) randomly changes across years
- This implies that ASM50 also randomly changes accordingly

\[ \log \frac{p(y,a)}{1-p(y,a)} = \beta_{y,0} + \beta_1 a \\ \beta_{y,0} \sim N(\mu, \sigma^2) \]

\(ASM50_y = \displaystyle -\frac{\beta_{y,0}}{\beta_1}\)

Nyear <- 15

Nmat <- array(10, c(Nyear,NAgeClass))
pmat <- array(NA, c(Nyear,NAgeClass))
Ymat <- array(NA, c(Nyear,NAgeClass))

mu <- (-1.0)*5
sigma <- 0.5

beta0.vec <- rnorm(Nyear, mu, sigma)
ASM.true <- -beta0.vec/1
ASM.true
##  [1] 5.008443 4.538578 5.179211 5.254151 4.400310 4.940621 5.088998
##  [8] 5.425360 4.130369 5.115666 4.730154 4.882767 4.911967 4.366774
## [15] 5.246979
for(y in 1:Nyear){
  pmat[y,]<-LogisticFn(AgeRange, beta0.vec[y], 1)
  Ymat[y,]<-rbinom(NAgeClass, Nmat[y,], pmat[y,])
}

Data <- data.frame(
  Mature=as.vector(Ymat), 
  Immature=as.vector(Nmat-Ymat), 
  Age=rep(AgeRange,Nyear), 
  Year=sort(rep(1:15,NAgeClass))
)
head(Data)
##   Mature Immature Age Year
## 1      0       10   1    1
## 2      1        9   2    1
## 3      0       10   3    1
## 4      0       10   4    1
## 5      2        8   5    1
## 6      1        9   6    1
tail(Data)
##     Mature Immature Age Year
## 145     10        0   5   15
## 146     10        0   6   15
## 147     10        0   7   15
## 148     10        0   8   15
## 149     10        0   9   15
## 150     10        0  10   15

plot(AgeRange, pmat[1,], xlab="Age", ylab="Maturity", type="l")
for(y in 1:Nyear) points(AgeRange, pmat[y,], type="l", col=y)
abline(0.5, 0, lty=2)

#library(lme4)
Res.glmer1.logit <- 
glmer(cbind(Mature, Immature) ~ Age + (1|Year), family=binomial, data=Data)
summary(Res.glmer1.logit)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(Mature, Immature) ~ Age + (1 | Year)
##    Data: Data
## 
##      AIC      BIC   logLik deviance df.resid 
##    415.2    424.2   -204.6    409.2      147 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7333 -0.7062  0.2399  0.6205  2.4881 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  Year   (Intercept) 8.869    2.978   
## Number of obs: 150, groups:  Year, 15
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.41835    0.79741   0.525  0.59984   
## Age          0.07328    0.02822   2.597  0.00941 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##     (Intr)
## Age -0.191
Res.glmer2.logit <- 
glmer(cbind(Mature, Immature) ~ 1 + (Age|Year), family=binomial, data=Data)
summary(Res.glmer2.logit)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(Mature, Immature) ~ 1 + (Age | Year)
##    Data: Data
## 
##      AIC      BIC   logLik deviance df.resid 
##    421.9    433.9   -206.9    413.9      146 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0307 -0.6605  0.2186  0.6227  2.3958 
## 
## Random effects:
##  Groups Name        Variance  Std.Dev. Corr 
##  Year   (Intercept) 10.750461 3.27879       
##         Age          0.004871 0.06979  -0.68
## Number of obs: 150, groups:  Year, 15
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   1.2530     0.8818   1.421    0.155
Res.glmer3.logit <- 
glmer(cbind(Mature, Immature) ~  (1+Age|Year), family=binomial, data=Data)
summary(Res.glmer3.logit)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(Mature, Immature) ~ (1 + Age | Year)
##    Data: Data
## 
##      AIC      BIC   logLik deviance df.resid 
##    421.9    433.9   -206.9    413.9      146 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0307 -0.6605  0.2186  0.6227  2.3958 
## 
## Random effects:
##  Groups Name        Variance  Std.Dev. Corr 
##  Year   (Intercept) 10.750461 3.27879       
##         Age          0.004871 0.06979  -0.68
## Number of obs: 150, groups:  Year, 15
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   1.2530     0.8818   1.421    0.155
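
As a sketch of how the question of interest (does ASM50 change over years?) can be addressed from the random-intercept fit Res.glmer1.logit: the fixed intercept plus each year's random effect gives \(\hat{\beta}_{y,0}\), so \(\widehat{ASM50}_y = -\hat{\beta}_{y,0}/\hat{\beta}_1\). The candidate models can also be compared by AIC.

beta0.y <- fixef(Res.glmer1.logit)["(Intercept)"] + ranef(Res.glmer1.logit)$Year[,"(Intercept)"]
beta1.hat <- fixef(Res.glmer1.logit)["Age"]
ASM50.est <- -beta0.y/beta1.hat                   # year-specific ASM50 estimates
cbind(ASM.true, ASM50.est)                        # compare with the simulated true values
AIC(Res.glmer1.logit, Res.glmer2.logit, Res.glmer3.logit)   # model comparison by AIC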