## Warning: package 'knitr' was built under R version 3.5.3
This is a quick-and-dirty introduction of R computation, which will be a basis of Shijie’s data-limited package. Any feedbacks would be appreciated for improvement.
Before starting the workshop, please visit the following website and download the applications which we will use in the workshop
First, please visit the following web site and download “R 3.6.xx for Windows” for example:
https://cran.r-project.org/
%
Then please visit the web site below and download the free version of “RStudio Desktop”:
https://www.rstudio.com/products/rstudio/download/
%
I will explain how you can create a memo/report of your analysis with R in a easy way with Rmarkdown. I’m writing this handout using it.
Please install the following packages in R by using a command before the seminar.
install.packages("package name")
install.packages("knitr")
install.packages("rmarkdown")
Then you can call those libraries.
library(knitr)
library(rmarkdown)
a <- c(10,20,30,40,50)
a
[1] 10 20 30 40 50
sum(a)
[1] 150
mean(a)
[1] 30
a+50
[1] 60 70 80 90 100
2*a
[1] 20 40 60 80 100
a^2
[1] 100 400 900 1600 2500
log(a)
[1] 2.302585 2.995732 3.401197 3.688879 3.912023
a[3]
[1] 30
a[c(1,2,3)]
[1] 10 20 30
a[-c(2,4)]
[1] 10 30 50
b <- seq(1,9,2)
a+b
[1] 11 23 35 47 59
a*b
[1] 10 60 150 280 450
b/a
[1] 0.1000000 0.1500000 0.1666667 0.1750000 0.1800000
ss <- 0
for(i in 1:10){
ss <- ss + i
}
ss
[1] 55
# Just in case, this is done simply as
a<- seq(1,10)
a
[1] 1 2 3 4 5 6 7 8 9 10
sum(a)
[1] 55
Start with your capital $20 and contiune 100 times betting with $1.
Gambling <- function(p, Bet, Capital, Maxit=101, Nsim=100)
{
X <- array(0, c(Nsim, Maxit))
for(i in 1:Nsim)
{
X[i,1] <- Capital
for(j in 2:Maxit)
{
Tmp <- X[i,(j-1)] + sample(Bet*c(-1,1),1,prob=c(1-p,p))
if(Tmp >0) X[i,j] <- Tmp
if(Tmp<=0) X[i,j] <- -10
}
}
plot(X[1,], type="l", ylim=c(0, max(X)*1.2),
xlab="Trial", ylab="State", col="grey")
for(i in 1:Nsim) points(X[i,], type="l", col="grey")
abline(Capital, 0, lty=2, col="red")
hist(X[,Maxit], col="orange", xlab="Final state", main="")
return(X)
}
Then again let us run function “Gambling”
par(mfrow=c(2,2), mar=c(3,3,3,3))
Res <- Gambling(p=0.5, Bet=1, Capital=20, Maxit=100, Nsim=100)
Res <- Gambling(p=0.5, Bet=1, Capital=20, Maxit=1000, Nsim=100)
We shall start with some regression analyses for estimating biological parameters using some regression methods.
In R, lots of types of regression analyses can be conducted (see table below).
%
The common allometric function is defined as follows: \[ W = a L^b. \]
Our example data set is a combination of length and weight observed for rainbow trouts in Ohizumi experimental station in Japan.
We shall start reading the data and showing a simple scatter plot.
# Reading and plotting the data
Data <- read.csv("Rainbowtrout.csv", header=T)
names(Data)
[1] "Length" "Weight"
head(Data)
Length Weight
1 7.5 10
2 9.2 9
3 9.5 25
4 9.7 35
5 10.5 20
6 10.6 35
Length <- Data$Length
Weight <- Data$Weight
plot(Length, Weight, xlab="Length (cm)", ylab="Weight (g)",
main="Allometric data for rainbow trouts")
Then we conduct a simple regression analysis for the following simple linear model.
\[ \log W_i = \log a + b \log L_i + \varepsilon_i, \quad \varepsilon_i \sim N(0,\sigma^2). \]
# Conducting regression analysis
res <- lm(log(Weight)~log(Length))
summary(res)
Call:
lm(formula = log(Weight) ~ log(Length))
Residuals:
Min 1Q Median 3Q Max
-0.59889 -0.08977 -0.00832 0.14493 0.69146
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.04898 0.28442 -10.72 6.67e-13 ***
log(Length) 2.60235 0.08474 30.71 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.286 on 37 degrees of freedom
Multiple R-squared: 0.9622, Adjusted R-squared: 0.9612
F-statistic: 943.1 on 1 and 37 DF, p-value: < 2.2e-16
plot(log(Length), log(Weight))
abline(res)
We can extract the parameter estimates of \(\alpha=\log a\) and \(\beta=b\).
# List of objects
names(res)
[1] "coefficients" "residuals" "effects" "rank"
[5] "fitted.values" "assign" "qr" "df.residual"
[9] "xlevels" "call" "terms" "model"
names(summary(res))
[1] "call" "terms" "residuals" "coefficients"
[5] "aliased" "sigma" "df" "r.squared"
[9] "adj.r.squared" "fstatistic" "cov.unscaled"
# Extracting parameter estimates (the both are same meanings)
summary(res)$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.048982 0.28442057 -10.71998 6.667534e-13
log(Length) 2.602353 0.08474089 30.70953 6.255463e-28
coef(res)
(Intercept) log(Length)
-3.048982 2.602353
alpha.est <- as.numeric(coef(res)[1])
beta.est <- as.numeric(coef(res)[2])
# Parameter transformation
a.est <- exp(alpha.est)
b.est <- beta.est
Est <- c(a.est, b.est); Est
[1] 0.04740718 2.60235273
We can also extract confidence intervals of parameter estimates.
# Use a function "confint"
confint(res)
2.5 % 97.5 %
(Intercept) -3.625272 -2.472691
log(Length) 2.430651 2.774054
confint(res)[1,]
2.5 % 97.5 %
-3.625272 -2.472691
confint(res)[2,]
2.5 % 97.5 %
2.430651 2.774054
# For the following summary
alpha.ci <- confint(res)[1,]
beta.ci <- confint(res)[2,]
a.ci <- exp(alpha.ci)
b.ci <- beta.ci
CI <- rbind(a.ci, b.ci); CI
2.5 % 97.5 %
a.ci 0.02664184 0.08435757
b.ci 2.43065137 2.77405408
Then a summary is given as follows:
Out <- cbind(Est, CI)
row.names(Out) <- c("a", "b")
Out
Est 2.5 % 97.5 %
a 0.04740718 0.02664184 0.08435757
b 2.60235273 2.43065137 2.77405408
There two types of confidence intervals: - Confidence interval for regression estimates
- Confidence interval for prediction
# Confidence interval of the curve
plot(Length, Weight, xlab="Length (mm)", ylab="Weight (g)", main="With CI for estimates")
pred <- exp(predict(res, int="c", level = 0.95))
matlines(Length, pred, lty=c(1,2,2), col=1)
# Confidence interval of the prediction
plot(Length, Weight, xlab="Length (mm)", ylab="Weight (g)", main="With CI for prediction")
pred <- exp(predict(res, int="p", level = 0.95))
matlines(Length, pred, lty=c(1,2,2), col=1)
As in the case of allometric function, we will conduct a regression analysis. However, we will analyze the data without any transformation using the “nonlinear regression model”.
A common and general formulation is as follows: \[ y_i = f(x_i, \theta) + \varepsilon_i, \quad \varepsilon_i \sim N(0,\sigma^2). \]
Here, we use a data set for alfonsino (in Japanese, “Kinmedai”). Like in the case of rainbow trouts, we shall read the data file.
Data <- read.csv("Alfonsino.csv", header=T)
Age <- Data$Age
Length <- Data$Length
Sex <- as.numeric(Data$Sex) #"1" for female, "2" for male
mark <- c(19,2)
color <- c("red","blue")
plot(Length~Age,pch=mark[Sex],col=color[Sex],xlim=c(0,25),ylim=c(0,50))
legend(18,20,pch=mark,col=color,legend=c("Female","Male"),bty="n")
We will employ the famous von Bertalanffy growth function defined as \[ L_t = L_{\infty} (1-e^{-K (t-t_0)}). \]
# Definition of von Bertalanffy
growth.VB <- function(t,Linf,K,t0) Linf*(1.0-exp(-K*(t-t0)))
We then estimate the parameters \((L_{\infty}, K, t_0, \sigma)\) using “nls” function in R.
We now assume a same growth function between female and male fish. We call the model as “res000” (“0” means common and “1” does different).
# Setting a set of initial values for parameters (except for sigma)
start <- list(Linf=max(Length),K=0.1,t0=0)
# Estimation
res.000 <- nls(Length~growth.VB(t=Age,Linf, K, t0), start=start)
# Showing the estimation summary
summary(res.000)
Formula: Length ~ growth.VB(t = Age, Linf, K, t0)
Parameters:
Estimate Std. Error t value Pr(>|t|)
Linf 47.95842 2.91479 16.453 < 2e-16 ***
K 0.08936 0.02413 3.704 0.00027 ***
t0 -6.38637 2.14489 -2.977 0.00324 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.564 on 215 degrees of freedom
Number of iterations to convergence: 4
Achieved convergence tolerance: 5.63e-06
# Confidence interval for the growth paramters
confint(res.000)
2.5% 97.5%
Linf 44.10249477 58.3766050
K 0.04465578 0.1360594
t0 -12.34296242 -3.3029532
# Visual presentation
taxis <- seq(0,25,0.5); tlen <- length(taxis);
pred.000 <- predict(res.000, list(Age=taxis))
plot(Length~Age,xlim=c(0,25),ylim=c(0,50), main="Results under res.000 with prediction CI")
points(taxis, pred.000, type="l")
points(taxis, pred.000-1.96*sigma(res.000), type="l", lty=2)
points(taxis, pred.000+1.96*sigma(res.000), type="l", lty=2)
# Diagnostics
e <- residuals(res.000)/sigma(res.000)
plot(fitted(res.000), e, xlab="Fitted values", ylab="Standardized residuals")
abline(0,0); abline(-1.96,0,lty=2); abline(1.96,0,lty=2);
# Setting a set of initial values for female and male simultaneously
start <- list(Linf=rep(max(Length),2),K=c(0.1,0.1),t0=c(0,0))
# Estimation
res.111 <- nls(Length~growth.VB(t=Age,Linf[Sex], K[Sex], t0[Sex]), start=start)
# Showing the estimation summary
summary(res.111)
Formula: Length ~ growth.VB(t = Age, Linf[Sex], K[Sex], t0[Sex])
Parameters:
Estimate Std. Error t value Pr(>|t|)
Linf1 48.15050 4.52391 10.644 < 2e-16 ***
Linf2 47.70053 3.75965 12.687 < 2e-16 ***
K1 0.09797 0.04315 2.270 0.02418 *
K2 0.08646 0.02995 2.887 0.00429 **
t01 -5.12851 3.38334 -1.516 0.13106
t02 -6.94091 2.80413 -2.475 0.01410 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.567 on 212 degrees of freedom
Number of iterations to convergence: 4
Achieved convergence tolerance: 3.178e-06
# Confidence interval for the growth paramters
confint(res.111)
2.5% 97.5%
Linf1 43.09967561 81.7105984
Linf2 43.18112305 66.9658334
K1 0.02164258 0.1833475
K2 0.03031102 0.1453427
t01 -18.02360053 -0.9526554
t02 -16.13978541 -3.1310768
# Visual presentation
taxis <- seq(0,25,0.5); tlen <- length(taxis);
pred.female.111 <- predict(res.111, list(Age=taxis, Sex=rep(1,tlen)))
pred.male.111 <- predict(res.111, list(Age=taxis, Sex=rep(2,tlen)) )
plot(Length~Age,pch=mark[Sex],col=color[Sex],xlim=c(0,25),ylim=c(0,50),
main="Results under res.111 with prediction CI")
points(taxis, pred.female.111, type="l", col="red", lty=1)
points(taxis, pred.female.111-1.96*sigma(res.111), type="l", col="red", lty=2)
points(taxis, pred.female.111+1.96*sigma(res.111), type="l", col="red", lty=2)
points(taxis, pred.male.111, type="l", col="blue", lty=1)
points(taxis, pred.male.111-1.96*sigma(res.111), type="l", col="blue", lty=2)
points(taxis, pred.male.111+1.96*sigma(res.111), type="l", col="blue", lty=2)
legend(18,20,pch=mark,col=color,lty=c(1,2),legend=c("Female","Male"),bty="n")
# Diagnostics for female
e <- residuals(res.111)/sigma(res.111)
plot(fitted(res.111)[Sex==1], e[Sex==1], col="red",
xlab="Fitted values", ylab="Standardized residuals")
abline(0,0); abline(-1.96,0,lty=2); abline(1.96,0,lty=2);
# Diagnostics for male
plot(fitted(res.111)[Sex==2], e[Sex==2], col="blue",
xlab="Fitted values", ylab="Standardized residuals")
abline(0,0); abline(-1.96,0,lty=2); abline(1.96,0,lty=2);
We now try to compare which model is appropriate in terms of model fitting. Statistical hypothesis testing can be used for sequential, but the easiest way is to use AIC (Akaike Information Criterion), which is defined as \[ AIC = -2*max(loglikelihood) + 2*(Num \ of \ parameters). \] Smaller AIC values indicate better models. In R, a function “AIC” can be used to get the value.
# AIC values.
AIC(res.000)
[1] 1034.122
AIC(res.111)
[1] 1037.531
# Plot (not very different)
plot(Length~Age,pch=mark[Sex],col=color[Sex],xlim=c(0,25),ylim=c(0,50))
points(taxis, pred.000, type="l", lwd=2)
points(taxis, pred.female.111, type="l", col="red", lty=1)
points(taxis, pred.male.111, type="l", col="blue", lty=2)
Note 1: you can analyze the data with another assumption such as a case that only \(L_{\infty}\) is different between sex (as res.100) by modifying the code above.
Note 2: the code above assume a constant sigma across all the ages. In our example, the age coverage is limited, and the residuals show a similar level of variance over the ages. However, in case the date include broad range of ages, better approach is to assume heteroscedastic variance or taking logarithm of the length data.
%
%
%
\[ B_{t+1} = B_t + rB_t \left\{1-\left(\frac{B_t}{K}\right)^z \right\} - C_t \]
Assume that the CPUE is proportional to the population biomass as follows: \[ CPUE_t = q B_t. \]
Here \(q\) is a proportional coefficient called “catchability coefficient”.
Let us assume the following time series model and try to estimate the parameters in the model. \[ \log CPUE_t = \log q + log B_t + \varepsilon_t, \quad \varepsilon_t \sim N(0, \sigma^2) \] ## Set-up of exercises for production model using ML estimation
# Data for Indian Ocean albacore
Data <- read.csv("IO_Albacore.csv", header=T)
Year <- Data$Year
Catch <- Data$Catch
CPUE <- Data$CPUE
CV <- Data$CV
TT <- length(Year)
start.year <- min(Year)
final.year <- max(Year)
par(mai = c(1, 1.1, 1, 1.1))
plot(Year, Catch, type = "h", lwd=5, col="gray", ylab="Catch (tons)")
par(new =T)
plot(Year, CPUE, type="l", lwd=2, col="red", axes=FALSE, ylab="", ylim=c(0,2))
axis(4); mtext("CPUE", side=4, line=3)
PDM.PT<-function(r, K, z, B1, TT, Catch)
{
TT <- TT
B<-rep(NA,TT)
B[1]<-B1
for(t in 1:(TT-1))
{
tmp<-B[t]+r*B[t]*(1-(B[t]/K)^z)-Catch[t]
B[t+1]<-max(tmp,0.001)
}
B
}
NLL.PT <- function(par,z)
{
r<-exp(par[1])
K<-exp(par[2])
q<-exp(par[3])
sigma <-exp(par[4])
B<-PDM.PT(r, K, z, K, TT, Catch)
yobs <- log(CPUE)
mu <- log(q) + log(B)
pdf <- dnorm(yobs, mu, sigma, log=T)
loglike <- sum(pdf, na.rm=T)
negloglike <- -loglike
return(negloglike)
}
init.MSY <- 2*mean(Catch)
init.r <- 0.4
init.K <- 4*init.MSY/init.r
init<-log(c(init.r, init.K, 10^(-6), 1))
NLL.PT(init, z=1)
[1] 46.552
NLL.PT(2*init, z=1)
[1] 74.13025
res.SC<-optim(init, NLL.PT, z=1, method="BFGS", control=list(reltol=10^(-8)))
res <- res.SC; res
$par
[1] -0.9788076 12.7297969 -12.4127161 -1.4274852
$value
[1] -0.2648306
$counts
function gradient
172 51
$convergence
[1] 0
$message
NULL
r.est<-exp(res$par[1])
K.est<-exp(res$par[2])
q.est<-exp(res$par[3])
sigma.est<-exp(res$par[4])
MSY.est <- r.est*K.est/4
MSYL.est <- K.est/2
data.frame(r.est, K.est, q.est, sigma.est)
r.est K.est q.est sigma.est
1 0.3757589 337660.7 4.066548e-06 0.2399115
Predicted <- PDM.PT(r.est, K.est, z=1, K.est, TT, Catch)
# Fittness
par(mfrow=c(2,2))
plot(log(CPUE), log(q.est*Predicted), xlim=c(-1,1), ylim=c(-1,1)); abline(0,1)
# Time series plots for CPUE observation and its prediction by the model
plot(Year, CPUE, ylim=c(0, max(CPUE, na.rm=T)) )
points(Year, q.est*Predicted, type="l")
# Times series of estimated biomass
plot(Year, Predicted, type="l", ylim=c(0, 1.1*K.est), xaxs="i",yaxs="i")
# Times series of depletion level against the carrying capacity (B/K)
plot(Year, Predicted/K.est, type="l", ylim=c(0,1), xaxs="i",yaxs="i", ylab="Depletion")
rect(xleft=-1, ybottom=0, xright=2010, ytop=0.4, lwd=0, col=rgb(1,0,0,0.3))
rect(xleft=-1, ybottom=0.4, xright=2010, ytop=0.6, lwd=0, col=rgb(1,1,0,0.3))
rect(xleft=-1, ybottom=0.6, xright=2010, ytop=1, lwd=0, col=rgb(0,1,0, 0.3))
# Definition of B-ratio and F-ratio
Bratio <- Predicted/MSYL.est
Fratio <- Catch/(MSY.est) #actually Hratio
plot(Bratio,Fratio,type="n",xlim=c(0,2),ylim=c(0,2),
xlab="B/Bmsy",ylab="F/Fmsy", main="Kobe plot")
rect(xleft=0,ybottom=0,xright=1,ytop=1,lwd=0,col="#FEFF63FF")
rect(xleft=0,ybottom=1,xright=1,ytop=2,lwd=0,col="#FF4949FF")
rect(xleft=1,ybottom=1,xright=2,ytop=2,lwd=0,col="#FFBE7DFF")
rect(xleft=1,ybottom=0,xright=2,ytop=1,lwd=0,col="#85FF68FF")
points(Bratio,Fratio,type="l",lwd=1.8)
points(Bratio[1:(T-1)],Fratio[1:(T-1)],lwd=1,cex=3,pch=21,bg="skyblue")
points(Bratio[T],Fratio[T],lwd=1,cex=3,pch=21,bg="blue")
num <- c(seq((start.year-1900),99,1),seq(0,(final.year-2000),1))
text(Bratio,Fratio,labels=num,adj=0.5,cex=1, col="white")